mirror of
https://github.com/GOSTSec/ccminer
synced 2025-03-13 05:51:19 +00:00
bump to revision V1.1 with Killer Groestl
This commit is contained in:
parent
ac40fac038
commit
3b21069504
@ -567,8 +567,6 @@ __host__ void jackpot_keccak512_cpu_hash(int thr_id, int threads, uint32_t start
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 0;
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
|
||||
jackpot_keccak512_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash);
|
||||
MyStreamSynchronize(NULL, order, thr_id);
|
||||
}
|
||||
|
@ -101,14 +101,12 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
|
||||
{
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
// TODO: entfernen für eine Release! Ist nur zum Testen!
|
||||
if (opt_benchmark)
|
||||
((uint32_t*)ptarget)[7] = 0x0000ff;
|
||||
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
|
||||
const int throughput = 256*4096*4; // 100;
|
||||
//const int throughput = 256*256*2+100; // 100;
|
||||
|
||||
static bool init[8] = {0,0,0,0,0,0,0,0};
|
||||
if (!init[thr_id])
|
||||
@ -167,16 +165,18 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
|
||||
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
|
||||
}
|
||||
|
||||
// Runde 2 (ohne Gröstl)
|
||||
// Runde 3 (komplett)
|
||||
|
||||
// jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01)
|
||||
jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
|
||||
d_branch1Nonces[thr_id], &nrm1,
|
||||
d_branch3Nonces[thr_id], &nrm3,
|
||||
d_branch2Nonces[thr_id], &nrm2,
|
||||
order++);
|
||||
|
||||
// verfolge den skein-pfad weiter
|
||||
quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
|
||||
if (nrm1+nrm2 == nrm3) {
|
||||
quark_groestl512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
|
||||
quark_skein512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
|
||||
}
|
||||
|
||||
// jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01)
|
||||
jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
|
||||
@ -226,7 +226,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
|
||||
if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {
|
||||
|
||||
pdata[19] = foundNonce;
|
||||
*hashes_done = (foundNonce - first_nonce + 1)/4;
|
||||
*hashes_done = (foundNonce - first_nonce + 1)/2;
|
||||
//applog(LOG_INFO, "GPU #%d: result for nonce $%08X does validate on CPU (%d rounds)!", thr_id, foundNonce, rounds);
|
||||
return 1;
|
||||
} else {
|
||||
@ -238,6 +238,6 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
|
||||
|
||||
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = (pdata[19] - first_nonce + 1)/4;
|
||||
*hashes_done = (pdata[19] - first_nonce + 1)/2;
|
||||
return 0;
|
||||
}
|
||||
|
13
README.txt
13
README.txt
@ -1,5 +1,5 @@
|
||||
|
||||
ccMiner release 1.0 (May 10th 2014) - "Did anyone say X11?"
|
||||
ccMiner release 1.1 (June 14th 2014) - "Killer Groestl!"
|
||||
-------------------------------------------------------------
|
||||
|
||||
***************************************************************
|
||||
@ -30,13 +30,12 @@ FugueCoin
|
||||
GroestlCoin & Myriad-Groestl
|
||||
JackpotCoin
|
||||
QuarkCoin family & AnimeCoin
|
||||
TalkCoin
|
||||
DarkCoin and other X11 coins
|
||||
|
||||
where some of these coins have a VERY NOTABLE nVidia advantage
|
||||
over competing AMD (OpenCL) implementations.
|
||||
|
||||
X11 algo is being worked on. It will be released when we
|
||||
have achieved a nice nVidia advantage.
|
||||
|
||||
We did not put much effort into improving usability, so please set
|
||||
your parameters carefully.
|
||||
|
||||
@ -140,6 +139,12 @@ features.
|
||||
|
||||
>>> RELEASE HISTORY <<<
|
||||
|
||||
June 14th 2014 released Killer Groestl quad version which I deem
|
||||
sufficiently hard to port over to AMD. It isn't
|
||||
the fastest option for Compute 3.5 and 5.0 cards,
|
||||
but it is still much faster than the table based
|
||||
versions.
|
||||
|
||||
May 10th 2014 added X11, but without the bells & whistles
|
||||
(no killer Groestl, SIMD hash quite slow still)
|
||||
|
||||
|
418
bitslice_transformations_quad.cu
Normal file
418
bitslice_transformations_quad.cu
Normal file
@ -0,0 +1,418 @@
|
||||
|
||||
// Rearranges the eight 32-bit words held by each thread into eight
// bit-sliced words, exchanging data between the threads of a 4-lane group.
//
// input  : 8 words per thread. Overwritten in place by the shuffle stage.
// output : 8 words per thread. Cleared and then fully written.
//
// Stage 1 (per word w, all lanes take part in every shuffle):
//   - each lane fetches input[w] from a permuted lane of its 4-lane group
//     (lanes 1 and 2 trade places, lanes 0 and 3 read themselves),
//   - other[w] receives the value now held by the next lane (cyclic in the group),
//   - both values are then replaced by the copy held in group lane 0
//     (for lanes 0/1) or group lane 2 (for lanes 2/3),
//   - threads with an odd threadIdx.x swap the 16-bit halves of both words.
//
// Stage 2 builds output[k] (k = 0..7) from the exchanged words (w = 0..7):
//   bit 2*w      <- bit k     of input[w]
//   bit 2*w + 1  <- bit k     of other[w]
//   bit 2*w + 16 <- bit k + 8 of input[w]
//   bit 2*w + 17 <- bit k + 8 of other[w]
//
// NOTE(review): lanes are derived from threadIdx.x alone, so this presumably
// expects a 1-D block in which four consecutive threads cooperate - confirm
// against the calling kernel.
// NOTE(review): uses the legacy mask-less __shfl intrinsic; confirm the
// targeted toolkit/architectures still provide it.
__device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *output)
{
    const int quadLane = threadIdx.x % 4;
    // Source lane of the first exchange: 0->0, 1->2, 2->1, 3->3.
    const int swappedLane = quadLane ^ (3*(quadLane >=1 && quadLane <=2));
    uint32_t other[8];

    #pragma unroll 8
    for (int w = 0; w < 8; w++) {
        // The order of these four shuffles matters: each one consumes the
        // result of the previous one.
        input[w] = __shfl((int)input[w], swappedLane, 4);
        other[w] = __shfl((int)input[w], (threadIdx.x + 1) % 4, 4);
        input[w] = __shfl((int)input[w], threadIdx.x & 2, 4);
        other[w] = __shfl((int)other[w], threadIdx.x & 2, 4);
        if (threadIdx.x & 1) {
            // selector 0x1032 swaps the low and high 16-bit halves
            input[w] = __byte_perm(input[w], 0, 0x1032);
            other[w] = __byte_perm(other[w], 0, 0x1032);
        }
        output[w] = 0;
    }

    #pragma unroll 8
    for (int k = 0; k < 8; k++) {
        // Lower 16 result bits: bit k of every word, input/other interleaved.
        #pragma unroll 8
        for (int w = 0; w < 8; w++) {
            output[k] |= ((input[w] >> k) & 1u) << (2*w);
            output[k] |= ((other[w] >> k) & 1u) << (2*w + 1);
        }
        // Upper 16 result bits: bit k+8 of every word, same interleaving.
        #pragma unroll 8
        for (int w = 0; w < 8; w++) {
            output[k] |= ((input[w] >> (k + 8)) & 1u) << (2*w + 16);
            output[k] |= ((other[w] >> (k + 8)) & 1u) << (2*w + 17);
        }
    }
}
|
||||
|
||||
// Gathers bits out of eight bit-sliced words and recombines the partial
// results of a 4-lane thread group into 16 output words.
//
// input  : 8 words per thread. Only read.
// output : 16 words per thread. All 16 are written.
//
// Stage 1 fills the even output words. For k = 0..7 and b = 0..7:
//   output[2*k] bit b     <- bit  8 + k of input[b]
//   output[2*k] bit b + 8 <- bit 24 + k of input[b]
//
// Stage 2 (per even index i, all lanes take part in both shuffles):
//   - threads with an odd threadIdx.x swap the 16-bit halves of output[i],
//   - output[i] keeps its own low half and takes the high half of the word
//     held by the next lane (cyclic in the 4-lane group),
//   - output[i+1] receives the combined word of the lane two positions ahead,
//   - every thread whose threadIdx.x is not a multiple of 4 then clears both
//     words, so only the first thread of each group keeps non-zero data.
//
// NOTE(review): lanes are derived from threadIdx.x alone, so this presumably
// expects a 1-D block in which four consecutive threads cooperate - confirm
// against the calling kernel.
// NOTE(review): uses the legacy mask-less __shfl intrinsic; confirm the
// targeted toolkit/architectures still provide it.
__device__ __forceinline__ void from_bitslice_quad(uint32_t *input, uint32_t *output)
{
    #pragma unroll 8
    for (int k = 0; k < 8; k++) output[2*k] = 0;

    #pragma unroll 8
    for (int k = 0; k < 8; k++) {
        // Low byte of the result: bit 8+k of each input word.
        #pragma unroll 8
        for (int b = 0; b < 8; b++)
            output[2*k] |= ((input[b] >> (8 + k)) & 1u) << b;
        // Second byte of the result: bit 24+k of each input word.
        #pragma unroll 8
        for (int b = 0; b < 8; b++)
            output[2*k] |= ((input[b] >> (24 + k)) & 1u) << (8 + b);
    }

    #pragma unroll 8
    for (int i = 0; i < 16; i+=2) {
        uint32_t own = output[i];
        // selector 0x1032 swaps the low and high 16-bit halves
        if (threadIdx.x & 1) own = __byte_perm(own, 0, 0x1032);

        // selector 0x7610: low half from this lane, high half from the neighbour
        const uint32_t fromNext = __shfl((int)own, (threadIdx.x+1)%4, 4);
        own = __byte_perm(own, fromNext, 0x7610);

        uint32_t fromSecondNext = __shfl((int)own, (threadIdx.x+2)%4, 4);

        if ((threadIdx.x % 4) != 0) {
            own = 0;
            fromSecondNext = 0;
        }
        output[i]   = own;
        output[i+1] = fromSecondNext;
    }
}
|
@ -287,6 +287,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
|
||||
<ClInclude Include="uint256.h" />
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<CudaCompile Include="bitslice_transformations_quad.cu">
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
|
||||
</CudaCompile>
|
||||
<CudaCompile Include="cuda_fugue256.cu">
|
||||
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||
@ -311,6 +317,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
|
||||
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||
</CudaCompile>
|
||||
<CudaCompile Include="groestl_functions_quad.cu">
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
|
||||
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
|
||||
</CudaCompile>
|
||||
<CudaCompile Include="heavy\cuda_blake512.cu">
|
||||
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
|
||||
|
@ -340,5 +340,11 @@
|
||||
<CudaCompile Include="x11\simd_functions.cu">
|
||||
<Filter>Source Files\CUDA\x11</Filter>
|
||||
</CudaCompile>
|
||||
<CudaCompile Include="bitslice_transformations_quad.cu">
|
||||
<Filter>Source Files\CUDA</Filter>
|
||||
</CudaCompile>
|
||||
<CudaCompile Include="groestl_functions_quad.cu">
|
||||
<Filter>Source Files\CUDA</Filter>
|
||||
</CudaCompile>
|
||||
</ItemGroup>
|
||||
</Project>
|
@ -1,4 +1,4 @@
|
||||
AC_INIT([ccminer], [2014.05.10])
|
||||
AC_INIT([ccminer], [2014.06.14])
|
||||
|
||||
AC_PREREQ([2.59c])
|
||||
AC_CANONICAL_SYSTEM
|
||||
|
@ -917,8 +917,8 @@ static void *miner_thread(void *userdata)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (opt_benchmark)
|
||||
if (++rounds == 1) exit(0);
|
||||
// if (opt_benchmark)
|
||||
// if (++rounds == 1) exit(0);
|
||||
|
||||
/* record scanhash elapsed time */
|
||||
gettimeofday(&tv_end, NULL);
|
||||
@ -1469,7 +1469,7 @@ static void signal_handler(int sig)
|
||||
}
|
||||
#endif
|
||||
|
||||
#define PROGRAM_VERSION "1.0"
|
||||
#define PROGRAM_VERSION "1.1"
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
struct thr_info *thr;
|
||||
|
@ -152,7 +152,7 @@
|
||||
#define PACKAGE_NAME "ccminer"
|
||||
|
||||
/* Define to the full name and version of this package. */
|
||||
#define PACKAGE_STRING "ccminer 2014.05.10"
|
||||
#define PACKAGE_STRING "ccminer 2014.06.14"
|
||||
|
||||
/* Define to the one symbol short name of this package. */
|
||||
#undef PACKAGE_TARNAME
|
||||
@ -161,7 +161,7 @@
|
||||
#undef PACKAGE_URL
|
||||
|
||||
/* Define to the version of this package. */
|
||||
#define PACKAGE_VERSION "2014.05.10"
|
||||
#define PACKAGE_VERSION "2014.06.14"
|
||||
|
||||
/* If using the C implementation of alloca, define if you know the
|
||||
direction of stack growth for your system; otherwise it will be
|
||||
|
@ -1,4 +1,4 @@
|
||||
// Auf Groestlcoin spezialisierte Version von Groestl
|
||||
// Auf Groestlcoin spezialisierte Version von Groestl inkl. Bitslice
|
||||
|
||||
#include <cuda.h>
|
||||
#include "cuda_runtime.h"
|
||||
@ -7,9 +7,6 @@
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
|
||||
// it's unfortunate that this is a compile time constant.
|
||||
#define MAXWELL_OR_FERMI 1
|
||||
|
||||
// aus cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
@ -18,456 +15,152 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
|
||||
|
||||
// Folgende Definitionen später durch header ersetzen
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned short uint16_t;
|
||||
typedef unsigned int uint32_t;
|
||||
typedef unsigned long long uint64_t;
|
||||
|
||||
// diese Struktur wird in der Init Funktion angefordert
|
||||
static cudaDeviceProp props;
|
||||
static cudaDeviceProp props[8];
|
||||
|
||||
// globaler Speicher für alle HeftyHashes aller Threads
|
||||
__constant__ uint32_t pTarget[8]; // Single GPU
|
||||
extern uint32_t *d_resultNonce[8];
|
||||
|
||||
__constant__ uint32_t groestlcoin_gpu_msg[32];
|
||||
|
||||
#define SPH_C32(x) ((uint32_t)(x ## U))
|
||||
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
|
||||
// 64 Register Variante für Compute 3.0
|
||||
#include "groestl_functions_quad.cu"
|
||||
#include "bitslice_transformations_quad.cu"
|
||||
|
||||
#define PC32up(j, r) ((uint32_t)((j) + (r)))
|
||||
#define PC32dn(j, r) 0
|
||||
#define QC32up(j, r) 0xFFFFFFFF
|
||||
#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))
|
||||
// Byte-swap a 32-bit word (endianness conversion).
// The argument is fully parenthesised so that low-precedence expressions such as `a | b`
// expand correctly. Note that x is still evaluated four times, so avoid side effects in it.
#define SWAB32(x) ( (((x) & 0x000000FF) << 24) | (((x) & 0x0000FF00) << 8) | (((x) & 0x00FF0000) >> 8) | (((x) & 0xFF000000) >> 24) )
|
||||
|
||||
#define B32_0(x) __byte_perm(x, 0, 0x4440)
|
||||
//((x) & 0xFF)
|
||||
#define B32_1(x) __byte_perm(x, 0, 0x4441)
|
||||
//(((x) >> 8) & 0xFF)
|
||||
#define B32_2(x) __byte_perm(x, 0, 0x4442)
|
||||
//(((x) >> 16) & 0xFF)
|
||||
#define B32_3(x) __byte_perm(x, 0, 0x4443)
|
||||
//((x) >> 24)
|
||||
|
||||
#if MAXWELL_OR_FERMI
|
||||
#define USE_SHARED 1
|
||||
// Maxwell and Fermi cards get the best speed with SHARED access it seems.
|
||||
#if USE_SHARED
|
||||
#define T0up(x) (*((uint32_t*)mixtabs + ( (x))))
|
||||
#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x))))
|
||||
#define T1up(x) (*((uint32_t*)mixtabs + (512+(x))))
|
||||
#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
|
||||
#define T2up(x) (*((uint32_t*)mixtabs + (1024+(x))))
|
||||
#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
|
||||
#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
|
||||
#define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x))))
|
||||
#else
|
||||
#define T0up(x) tex1Dfetch(t0up1, x)
|
||||
#define T0dn(x) tex1Dfetch(t0dn1, x)
|
||||
#define T1up(x) tex1Dfetch(t1up1, x)
|
||||
#define T1dn(x) tex1Dfetch(t1dn1, x)
|
||||
#define T2up(x) tex1Dfetch(t2up1, x)
|
||||
#define T2dn(x) tex1Dfetch(t2dn1, x)
|
||||
#define T3up(x) tex1Dfetch(t3up1, x)
|
||||
#define T3dn(x) tex1Dfetch(t3dn1, x)
|
||||
#endif
|
||||
#else
|
||||
#define USE_SHARED 1
|
||||
// a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5!
|
||||
#define T0up(x) (*((uint32_t*)mixtabs + ( (x))))
|
||||
#define T0dn(x) tex1Dfetch(t0dn1, x)
|
||||
#define T1up(x) tex1Dfetch(t1up1, x)
|
||||
#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
|
||||
#define T2up(x) tex1Dfetch(t2up1, x)
|
||||
#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
|
||||
#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
|
||||
#define T3dn(x) tex1Dfetch(t3dn1, x)
|
||||
#endif
|
||||
|
||||
texture<unsigned int, 1, cudaReadModeElementType> t0up1;
|
||||
texture<unsigned int, 1, cudaReadModeElementType> t0dn1;
|
||||
texture<unsigned int, 1, cudaReadModeElementType> t1up1;
|
||||
texture<unsigned int, 1, cudaReadModeElementType> t1dn1;
|
||||
texture<unsigned int, 1, cudaReadModeElementType> t2up1;
|
||||
texture<unsigned int, 1, cudaReadModeElementType> t2dn1;
|
||||
texture<unsigned int, 1, cudaReadModeElementType> t3up1;
|
||||
texture<unsigned int, 1, cudaReadModeElementType> t3dn1;
|
||||
|
||||
extern uint32_t T0up_cpu[];
|
||||
extern uint32_t T0dn_cpu[];
|
||||
extern uint32_t T1up_cpu[];
|
||||
extern uint32_t T1dn_cpu[];
|
||||
extern uint32_t T2up_cpu[];
|
||||
extern uint32_t T2dn_cpu[];
|
||||
extern uint32_t T3up_cpu[];
|
||||
extern uint32_t T3dn_cpu[];
|
||||
|
||||
// Byte-swap a 32-bit word (endianness conversion).
// The argument is fully parenthesised so that low-precedence expressions such as `a | b`
// expand correctly. Note that x is still evaluated four times, so avoid side effects in it.
#define SWAB32(x) ( (((x) & 0x000000FF) << 24) | (((x) & 0x0000FF00) << 8) | (((x) & 0x00FF0000) >> 8) | (((x) & 0xFF000000) >> 24) )
|
||||
|
||||
|
||||
__device__ __forceinline__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
|
||||
__global__ void __launch_bounds__(256, 4)
|
||||
groestlcoin_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *resNounce)
|
||||
{
|
||||
uint32_t t[32];
|
||||
|
||||
//#pragma unroll 14
|
||||
for(int r=0;r<14;r++)
|
||||
{
|
||||
switch(r)
|
||||
{
|
||||
case 0:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 0); break;
|
||||
case 1:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break;
|
||||
case 2:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break;
|
||||
case 3:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break;
|
||||
case 4:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break;
|
||||
case 5:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break;
|
||||
case 6:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break;
|
||||
case 7:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break;
|
||||
case 8:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break;
|
||||
case 9:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break;
|
||||
case 10:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break;
|
||||
case 11:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break;
|
||||
case 12:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break;
|
||||
case 13:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break;
|
||||
}
|
||||
|
||||
// RBTT
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<32;k+=2)
|
||||
{
|
||||
uint32_t t0_0 = B32_0(a[(k ) & 0x1f]), t9_0 = B32_0(a[(k + 9) & 0x1f]);
|
||||
uint32_t t2_1 = B32_1(a[(k + 2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]);
|
||||
uint32_t t4_2 = B32_2(a[(k + 4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]);
|
||||
uint32_t t6_3 = B32_3(a[(k + 6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]);
|
||||
|
||||
t[k + 0] = T0up( t0_0 ) ^ T1up( t2_1 ) ^ T2up( t4_2 ) ^ T3up( t6_3 ) ^
|
||||
T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 );
|
||||
|
||||
t[k + 1] = T0dn( t0_0 ) ^ T1dn( t2_1 ) ^ T2dn( t4_2 ) ^ T3dn( t6_3 ) ^
|
||||
T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 );
|
||||
}
|
||||
#pragma unroll 32
|
||||
for(int k=0;k<32;k++)
|
||||
a[k] = t[k];
|
||||
}
|
||||
}
|
||||
|
||||
// Groestl permutation Q, applied in place to the 32-word state `a`.
// Runs 14 rounds; each round XORs the round constants (QC32up / QC32dn) into the state and then
// replaces the state with XOR combinations of T-table lookups.
// `mixtabs` is the table base pointer dereferenced by the T*up / T*dn macros in their
// shared-memory form; the caller passes NULL when USE_SHARED is 0.
__device__ __forceinline__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
|
||||
{
|
||||
//#pragma unroll 14
|
||||
for(int r=0;r<14;r++)
|
||||
{
|
||||
uint32_t t[32];
|
||||
|
||||
// Add the round constants for round r: even words are XORed with QC32up, odd words with
// QC32dn. One case per round, so the round number is a literal in every macro expansion.
switch(r)
|
||||
{
|
||||
case 0:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 0); a[(k*2)+1] ^= QC32dn(k * 0x10, 0);} break;
|
||||
case 1:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 1); a[(k*2)+1] ^= QC32dn(k * 0x10, 1);} break;
|
||||
case 2:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 2); a[(k*2)+1] ^= QC32dn(k * 0x10, 2);} break;
|
||||
case 3:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 3); a[(k*2)+1] ^= QC32dn(k * 0x10, 3);} break;
|
||||
case 4:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 4); a[(k*2)+1] ^= QC32dn(k * 0x10, 4);} break;
|
||||
case 5:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 5); a[(k*2)+1] ^= QC32dn(k * 0x10, 5);} break;
|
||||
case 6:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 6); a[(k*2)+1] ^= QC32dn(k * 0x10, 6);} break;
|
||||
case 7:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 7); a[(k*2)+1] ^= QC32dn(k * 0x10, 7);} break;
|
||||
case 8:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 8); a[(k*2)+1] ^= QC32dn(k * 0x10, 8);} break;
|
||||
case 9:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 9); a[(k*2)+1] ^= QC32dn(k * 0x10, 9);} break;
|
||||
case 10:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 10); a[(k*2)+1] ^= QC32dn(k * 0x10, 10);} break;
|
||||
case 11:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 11); a[(k*2)+1] ^= QC32dn(k * 0x10, 11);} break;
|
||||
case 12:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 12); a[(k*2)+1] ^= QC32dn(k * 0x10, 12);} break;
|
||||
case 13:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13);} break;
|
||||
}
|
||||
|
||||
// RBTT
// Each pair of output words t[k], t[k+1] is built from one byte of eight different state
// words (indices wrap modulo 32 via `& 0x1f`), each byte used as an index into the T tables.
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<32;k+=2)
|
||||
{
|
||||
uint32_t t2_0 = B32_0(a[(k + 2) & 0x1f]), t1_0 = B32_0(a[(k + 1) & 0x1f]);
|
||||
uint32_t t6_1 = B32_1(a[(k + 6) & 0x1f]), t5_1 = B32_1(a[(k + 5) & 0x1f]);
|
||||
uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2 = B32_2(a[(k + 9) & 0x1f]);
|
||||
uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]);
|
||||
|
||||
t[k + 0] = T0up( t2_0 ) ^ T1up( t6_1 ) ^ T2up( t10_2 ) ^ T3up( t22_3 ) ^
|
||||
T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn( t9_2 ) ^ T3dn( t13_3 );
|
||||
|
||||
t[k + 1] = T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^
|
||||
T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up( t9_2 ) ^ T3up( t13_3 );
|
||||
}
|
||||
// Write the round result back into the state before the next round.
#pragma unroll 32
|
||||
for(int k=0;k<32;k++)
|
||||
a[k] = t[k];
|
||||
}
|
||||
}
|
||||
#if USE_SHARED
|
||||
__global__ void /* __launch_bounds__(256) */
|
||||
#else
|
||||
__global__ void
|
||||
#endif
|
||||
|
||||
groestlcoin_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce)
|
||||
{
|
||||
#if USE_SHARED
|
||||
extern __shared__ char mixtabs[];
|
||||
|
||||
if (threadIdx.x < 256)
|
||||
{
|
||||
*((uint32_t*)mixtabs + ( threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x);
|
||||
*((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x);
|
||||
*((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x);
|
||||
*((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x);
|
||||
*((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x);
|
||||
*((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x);
|
||||
*((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x);
|
||||
*((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
#endif
|
||||
|
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
if (thread < threads)
|
||||
{
|
||||
// GROESTL
|
||||
uint32_t message[32];
|
||||
uint32_t state[32];
|
||||
|
||||
#pragma unroll 32
|
||||
for(int k=0;k<32;k++) message[k] = groestlcoin_gpu_msg[k];
|
||||
|
||||
uint32_t nounce = startNounce + thread;
|
||||
message[19] = SWAB32(nounce);
|
||||
|
||||
#pragma unroll 32
|
||||
for(int u=0;u<32;u++) state[u] = message[u];
|
||||
state[31] ^= 0x20000;
|
||||
|
||||
// Perm
|
||||
#if USE_SHARED
|
||||
groestlcoin_perm_P(state, mixtabs);
|
||||
state[31] ^= 0x20000;
|
||||
groestlcoin_perm_Q(message, mixtabs);
|
||||
#else
|
||||
groestlcoin_perm_P(state, NULL);
|
||||
state[31] ^= 0x20000;
|
||||
groestlcoin_perm_Q(message, NULL);
|
||||
#endif
|
||||
#pragma unroll 32
|
||||
for(int u=0;u<32;u++) state[u] ^= message[u];
|
||||
|
||||
#pragma unroll 32
|
||||
for(int u=0;u<32;u++) message[u] = state[u];
|
||||
|
||||
#if USE_SHARED
|
||||
groestlcoin_perm_P(message, mixtabs);
|
||||
#else
|
||||
groestlcoin_perm_P(message, NULL);
|
||||
#endif
|
||||
|
||||
#pragma unroll 32
|
||||
for(int u=0;u<32;u++) state[u] ^= message[u];
|
||||
|
||||
////
|
||||
//// 2. Runde groestl
|
||||
////
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) message[k] = state[k + 16];
|
||||
#pragma unroll 14
|
||||
for(int k=1;k<15;k++)
|
||||
message[k+16] = 0;
|
||||
|
||||
message[16] = 0x80;
|
||||
message[31] = 0x01000000;
|
||||
|
||||
#pragma unroll 32
|
||||
for(int u=0;u<32;u++)
|
||||
state[u] = message[u];
|
||||
state[31] ^= 0x20000;
|
||||
|
||||
// Perm
|
||||
#if USE_SHARED
|
||||
groestlcoin_perm_P(state, mixtabs);
|
||||
state[31] ^= 0x20000;
|
||||
groestlcoin_perm_Q(message, mixtabs);
|
||||
#else
|
||||
groestlcoin_perm_P(state, NULL);
|
||||
state[31] ^= 0x20000;
|
||||
groestlcoin_perm_Q(message, NULL);
|
||||
#endif
|
||||
|
||||
#pragma unroll 32
|
||||
for(int u=0;u<32;u++) state[u] ^= message[u];
|
||||
|
||||
#pragma unroll 32
|
||||
for(int u=0;u<32;u++) message[u] = state[u];
|
||||
|
||||
#if USE_SHARED
|
||||
groestlcoin_perm_P(message, mixtabs);
|
||||
#else
|
||||
groestlcoin_perm_P(message, NULL);
|
||||
#endif
|
||||
|
||||
#pragma unroll 32
|
||||
for(int u=0;u<32;u++) state[u] ^= message[u];
|
||||
|
||||
// kopiere Ergebnis
|
||||
int i, position = -1;
|
||||
bool rc = true;
|
||||
|
||||
// durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
|
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4;
|
||||
if (thread < threads)
|
||||
{
|
||||
// GROESTL
|
||||
uint32_t paddedInput[8];
|
||||
#pragma unroll 8
|
||||
for (i = 7; i >= 0; i--) {
|
||||
if (state[i+16] > pTarget[i]) {
|
||||
if(position < i) {
|
||||
position = i;
|
||||
rc = false;
|
||||
}
|
||||
}
|
||||
if (state[i+16] < pTarget[i]) {
|
||||
if(position < i) {
|
||||
position = i;
|
||||
rc = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
for(int k=0;k<8;k++) paddedInput[k] = groestlcoin_gpu_msg[4*k+threadIdx.x%4];
|
||||
|
||||
if(rc == true)
|
||||
if(resNounce[0] > nounce)
|
||||
resNounce[0] = nounce;
|
||||
}
|
||||
uint32_t nounce = startNounce + thread;
|
||||
if ((threadIdx.x % 4) == 3)
|
||||
paddedInput[4] = SWAB32(nounce); // 4*4+3 = 19
|
||||
|
||||
uint32_t msgBitsliced[8];
|
||||
to_bitslice_quad(paddedInput, msgBitsliced);
|
||||
|
||||
uint32_t state[8];
|
||||
for (int round=0; round<2; round++)
|
||||
{
|
||||
groestl512_progressMessage_quad(state, msgBitsliced);
|
||||
|
||||
if (round < 1)
|
||||
{
|
||||
// Verkettung zweier Runden inclusive Padding.
|
||||
msgBitsliced[ 0] = __byte_perm(state[ 0], 0x00800100, 0x4341 + ((threadIdx.x%4)==3)*0x2000);
|
||||
msgBitsliced[ 1] = __byte_perm(state[ 1], 0x00800100, 0x4341);
|
||||
msgBitsliced[ 2] = __byte_perm(state[ 2], 0x00800100, 0x4341);
|
||||
msgBitsliced[ 3] = __byte_perm(state[ 3], 0x00800100, 0x4341);
|
||||
msgBitsliced[ 4] = __byte_perm(state[ 4], 0x00800100, 0x4341);
|
||||
msgBitsliced[ 5] = __byte_perm(state[ 5], 0x00800100, 0x4341);
|
||||
msgBitsliced[ 6] = __byte_perm(state[ 6], 0x00800100, 0x4341);
|
||||
msgBitsliced[ 7] = __byte_perm(state[ 7], 0x00800100, 0x4341 + ((threadIdx.x%4)==0)*0x0010);
|
||||
}
|
||||
}
|
||||
|
||||
// Nur der erste von jeweils 4 Threads bekommt das Ergebnis-Hash
|
||||
uint32_t out_state[16];
|
||||
from_bitslice_quad(state, out_state);
|
||||
|
||||
if (threadIdx.x % 4 == 0)
|
||||
{
|
||||
int i, position = -1;
|
||||
bool rc = true;
|
||||
|
||||
#pragma unroll 8
|
||||
for (i = 7; i >= 0; i--) {
|
||||
if (out_state[i] > pTarget[i]) {
|
||||
if(position < i) {
|
||||
position = i;
|
||||
rc = false;
|
||||
}
|
||||
}
|
||||
if (out_state[i] < pTarget[i]) {
|
||||
if(position < i) {
|
||||
position = i;
|
||||
rc = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if(rc == true)
|
||||
if(resNounce[0] > nounce)
|
||||
resNounce[0] = nounce;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// texDef(texname, texmem, texsource, texsize):
// declares a device pointer `texmem`, allocates `texsize` bytes for it with cudaMalloc, copies
// `texsource` from the host into it, configures the texture reference `texname` for
// unnormalized coordinates, point filtering and clamp addressing, and binds it to the buffer.
// NOTE(review): none of the CUDA return codes are checked, and the allocation is not freed
// anywhere inside this macro.
#define texDef(texname, texmem, texsource, texsize) \
|
||||
unsigned int *texmem; \
|
||||
cudaMalloc(&texmem, texsize); \
|
||||
cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
|
||||
texname.normalized = 0; \
|
||||
texname.filterMode = cudaFilterModePoint; \
|
||||
texname.addressMode[0] = cudaAddressModeClamp; \
|
||||
{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
|
||||
cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
|
||||
|
||||
// Setup-Funktionen
|
||||
// Per-thread GPU setup: selects the device mapped to thr_id, queries its properties, and
// allocates the one-word device buffer d_resultNonce[thr_id] that receives the result nonce.
// NOTE(review): this span comes from a rendered commit diff, so statements appear twice
// (pre- and post-change variants side by side) — confirm against the real source file.
// NOTE(review): `threads` is not referenced in the lines shown, and no CUDA return code is checked.
__host__ void groestlcoin_cpu_init(int thr_id, int threads)
|
||||
{
|
||||
cudaSetDevice(device_map[thr_id]);
|
||||
cudaSetDevice(device_map[thr_id]);
|
||||
|
||||
cudaGetDeviceProperties(&props, device_map[thr_id]);
|
||||
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
|
||||
|
||||
// Initialize the textures using the macro above
|
||||
texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
|
||||
texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
|
||||
texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
|
||||
texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
|
||||
texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
|
||||
texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
|
||||
texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
|
||||
texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
|
||||
|
||||
// Allocate memory for the winning nonce
|
||||
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
|
||||
// Allocate memory for the winning nonce
|
||||
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
|
||||
}
|
||||
|
||||
// Prepares a new block for hashing: builds the 32-word (128-byte) padded message block from the
// 80-byte input `data`, uploads it to the constant symbol groestlcoin_gpu_msg, resets
// d_resultNonce[thr_id] to all-0xFF bytes, and uploads 8 words from pTargetIn to the constant
// symbol pTarget.
// NOTE(review): this span comes from a rendered commit diff, so statements appear twice
// (pre- and post-change variants side by side) — confirm against the real source file.
// NOTE(review): no CUDA return code is checked.
__host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
|
||||
{
|
||||
// Expand the message and upload it
|
||||
uint32_t msgBlock[32];
|
||||
// Expand the message and upload it
|
||||
uint32_t msgBlock[32];
|
||||
|
||||
memset(msgBlock, 0, sizeof(uint32_t) * 32);
|
||||
memcpy(&msgBlock[0], data, 80);
|
||||
memset(msgBlock, 0, sizeof(uint32_t) * 32);
|
||||
memcpy(&msgBlock[0], data, 80);
|
||||
|
||||
// Extend the message to a full message block (padding)
|
||||
// Our message is 80 bytes long
|
||||
// Word 20 is the first word after the 80 message bytes; it receives the 0x80 padding marker.
// The last word is set to 0x01000000 — presumably the block count in the byte order the
// kernel expects; TODO confirm.
msgBlock[20] = 0x80;
|
||||
msgBlock[31] = 0x01000000;
|
||||
// Extend the message to a full message block (padding)
|
||||
// Our message is 80 bytes long
|
||||
msgBlock[20] = 0x80;
|
||||
msgBlock[31] = 0x01000000;
|
||||
|
||||
// groestl512 needs no CPU code for this (the only round is
|
||||
// executed on the GPU)
|
||||
// groestl512 needs no CPU code for this (the only round is
|
||||
// executed on the GPU)
|
||||
|
||||
// Set the block header (the correct nonce and Hefty hash are still missing from it)
|
||||
cudaMemcpyToSymbol( groestlcoin_gpu_msg,
|
||||
msgBlock,
|
||||
128);
|
||||
// Set the block header (the correct nonce and Hefty hash are still missing from it)
|
||||
cudaMemcpyToSymbol( groestlcoin_gpu_msg,
|
||||
msgBlock,
|
||||
128);
|
||||
|
||||
// Byte-wise fill: the result word becomes 0xFFFFFFFF before the target is uploaded.
cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
|
||||
cudaMemcpyToSymbol( pTarget,
|
||||
pTargetIn,
|
||||
sizeof(uint32_t) * 8 );
|
||||
cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
|
||||
cudaMemcpyToSymbol( pTarget,
|
||||
pTargetIn,
|
||||
sizeof(uint32_t) * 8 );
|
||||
}
|
||||
|
||||
// Host-side launcher: resets d_resultNonce[thr_id] to all-0xFF bytes, launches the Groestlcoin
// hash kernel for `threads` nonces starting at startNounce, waits via MyStreamSynchronize, and
// copies the single result word from the device into *nounce.
// NOTE(review): this span comes from a rendered commit diff, so statements appear twice
// (pre- and post-change variants side by side) — confirm against the real source file.
// NOTE(review): `outputHashes` is not referenced in the lines shown; the kernel launch is not
// followed by cudaGetLastError() and no CUDA return code is checked.
__host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
|
||||
{
|
||||
// Compute 3.x and 5.x devices are best driven with 768 threads,
|
||||
// all others with 512 threads.
|
||||
int threadsperblock = (props.major >= 3) ? 768 : 512;
|
||||
int threadsperblock = 256;
|
||||
|
||||
// compute how many thread blocks we need
|
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||
dim3 block(threadsperblock);
|
||||
// Compute 3.0 uses the register-optimized quad variant with warp shuffle
|
||||
// with the quad functions we now need 4 threads per hash, hence the factor 4 in the block count
|
||||
int factor = 4;
|
||||
|
||||
// size of the dynamic shared memory region
|
||||
#if USE_SHARED
|
||||
size_t shared_size = 8 * 256 * sizeof(uint32_t);
|
||||
#else
|
||||
size_t shared_size = 0;
|
||||
#endif
|
||||
// compute how many thread blocks we need
|
||||
dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
|
||||
dim3 block(threadsperblock);
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
//fprintf(stderr, "ThrID: %d\n", thr_id);
|
||||
cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
|
||||
groestlcoin_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
|
||||
// size of the dynamic shared memory region
|
||||
size_t shared_size = 0;
|
||||
|
||||
// strategic sleep command to reduce CPU load
|
||||
MyStreamSynchronize(NULL, 0, thr_id);
|
||||
cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
|
||||
groestlcoin_gpu_hash_quad<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
|
||||
|
||||
cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
||||
// strategic sleep command to reduce CPU load
|
||||
MyStreamSynchronize(NULL, 0, thr_id);
|
||||
|
||||
cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
|
||||
}
|
||||
|
@ -1,4 +1,4 @@
|
||||
// Auf Myriadcoin spezialisierte Version von Groestl
|
||||
// Auf Myriadcoin spezialisierte Version von Groestl inkl. Bitslice
|
||||
|
||||
#include <cuda.h>
|
||||
#include "cuda_runtime.h"
|
||||
@ -7,9 +7,6 @@
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
|
||||
// it's unfortunate that this is a compile time constant.
|
||||
#define MAXWELL_OR_FERMI 1
|
||||
|
||||
// aus cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
@ -22,30 +19,49 @@ typedef unsigned short uint16_t;
|
||||
typedef unsigned int uint32_t;
|
||||
|
||||
// diese Struktur wird in der Init Funktion angefordert
|
||||
static cudaDeviceProp props;
|
||||
static cudaDeviceProp props[8];
|
||||
|
||||
// globaler Speicher für alle HeftyHashes aller Threads
|
||||
__constant__ uint32_t pTarget[8]; // Single GPU
|
||||
uint32_t *d_outputHashes[8];
|
||||
extern uint32_t *d_resultNonce[8];
|
||||
|
||||
__constant__ uint32_t myriadgroestl_gpu_msg[32];
|
||||
|
||||
// muss expandiert werden
|
||||
__constant__ uint32_t myr_sha256_gpu_constantTable[64];
|
||||
__constant__ uint32_t myr_sha256_gpu_constantTable2[64];
|
||||
__constant__ uint32_t myr_sha256_gpu_hashTable[8];
|
||||
|
||||
uint32_t myr_sha256_cpu_hashTable[] = {
|
||||
0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
|
||||
0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
|
||||
uint32_t myr_sha256_cpu_constantTable[] = {
|
||||
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
|
||||
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
|
||||
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
|
||||
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
|
||||
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
|
||||
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
|
||||
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
|
||||
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
|
||||
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
|
||||
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
|
||||
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
|
||||
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
|
||||
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
|
||||
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
|
||||
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
|
||||
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
|
||||
};
|
||||
|
||||
uint32_t myr_sha256_cpu_w2Table[] = {
|
||||
0x80000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
|
||||
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000200,
|
||||
0x80000000, 0x01400000, 0x00205000, 0x00005088, 0x22000800, 0x22550014, 0x05089742, 0xa0000020,
|
||||
0x5a880000, 0x005c9400, 0x0016d49d, 0xfa801f00, 0xd33225d0, 0x11675959, 0xf6e6bfda, 0xb30c1549,
|
||||
0x08b2b050, 0x9d7c4c27, 0x0ce2a393, 0x88e6e1ea, 0xa52b4335, 0x67a16f49, 0xd732016f, 0x4eeb2e91,
|
||||
0x5dbf55e5, 0x8eee2335, 0xe2bc5ec2, 0xa83f4394, 0x45ad78f7, 0x36f3d0cd, 0xd99c05e8, 0xb0511dc7,
|
||||
0x69bc7ac4, 0xbd11375b, 0xe3ba71e5, 0x3b209ff2, 0x18feee17, 0xe25ad9e7, 0x13375046, 0x0515089d,
|
||||
0x4f0d0f04, 0x2627484e, 0x310128d2, 0xc668b434, 0x420841cc, 0x62d311b8, 0xe59ba771, 0x85a7a484 };
|
||||
|
||||
// 64 Register Variante für Compute 3.0
|
||||
#include "groestl_functions_quad.cu"
|
||||
#include "bitslice_transformations_quad.cu"
|
||||
|
||||
// Byte-swap a 32-bit word (endianness conversion).
// The argument is fully parenthesised so that low-precedence expressions such as `a | b`
// expand correctly. Note that x is still evaluated four times, so avoid side effects in it.
#define SWAB32(x) ( (((x) & 0x000000FF) << 24) | (((x) & 0x0000FF00) << 8) | (((x) & 0x00FF0000) >> 8) | (((x) & 0xFF000000) >> 24) )
|
||||
|
||||
#if __CUDA_ARCH__ < 350
|
||||
// Kepler (Compute 3.0)
|
||||
#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
|
||||
@ -53,452 +69,219 @@ uint32_t myr_sha256_cpu_constantTable[] = {
|
||||
// Kepler (Compute 3.5)
|
||||
#define ROTR32(x, n) __funnelshift_r( (x), (x), (n) )
|
||||
#endif
|
||||
#define R(x, n) ((x) >> (n))
|
||||
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
|
||||
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
|
||||
#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22))
|
||||
#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25))
|
||||
#define s0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ R(x, 3))
|
||||
#define s1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ R(x, 10))
|
||||
|
||||
// Byte-swap a 32-bit word (endianness conversion).
// The argument is fully parenthesised so that low-precedence expressions such as `a | b`
// expand correctly. Note that x is still evaluated four times, so avoid side effects in it.
#define SWAB32(x) ( (((x) & 0x000000FF) << 24) | (((x) & 0x0000FF00) << 8) | (((x) & 0x00FF0000) >> 8) | (((x) & 0xFF000000) >> 24) )
|
||||
#define R(x, n) ((x) >> (n))
|
||||
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
|
||||
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
|
||||
#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22))
|
||||
#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25))
|
||||
#define s0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ R(x, 3))
|
||||
#define s1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ R(x, 10))
|
||||
|
||||
__device__ void myriadgroestl_gpu_sha256(uint32_t *message)
|
||||
{
|
||||
uint32_t W1[16];
|
||||
uint32_t W2[16];
|
||||
uint32_t W1[16];
|
||||
uint32_t W2[16];
|
||||
|
||||
// Initialisiere die register a bis h mit der Hash-Tabelle
|
||||
uint32_t regs[8];
|
||||
uint32_t hash[8];
|
||||
// Initialisiere die register a bis h mit der Hash-Tabelle
|
||||
uint32_t regs[8];
|
||||
uint32_t hash[8];
|
||||
|
||||
// pre
|
||||
// pre
|
||||
#pragma unroll 8
|
||||
for (int k=0; k < 8; k++)
|
||||
{
|
||||
regs[k] = myr_sha256_gpu_hashTable[k];
|
||||
hash[k] = regs[k];
|
||||
}
|
||||
|
||||
for (int k=0; k < 8; k++)
|
||||
{
|
||||
regs[k] = myr_sha256_gpu_hashTable[k];
|
||||
hash[k] = regs[k];
|
||||
}
|
||||
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++)
|
||||
W1[k] = SWAB32(message[k]);
|
||||
for(int k=0;k<16;k++)
|
||||
W1[k] = SWAB32(message[k]);
|
||||
|
||||
// Progress W1
|
||||
#pragma unroll 16
|
||||
for(int j=0;j<16;j++)
|
||||
{
|
||||
uint32_t T1, T2;
|
||||
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j] + W1[j];
|
||||
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
|
||||
|
||||
#pragma unroll 7
|
||||
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
|
||||
regs[0] = T1 + T2;
|
||||
regs[4] += T1;
|
||||
}
|
||||
for(int j=0;j<16;j++)
|
||||
{
|
||||
uint32_t T1, T2;
|
||||
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j] + W1[j];
|
||||
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
|
||||
|
||||
#pragma unroll 7
|
||||
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
|
||||
regs[0] = T1 + T2;
|
||||
regs[4] += T1;
|
||||
}
|
||||
|
||||
// Progress W2...W3
|
||||
#pragma unroll 3
|
||||
for(int k=0;k<3;k++)
|
||||
{
|
||||
////// PART 1
|
||||
#pragma unroll 2
|
||||
for(int j=0;j<2;j++)
|
||||
W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
|
||||
for(int j=0;j<2;j++)
|
||||
W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
|
||||
#pragma unroll 5
|
||||
for(int j=2;j<7;j++)
|
||||
W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];
|
||||
for(int j=2;j<7;j++)
|
||||
W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];
|
||||
|
||||
#pragma unroll 8
|
||||
for(int j=7;j<15;j++)
|
||||
W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];
|
||||
for(int j=7;j<15;j++)
|
||||
W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];
|
||||
|
||||
W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
|
||||
W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
|
||||
|
||||
// Rundenfunktion
|
||||
// Rundenfunktion
|
||||
#pragma unroll 16
|
||||
for(int j=0;j<16;j++)
|
||||
{
|
||||
uint32_t T1, T2;
|
||||
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j];
|
||||
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
|
||||
|
||||
#pragma unroll 7
|
||||
for (int l=6; l >= 0; l--) regs[l+1] = regs[l];
|
||||
regs[0] = T1 + T2;
|
||||
regs[4] += T1;
|
||||
}
|
||||
for(int j=0;j<16;j++)
|
||||
{
|
||||
uint32_t T1, T2;
|
||||
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16] + W2[j];
|
||||
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
|
||||
|
||||
#pragma unroll 7
|
||||
for (int l=6; l >= 0; l--) regs[l+1] = regs[l];
|
||||
regs[0] = T1 + T2;
|
||||
regs[4] += T1;
|
||||
}
|
||||
|
||||
////// PART 2
|
||||
#pragma unroll 2
|
||||
for(int j=0;j<2;j++)
|
||||
W1[j] = s1(W2[14+j]) + W2[9+j] + s0(W2[1+j]) + W2[j];
|
||||
#pragma unroll 5
|
||||
for(int j=2;j<7;j++)
|
||||
W1[j] = s1(W1[j-2]) + W2[9+j] + s0(W2[1+j]) + W2[j];
|
||||
|
||||
#pragma unroll 8
|
||||
for(int j=7;j<15;j++)
|
||||
W1[j] = s1(W1[j-2]) + W1[j-7] + s0(W2[1+j]) + W2[j];
|
||||
|
||||
W1[15] = s1(W1[13]) + W1[8] + s0(W1[0]) + W2[15];
|
||||
|
||||
// Rundenfunktion
|
||||
#pragma unroll 16
|
||||
for(int j=0;j<16;j++)
|
||||
W1[j] = W2[j];
|
||||
}
|
||||
for(int j=0;j<16;j++)
|
||||
{
|
||||
uint32_t T1, T2;
|
||||
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 32] + W1[j];
|
||||
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
|
||||
|
||||
#pragma unroll 7
|
||||
for (int l=6; l >= 0; l--) regs[l+1] = regs[l];
|
||||
regs[0] = T1 + T2;
|
||||
regs[4] += T1;
|
||||
}
|
||||
|
||||
////// PART 3
|
||||
#pragma unroll 2
|
||||
for(int j=0;j<2;j++)
|
||||
W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
|
||||
#pragma unroll 5
|
||||
for(int j=2;j<7;j++)
|
||||
W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];
|
||||
|
||||
#pragma unroll 8
|
||||
for(int k=0;k<8;k++)
|
||||
hash[k] += regs[k];
|
||||
for(int j=7;j<15;j++)
|
||||
W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];
|
||||
|
||||
W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
|
||||
|
||||
// Rundenfunktion
|
||||
#pragma unroll 16
|
||||
for(int j=0;j<16;j++)
|
||||
{
|
||||
uint32_t T1, T2;
|
||||
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 48] + W2[j];
|
||||
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
|
||||
|
||||
#pragma unroll 7
|
||||
for (int l=6; l >= 0; l--) regs[l+1] = regs[l];
|
||||
regs[0] = T1 + T2;
|
||||
regs[4] += T1;
|
||||
}
|
||||
|
||||
/////
|
||||
///// Zweite Runde (wegen Msg-Padding)
|
||||
/////
|
||||
#pragma unroll 8
|
||||
for(int k=0;k<8;k++)
|
||||
regs[k] = hash[k];
|
||||
for(int k=0;k<8;k++)
|
||||
hash[k] += regs[k];
|
||||
|
||||
W1[0] = SWAB32(0x80);
|
||||
#pragma unroll 14
|
||||
for(int k=1;k<15;k++)
|
||||
W1[k] = 0;
|
||||
W1[15] = 512;
|
||||
/////
|
||||
///// Zweite Runde (wegen Msg-Padding)
|
||||
/////
|
||||
#pragma unroll 8
|
||||
for(int k=0;k<8;k++)
|
||||
regs[k] = hash[k];
|
||||
|
||||
// Progress W1
|
||||
#pragma unroll 16
|
||||
for(int j=0;j<16;j++)
|
||||
{
|
||||
uint32_t T1, T2;
|
||||
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j] + W1[j];
|
||||
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
|
||||
|
||||
#pragma unroll 7
|
||||
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
|
||||
regs[0] = T1 + T2;
|
||||
regs[4] += T1;
|
||||
}
|
||||
|
||||
// Progress W2...W3
|
||||
#pragma unroll 3
|
||||
for(int k=0;k<3;k++)
|
||||
{
|
||||
#pragma unroll 2
|
||||
for(int j=0;j<2;j++)
|
||||
W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
|
||||
#pragma unroll 5
|
||||
for(int j=2;j<7;j++)
|
||||
W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];
|
||||
#pragma unroll 64
|
||||
for(int j=0;j<64;j++)
|
||||
{
|
||||
uint32_t T1, T2;
|
||||
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable2[j];
|
||||
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
|
||||
|
||||
#pragma unroll 7
|
||||
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
|
||||
regs[0] = T1 + T2;
|
||||
regs[4] += T1;
|
||||
}
|
||||
|
||||
#pragma unroll 8
|
||||
for(int j=7;j<15;j++)
|
||||
W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];
|
||||
for(int k=0;k<8;k++)
|
||||
hash[k] += regs[k];
|
||||
|
||||
W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
|
||||
|
||||
// Rundenfunktion
|
||||
#pragma unroll 16
|
||||
for(int j=0;j<16;j++)
|
||||
{
|
||||
uint32_t T1, T2;
|
||||
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j];
|
||||
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
|
||||
|
||||
#pragma unroll 7
|
||||
for (int l=6; l >= 0; l--) regs[l+1] = regs[l];
|
||||
regs[0] = T1 + T2;
|
||||
regs[4] += T1;
|
||||
}
|
||||
|
||||
#pragma unroll 16
|
||||
for(int j=0;j<16;j++)
|
||||
W1[j] = W2[j];
|
||||
}
|
||||
//// FERTIG
|
||||
|
||||
#pragma unroll 8
|
||||
for(int k=0;k<8;k++)
|
||||
hash[k] += regs[k];
|
||||
|
||||
//// FERTIG
|
||||
|
||||
#pragma unroll 8
|
||||
for(int k=0;k<8;k++)
|
||||
message[k] = SWAB32(hash[k]);
|
||||
for(int k=0;k<8;k++)
|
||||
message[k] = SWAB32(hash[k]);
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
// Constant helpers
// ---------------------------------------------------------------------------
// SPH_C32 pastes a U suffix onto a literal and casts it to uint32_t;
// SPH_T32 masks a value with 0xFFFFFFFF.
#define SPH_C32(x) ((uint32_t)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))

// Round-constant words XORed into the state by the permutation rounds.
// j is the per-column offset (the rounds pass k * 0x10), r is the round index.
#define PC32up(j, r) ((uint32_t)((j) + (r)))
#define PC32dn(j, r) 0
#define QC32up(j, r) 0xFFFFFFFF
#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))

// Byte extraction via __byte_perm; the arithmetic equivalent is given on the right.
#define B32_0(x) __byte_perm(x, 0, 0x4440)   // ((x) & 0xFF)
#define B32_1(x) __byte_perm(x, 0, 0x4441)   // (((x) >> 8) & 0xFF)
#define B32_2(x) __byte_perm(x, 0, 0x4442)   // (((x) >> 16) & 0xFF)
#define B32_3(x) __byte_perm(x, 0, 0x4443)   // ((x) >> 24)

// ---------------------------------------------------------------------------
// Lookup-table access. `mixtabs` must be in scope where these macros expand;
// it is read as an array of uint32_t holding eight consecutive 256-entry
// tables (T0up, T0dn, T1up, T1dn, T2up, T2dn, T3up, T3dn).
// ---------------------------------------------------------------------------
#if MAXWELL_OR_FERMI
#define USE_SHARED 1
// Shared-memory lookups appear to be the fastest choice on Maxwell and Fermi.
#if USE_SHARED
#define T0up(x) (((uint32_t*)mixtabs)[(x)])
#define T0dn(x) (((uint32_t*)mixtabs)[256 + (x)])
#define T1up(x) (((uint32_t*)mixtabs)[512 + (x)])
#define T1dn(x) (((uint32_t*)mixtabs)[768 + (x)])
#define T2up(x) (((uint32_t*)mixtabs)[1024 + (x)])
#define T2dn(x) (((uint32_t*)mixtabs)[1280 + (x)])
#define T3up(x) (((uint32_t*)mixtabs)[1536 + (x)])
#define T3dn(x) (((uint32_t*)mixtabs)[1792 + (x)])
#else
#define T0up(x) tex1Dfetch(t0up1, x)
#define T0dn(x) tex1Dfetch(t0dn1, x)
#define T1up(x) tex1Dfetch(t1up1, x)
#define T1dn(x) tex1Dfetch(t1dn1, x)
#define T2up(x) tex1Dfetch(t2up1, x)
#define T2dn(x) tex1Dfetch(t2dn1, x)
#define T3up(x) tex1Dfetch(t3up1, x)
#define T3dn(x) tex1Dfetch(t3dn1, x)
#endif
#else
#define USE_SHARED 1
// On Compute 3.0 and 3.5 a mix of shared-memory and texture lookups is fastest.
#define T0up(x) (((uint32_t*)mixtabs)[(x)])
#define T0dn(x) tex1Dfetch(t0dn1, x)
#define T1up(x) tex1Dfetch(t1up1, x)
#define T1dn(x) (((uint32_t*)mixtabs)[768 + (x)])
#define T2up(x) tex1Dfetch(t2up1, x)
#define T2dn(x) (((uint32_t*)mixtabs)[1280 + (x)])
#define T3up(x) (((uint32_t*)mixtabs)[1536 + (x)])
#define T3dn(x) tex1Dfetch(t3dn1, x)
#endif

// Texture references used by the tex1Dfetch variants above.
texture<unsigned int, 1, cudaReadModeElementType> t0up1;
texture<unsigned int, 1, cudaReadModeElementType> t0dn1;
texture<unsigned int, 1, cudaReadModeElementType> t1up1;
texture<unsigned int, 1, cudaReadModeElementType> t1dn1;
texture<unsigned int, 1, cudaReadModeElementType> t2up1;
texture<unsigned int, 1, cudaReadModeElementType> t2dn1;
texture<unsigned int, 1, cudaReadModeElementType> t3up1;
texture<unsigned int, 1, cudaReadModeElementType> t3dn1;

// Host-side copies of the tables, defined in another translation unit.
extern uint32_t T0up_cpu[], T0dn_cpu[];
extern uint32_t T1up_cpu[], T1dn_cpu[];
extern uint32_t T2up_cpu[], T2dn_cpu[];
extern uint32_t T3up_cpu[], T3dn_cpu[];

// Reverses the byte order of a 32-bit value. The argument is not parenthesised
// inside the expansion, so pass a simple operand (variable or literal).
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
|
||||
|
||||
|
||||
__device__ __forceinline__ void myriadgroestl_perm_P(uint32_t *a, char *mixtabs)
|
||||
__global__ void __launch_bounds__(256, 4)
|
||||
myriadgroestl_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *hashBuffer)
|
||||
{
|
||||
uint32_t t[32];
|
||||
// durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
|
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4;
|
||||
if (thread < threads)
|
||||
{
|
||||
// GROESTL
|
||||
uint32_t paddedInput[8];
|
||||
#pragma unroll 8
|
||||
for(int k=0;k<8;k++) paddedInput[k] = myriadgroestl_gpu_msg[4*k+threadIdx.x%4];
|
||||
|
||||
//#pragma unroll 14
|
||||
for(int r=0;r<14;r++)
|
||||
{
|
||||
switch(r)
|
||||
{
|
||||
case 0:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 0); break;
|
||||
case 1:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break;
|
||||
case 2:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break;
|
||||
case 3:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break;
|
||||
case 4:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break;
|
||||
case 5:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break;
|
||||
case 6:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break;
|
||||
case 7:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break;
|
||||
case 8:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break;
|
||||
case 9:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break;
|
||||
case 10:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break;
|
||||
case 11:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break;
|
||||
case 12:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break;
|
||||
case 13:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break;
|
||||
}
|
||||
uint32_t nounce = startNounce + thread;
|
||||
if ((threadIdx.x % 4) == 3)
|
||||
paddedInput[4] = SWAB32(nounce); // 4*4+3 = 19
|
||||
|
||||
// RBTT
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<32;k+=2)
|
||||
uint32_t msgBitsliced[8];
|
||||
to_bitslice_quad(paddedInput, msgBitsliced);
|
||||
|
||||
uint32_t state[8];
|
||||
|
||||
groestl512_progressMessage_quad(state, msgBitsliced);
|
||||
|
||||
uint32_t out_state[16];
|
||||
from_bitslice_quad(state, out_state);
|
||||
|
||||
if ((threadIdx.x & 0x03) == 0)
|
||||
{
|
||||
uint32_t t0_0 = B32_0(a[(k ) & 0x1f]), t9_0 = B32_0(a[(k + 9) & 0x1f]);
|
||||
uint32_t t2_1 = B32_1(a[(k + 2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]);
|
||||
uint32_t t4_2 = B32_2(a[(k + 4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]);
|
||||
uint32_t t6_3 = B32_3(a[(k + 6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]);
|
||||
|
||||
t[k + 0] = T0up( t0_0 ) ^ T1up( t2_1 ) ^ T2up( t4_2 ) ^ T3up( t6_3 ) ^
|
||||
T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 );
|
||||
|
||||
t[k + 1] = T0dn( t0_0 ) ^ T1dn( t2_1 ) ^ T2dn( t4_2 ) ^ T3dn( t6_3 ) ^
|
||||
T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 );
|
||||
uint32_t *outpHash = &hashBuffer[16 * thread];
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) outpHash[k] = out_state[k];
|
||||
}
|
||||
#pragma unroll 32
|
||||
for(int k=0;k<32;k++)
|
||||
a[k] = t[k];
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Runs 14 rounds of the "Q" permutation in place on a 32-word state.
 *
 * Each round
 *   1. XORs the round constants QC32up / QC32dn into every even / odd word
 *      (column offset k * 0x10, round index r), then
 *   2. rebuilds every word pair as the XOR of eight table lookups (T0..T3,
 *      "up" and "dn" variants), each indexed by a single byte taken from a
 *      state word at a fixed offset relative to the pair.
 *
 * @param a        state, 32 x uint32_t, overwritten with the result
 * @param mixtabs  table base referenced by the T*up / T*dn macros when they
 *                 are configured for direct memory access; callers in this
 *                 file pass the shared-memory table copy, or NULL when
 *                 USE_SHARED is off
 *
 * The original spelled the constant addition out as a 14-way switch whose
 * cases differed only in the round literal; the round index is now passed to
 * the macros directly, which yields the same values for r = 0..13.
 */
__device__ __forceinline__ void myriadgroestl_perm_Q(uint32_t *a, char *mixtabs)
{
    //#pragma unroll 14
    for(int r=0;r<14;r++)
    {
        uint32_t t[32];

        // Add the round constants for round r.
#pragma unroll 16
        for(int k=0;k<16;k++)
        {
            a[(k*2)+0] ^= QC32up(k * 0x10, r);
            a[(k*2)+1] ^= QC32dn(k * 0x10, r);
        }

        // RBTT: byte selection and table lookups, two output words per step.
#pragma unroll 16
        for(int k=0;k<32;k+=2)
        {
            // Indices wrap modulo 32 (& 0x1f) around the state.
            uint32_t t2_0  = B32_0(a[(k +  2) & 0x1f]), t1_0  = B32_0(a[(k +  1) & 0x1f]);
            uint32_t t6_1  = B32_1(a[(k +  6) & 0x1f]), t5_1  = B32_1(a[(k +  5) & 0x1f]);
            uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2  = B32_2(a[(k +  9) & 0x1f]);
            uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]);

            t[k + 0] = T0up( t2_0 ) ^ T1up( t6_1 ) ^ T2up( t10_2 ) ^ T3up( t22_3 ) ^
                       T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn( t9_2 ) ^ T3dn( t13_3 );

            t[k + 1] = T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^
                       T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up( t9_2 ) ^ T3up( t13_3 );
        }

        // Commit the new state only after every lookup has read the old one.
#pragma unroll 32
        for(int k=0;k<32;k++)
            a[k] = t[k];
    }
}
|
||||
|
||||
__global__ void
|
||||
myriadgroestl_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce)
|
||||
__global__ void
|
||||
myriadgroestl_gpu_hash_quad2(int threads, uint32_t startNounce, uint32_t *resNounce, uint32_t *hashBuffer)
|
||||
{
|
||||
#if USE_SHARED
|
||||
extern __shared__ char mixtabs[];
|
||||
|
||||
if (threadIdx.x < 256)
|
||||
{
|
||||
*((uint32_t*)mixtabs + ( threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x);
|
||||
*((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x);
|
||||
*((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x);
|
||||
*((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x);
|
||||
*((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x);
|
||||
*((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x);
|
||||
*((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x);
|
||||
*((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
#endif
|
||||
|
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
if (thread < threads)
|
||||
{
|
||||
// GROESTL
|
||||
uint32_t message[32];
|
||||
uint32_t state[32];
|
||||
|
||||
#pragma unroll 32
|
||||
for(int k=0;k<32;k++) message[k] = myriadgroestl_gpu_msg[k];
|
||||
|
||||
uint32_t nounce = startNounce + thread;
|
||||
message[19] = SWAB32(nounce);
|
||||
|
||||
#pragma unroll 32
|
||||
for(int u=0;u<32;u++) state[u] = message[u];
|
||||
state[31] ^= 0x20000;
|
||||
|
||||
// Perm
|
||||
#if USE_SHARED
|
||||
myriadgroestl_perm_P(state, mixtabs);
|
||||
state[31] ^= 0x20000;
|
||||
myriadgroestl_perm_Q(message, mixtabs);
|
||||
#else
|
||||
myriadgroestl_perm_P(state, NULL);
|
||||
state[31] ^= 0x20000;
|
||||
myriadgroestl_perm_Q(message, NULL);
|
||||
#endif
|
||||
#pragma unroll 32
|
||||
for(int u=0;u<32;u++) state[u] ^= message[u];
|
||||
|
||||
#pragma unroll 32
|
||||
for(int u=0;u<32;u++) message[u] = state[u];
|
||||
|
||||
#if USE_SHARED
|
||||
myriadgroestl_perm_P(message, mixtabs);
|
||||
#else
|
||||
myriadgroestl_perm_P(message, NULL);
|
||||
#endif
|
||||
|
||||
#pragma unroll 32
|
||||
for(int u=0;u<32;u++) state[u] ^= message[u];
|
||||
uint32_t nounce = startNounce + thread;
|
||||
|
||||
uint32_t out_state[16];
|
||||
uint32_t *inpHash = &hashBuffer[16 * thread];
|
||||
#pragma unroll 16
|
||||
for(int u=0;u<16;u++) out_state[u] = state[u+16];
|
||||
for (int i=0; i < 16; i++)
|
||||
out_state[i] = inpHash[i];
|
||||
|
||||
myriadgroestl_gpu_sha256(out_state);
|
||||
|
||||
int i, position = -1;
|
||||
@ -526,43 +309,35 @@ myriadgroestl_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce)
|
||||
}
|
||||
}
|
||||
|
||||
// texDef(texname, texmem, texsource, texsize)
//   Declares a device pointer `texmem` in the enclosing scope, configures the
//   texture reference `texname` (unnormalised coordinates, point filtering,
//   clamp addressing), allocates `texsize` bytes on the device, uploads
//   `texsource` into it and binds the buffer to `texname`.
//   No CUDA return code is checked.
#define texDef(texname, texmem, texsource, texsize) \
    unsigned int *texmem; \
    texname.normalized = 0; \
    texname.filterMode = cudaFilterModePoint; \
    texname.addressMode[0] = cudaAddressModeClamp; \
    cudaMalloc(&texmem, texsize); \
    cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
    { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
      cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); }
|
||||
|
||||
// Setup-Funktionen
|
||||
__host__ void myriadgroestl_cpu_init(int thr_id, int threads)
|
||||
{
|
||||
cudaSetDevice(device_map[thr_id]);
|
||||
|
||||
cudaMemcpyToSymbol( myr_sha256_gpu_hashTable,
|
||||
myr_sha256_cpu_hashTable,
|
||||
sizeof(uint32_t) * 8 );
|
||||
cudaSetDevice(device_map[thr_id]);
|
||||
|
||||
cudaMemcpyToSymbol( myr_sha256_gpu_hashTable,
|
||||
myr_sha256_cpu_hashTable,
|
||||
sizeof(uint32_t) * 8 );
|
||||
|
||||
cudaMemcpyToSymbol( myr_sha256_gpu_constantTable,
|
||||
myr_sha256_cpu_constantTable,
|
||||
sizeof(uint32_t) * 64 );
|
||||
cudaMemcpyToSymbol( myr_sha256_gpu_constantTable,
|
||||
myr_sha256_cpu_constantTable,
|
||||
sizeof(uint32_t) * 64 );
|
||||
|
||||
cudaGetDeviceProperties(&props, device_map[thr_id]);
|
||||
// zweite CPU-Tabelle bauen und auf die GPU laden
|
||||
uint32_t temp[64];
|
||||
for(int i=0;i<64;i++)
|
||||
temp[i] = myr_sha256_cpu_w2Table[i] + myr_sha256_cpu_constantTable[i];
|
||||
|
||||
// Texturen mit obigem Makro initialisieren
|
||||
texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
|
||||
texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
|
||||
texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
|
||||
texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
|
||||
texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
|
||||
texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
|
||||
texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
|
||||
texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
|
||||
cudaMemcpyToSymbol( myr_sha256_gpu_constantTable2,
|
||||
temp,
|
||||
sizeof(uint32_t) * 64 );
|
||||
|
||||
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
|
||||
|
||||
// Speicher für Gewinner-Nonce belegen
|
||||
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
|
||||
|
||||
// Speicher für temporäreHashes
|
||||
cudaMalloc(&d_outputHashes[thr_id], 16*sizeof(uint32_t)*threads);
|
||||
}
|
||||
|
||||
__host__ void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
|
||||
@ -594,25 +369,23 @@ __host__ void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn
|
||||
|
||||
__host__ void myriadgroestl_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
|
||||
{
|
||||
// Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern,
|
||||
// alle anderen mit 512 Threads.
|
||||
int threadsperblock = (props.major >= 3) ? 768 : 512;
|
||||
int threadsperblock = 256;
|
||||
|
||||
// Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
|
||||
// mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
|
||||
const int factor=4;
|
||||
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 0;
|
||||
|
||||
cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
|
||||
// berechne wie viele Thread Blocks wir brauchen
|
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||
dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
|
||||
dim3 block(threadsperblock);
|
||||
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
#if USE_SHARED
|
||||
size_t shared_size = 8 * 256 * sizeof(uint32_t);
|
||||
#else
|
||||
size_t shared_size = 0;
|
||||
#endif
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
//fprintf(stderr, "ThrID: %d\n", thr_id);
|
||||
cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
|
||||
myriadgroestl_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
|
||||
myriadgroestl_gpu_hash_quad<<<grid, block, shared_size>>>(threads, startNounce, d_outputHashes[thr_id]);
|
||||
dim3 grid2((threads + threadsperblock-1)/threadsperblock);
|
||||
myriadgroestl_gpu_hash_quad2<<<grid2, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id], d_outputHashes[thr_id]);
|
||||
|
||||
// Strategisches Sleep Kommando zur Senkung der CPU Last
|
||||
MyStreamSynchronize(NULL, 0, thr_id);
|
||||
|
@ -85,7 +85,6 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
|
||||
{
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
// TODO: entfernen für eine Release! Ist nur zum Testen!
|
||||
if (opt_benchmark)
|
||||
((uint32_t*)ptarget)[7] = 0x0000ff;
|
||||
|
||||
|
315
groestl_functions_quad.cu
Normal file
315
groestl_functions_quad.cu
Normal file
@ -0,0 +1,315 @@
|
||||
|
||||
/**
 * In-place update of eight words: every word moves up one index, the former
 * regs[7] wraps around to regs[0] and is additionally XORed into the new
 * regs[1], regs[3] and regs[4].
 *
 * NOTE(review): per the name this is presumably a bit-sliced multiplication
 * by 2 in GF(2^8), with the wrapped word acting as the reduction feedback --
 * confirm against the Groestl specification.
 *
 * @param regs  eight uint32_t words, modified in place
 */
__device__ __forceinline__ void G256_Mul2(uint32_t *regs)
{
    const uint32_t top = regs[7];

    // Move every word one position up ...
#pragma unroll 7
    for (int i = 7; i > 0; --i)
        regs[i] = regs[i - 1];

    // ... and feed the former top word back into positions 0, 1, 3 and 4.
    regs[0]  = top;
    regs[1] ^= top;
    regs[3] ^= top;
    regs[4] ^= top;
}
|
||||
|
||||
/**
 * Round-constant step used by the quad "Q" permutation.
 *
 * Every thread complements all eight words. The thread whose index within
 * its group of four is 3 additionally XORs constants into the upper 16 bits:
 * fixed patterns into x4..x7, and 0xFFFF0000 into x0..x3 wherever the
 * corresponding bit (0..3) of `round` is set.
 *
 * @param x7..x0  the eight state words of this thread, modified in place
 * @param round   round index; only its four lowest bits are used
 */
__device__ __forceinline__ void G256_AddRoundConstantQ_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, int round)
{
    // Complement applies to all four threads of the quad.
    x7 = ~x7;  x6 = ~x6;  x5 = ~x5;  x4 = ~x4;
    x3 = ~x3;  x2 = ~x2;  x1 = ~x1;  x0 = ~x0;

    if ((threadIdx.x & 0x03) != 3)
        return;

    // Fixed patterns, upper half-word only.
    x7 ^= 0xFF000000;
    x6 ^= 0xF0F00000;
    x5 ^= 0xCCCC0000;
    x4 ^= 0xAAAA0000;

    // Round-dependent part: one bit of `round` per word.
    x3 ^= (round & 0x08) ? 0xFFFF0000u : 0u;
    x2 ^= (round & 0x04) ? 0xFFFF0000u : 0u;
    x1 ^= (round & 0x02) ? 0xFFFF0000u : 0u;
    x0 ^= (round & 0x01) ? 0xFFFF0000u : 0u;
}
|
||||
|
||||
/**
 * Round-constant step used by the quad "P" permutation.
 *
 * Only the thread whose index within its group of four is 0 changes anything:
 * it XORs fixed patterns into the lower 16 bits of x4..x7, and 0xFFFF into
 * x0..x3 wherever the corresponding bit (0..3) of `round` is set.
 *
 * @param x7..x0  the eight state words of this thread, modified in place
 * @param round   round index; only its four lowest bits are used
 */
__device__ __forceinline__ void G256_AddRoundConstantP_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, int round)
{
    if ((threadIdx.x & 0x03) != 0)
        return;

    // Fixed patterns, lower half-word only.
    x7 ^= 0xFF00;
    x6 ^= 0xF0F0;
    x5 ^= 0xCCCC;
    x4 ^= 0xAAAA;

    // Round-dependent part: one bit of `round` per word.
    x3 ^= (round & 0x08) ? 0xFFFFu : 0u;
    x2 ^= (round & 0x04) ? 0xFFFFu : 0u;
    x1 ^= (round & 0x02) ? 0xFFFFu : 0u;
    x0 ^= (round & 0x01) ? 0xFFFFu : 0u;
}
|
||||
|
||||
/**
 * Combines the four-word operand x (x3..x0) with the four-word operand y
 * (y3..y0) using only AND / XOR and stores the result back into x.
 * y is read but never written.
 *
 * NOTE(review): per the name this is presumably a bit-sliced GF(16)
 * multiplication -- confirm against the reference implementation.
 */
__device__ __forceinline__ void G16mul_quad(uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0,
                                            uint32_t &y3, uint32_t &y2, uint32_t &y1, uint32_t &y0)
{
    // Sums of the even-indexed and odd-indexed words of each operand.
    const uint32_t xEven = x2 ^ x0, xOdd = x3 ^ x1;
    const uint32_t yEven = y2 ^ y0, yOdd = y3 ^ y1;

    // Terms shared by the upper and lower output pairs.
    const uint32_t mixed  = (xEven ^ xOdd) & (yEven ^ yOdd);
    const uint32_t termLo = (xEven & yEven) ^ mixed;
    const uint32_t termHi = (xOdd & yOdd) ^ mixed ^ termLo;

    // Upper pair (x3, x2).
    uint32_t cross = (x2 ^ x3) & (y2 ^ y3);
    x3 = (x3 & y3) ^ cross ^ termLo;
    x2 = (x2 & y2) ^ cross ^ termHi;

    // Lower pair (x1, x0).
    cross = (x0 ^ x1) & (y0 ^ y1);
    x1 = (x1 & y1) ^ cross ^ termLo;
    x0 = (x0 & y0) ^ cross ^ termHi;
}
|
||||
|
||||
/**
 * In-place transformation of eight words built from three G16mul_quad calls
 * and a block of AND / XOR logic between them.
 *
 * NOTE(review): per the name this is presumably the bit-sliced GF(2^8)
 * inversion at the heart of the S-box -- confirm against the reference
 * implementation.
 *
 * @param x7..x0  eight words, overwritten with the result
 */
__device__ __forceinline__ void G256_inv_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0)
{
    // Product of the upper half (x7..x4) with the lower half (x3..x0).
    uint32_t p3 = x7, p2 = x6, p1 = x5, p0 = x4;
    G16mul_quad(p3, p2, p1, p0, x3, x2, x1, x0);

    // Fold sums of corresponding upper / lower words into the product.
    const uint32_t d40 = x4 ^ x0;
    const uint32_t d51 = x5 ^ x1;
    p0 ^= d40;
    p2 ^= (x7 ^ x3) ^ d51;
    p1 ^= d51 ^ d40;
    p3 ^= (x6 ^ x2) ^ d40;

    // Both sums are taken before any p word is overwritten below.
    const uint32_t lowSum  = p0 ^ p1;
    const uint32_t highSum = p2 ^ p3;

    uint32_t m = highSum & lowSum;
    const uint32_t c  = m ^ p3 ^ p1;
    const uint32_t u  = (p3 & p1) ^ c;
    const uint32_t v  = (p2 & p0) ^ c ^ (p2 ^ p0);
    const uint32_t uv = u ^ v;

    m  = uv & lowSum;
    p1 = (v & p1) ^ m;
    p0 = (u & p0) ^ m;

    m  = uv & highSum;
    p3 = (v & p3) ^ m;
    p2 = (u & p2) ^ m;

    // Apply the resulting four-word factor to both halves of the input.
    G16mul_quad(x3, x2, x1, x0, p1, p0, p3, p2);
    G16mul_quad(x7, x6, x5, x4, p1, p0, p3, p2);
}
|
||||
|
||||
/**
 * XOR-only (linear) transform of eight words, applied in place. x0 is left
 * unchanged; every other word is replaced by an XOR of input words.
 *
 * The eight references are expected to name distinct objects, as they do at
 * the call site in sbox_quad.
 *
 * The original stored `x2 = t0 ^ t1 ^ x7` twice with none of t0, t1, x7
 * changing in between; the redundant second store has been removed.
 */
__device__ __forceinline__ void transAtoX_quad(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &x4, uint32_t &x5, uint32_t &x6, uint32_t &x7)
{
    // Both temporaries are built from inputs that are still unmodified here.
    const uint32_t t0 = x0 ^ x1 ^ x2;
    const uint32_t t1 = x5 ^ x6;

    // Order matters: each word is overwritten only after its last use as input.
    x2 = t0 ^ t1 ^ x7;
    x6 = t0 ^ x3 ^ x6;
    x3 = x0 ^ x1 ^ x3 ^ x4 ^ x7;
    x4 = x0 ^ x4 ^ t1;
    x1 = x0 ^ x1 ^ t1;
    x7 = x0 ^ t1 ^ x7;
    x5 = x0 ^ t1;
}
|
||||
|
||||
/**
 * XOR-only (linear) transform of eight words, applied in place. Every output
 * word is an XOR of input words, written out explicitly below.
 *
 * The eight references are expected to name distinct objects, as they do at
 * the call site in sbox_quad.
 */
__device__ __forceinline__ void transXtoA_quad(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &x4, uint32_t &x5, uint32_t &x6, uint32_t &x7)
{
    // Snapshot the inputs so each output can be written directly.
    const uint32_t a0 = x0, a1 = x1, a2 = x2, a3 = x3;
    const uint32_t a4 = x4, a5 = x5, a6 = x6, a7 = x7;

    const uint32_t s35 = a3 ^ a5;   // shared by four of the outputs

    x0 = s35;
    x1 = a1 ^ a4 ^ a5;
    x2 = s35 ^ a7;
    x3 = a0 ^ a2 ^ s35 ^ a6;
    x4 = a3 ^ a7;
    x5 = a0 ^ a6;
    x6 = a1 ^ a4 ^ a6;
    x7 = a4 ^ s35 ^ a7 ^ a6;
}
|
||||
|
||||
/**
 * Substitution step on the eight words of r, in place: transAtoX_quad, then
 * G256_inv_quad, then transXtoA_quad (the latter two with permuted word
 * order), followed by complementing words 0, 1, 5 and 6.
 *
 * @param r  eight uint32_t words, overwritten with the result
 */
__device__ __forceinline__ void sbox_quad(uint32_t *r)
{
    // Input transform, words in natural order.
    transAtoX_quad(r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7]);

    // Core step; the argument order reflects where the previous transform
    // left each component.
    G256_inv_quad(r[2], r[4], r[1], r[7], r[3], r[0], r[5], r[6]);

    // Output transform, again with its own word permutation.
    transXtoA_quad(r[7], r[1], r[4], r[2], r[6], r[5], r[0], r[3]);

    // Final constant: flip every bit of words 0, 1, 5 and 6.
    r[0] ^= 0xFFFFFFFFu;
    r[1] ^= 0xFFFFFFFFu;
    r[5] ^= 0xFFFFFFFFu;
    r[6] ^= 0xFFFFFFFFu;
}
|
||||
|
||||
// Per-word helper for G256_ShiftBytesP_quad: rotates the lower 16 bits of w
// right by shift1 and the upper 16 bits right by shift2 (both must be <= 16).
// Each half is first duplicated into a full word so that a plain right shift
// acts as a 16-bit rotation; the low halves of the two results are then
// packed back together.
__device__ __forceinline__ uint32_t G256_ShiftBytesP_word_quad(uint32_t w, int shift1, int shift2)
{
    const uint32_t lo = __byte_perm(w, 0, 0x1010) >> shift1;
    const uint32_t hi = __byte_perm(w, 0, 0x3232) >> shift2;
    return __byte_perm(lo, hi, 0x5410);
}

/**
 * Byte-shift step of the quad "P" permutation: every one of the eight words
 * gets the same pair of 16-bit rotations, whose amounts depend on the
 * thread's position within its group of four.
 *
 *   position   lower half   upper half
 *      0           0            1
 *      1           2            3
 *      2           4            5
 *      3           6           11
 *
 * @param x7..x0  the eight state words of this thread, modified in place
 */
__device__ __forceinline__ void G256_ShiftBytesP_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0)
{
    const int lane  = threadIdx.x & 0x03;
    const int rotLo = lane << 1;
    const int rotHi = rotLo + 1 + ((lane == 3) << 2);

    x0 = G256_ShiftBytesP_word_quad(x0, rotLo, rotHi);
    x1 = G256_ShiftBytesP_word_quad(x1, rotLo, rotHi);
    x2 = G256_ShiftBytesP_word_quad(x2, rotLo, rotHi);
    x3 = G256_ShiftBytesP_word_quad(x3, rotLo, rotHi);
    x4 = G256_ShiftBytesP_word_quad(x4, rotLo, rotHi);
    x5 = G256_ShiftBytesP_word_quad(x5, rotLo, rotHi);
    x6 = G256_ShiftBytesP_word_quad(x6, rotLo, rotHi);
    x7 = G256_ShiftBytesP_word_quad(x7, rotLo, rotHi);
}
|
||||
|
||||
// Per-word helper for G256_ShiftBytesQ_quad: rotates the lower 16 bits of w
// right by shift1 and the upper 16 bits right by shift2 (both must be <= 16).
// Each half is first duplicated into a full word so that a plain right shift
// acts as a 16-bit rotation; the low halves of the two results are then
// packed back together.
__device__ __forceinline__ uint32_t G256_ShiftBytesQ_word_quad(uint32_t w, int shift1, int shift2)
{
    const uint32_t lo = __byte_perm(w, 0, 0x1010) >> shift1;
    const uint32_t hi = __byte_perm(w, 0, 0x3232) >> shift2;
    return __byte_perm(lo, hi, 0x5410);
}

/**
 * Byte-shift step of the quad "Q" permutation: every one of the eight words
 * gets the same pair of 16-bit rotations, whose amounts depend on the
 * thread's position within its group of four.
 *
 *   position   lower half   upper half
 *      0           1            3
 *      1           5           11
 *      2           0            2
 *      3           4            6
 *
 * @param x7..x0  the eight state words of this thread, modified in place
 */
__device__ __forceinline__ void G256_ShiftBytesQ_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0)
{
    const int lane  = threadIdx.x & 0x03;
    const int rotLo = (1 - (lane >> 1)) + ((lane & 0x01) << 2);
    const int rotHi = rotLo + 2 + ((lane == 1) << 2);

    x0 = G256_ShiftBytesQ_word_quad(x0, rotLo, rotHi);
    x1 = G256_ShiftBytesQ_word_quad(x1, rotLo, rotHi);
    x2 = G256_ShiftBytesQ_word_quad(x2, rotLo, rotHi);
    x3 = G256_ShiftBytesQ_word_quad(x3, rotLo, rotHi);
    x4 = G256_ShiftBytesQ_word_quad(x4, rotLo, rotHi);
    x5 = G256_ShiftBytesQ_word_quad(x5, rotLo, rotHi);
    x6 = G256_ShiftBytesQ_word_quad(x6, rotLo, rotHi);
    x7 = G256_ShiftBytesQ_word_quad(x7, rotLo, rotHi);
}
|
||||
|
||||
/**
 * Mixing step of the quad permutations. The eight words in r are replaced by
 * XOR combinations of words fetched from all four threads of the quad,
 * interleaved with two G256_Mul2 passes.
 *
 * Data is exchanged between the threads with warp shuffles of width 4, so the
 * four threads of a quad must sit in consecutive lanes and all of them must
 * execute this function together.
 *
 * NOTE(review): `__shfl` is the legacy, mask-less shuffle. It needs compute
 * capability 3.0 or newer and is not available when building for Volta and
 * later, where `__shfl_sync` is required -- confirm the target architectures.
 *
 * The helper macros are local to this function. The original left
 * DOUBLE_ODD, DOUBLE_EVEN, SINGLE_ODD and SINGLE_EVEN defined after the
 * function; they are now removed again together with the others.
 *
 * @param r  eight uint32_t words of this thread, overwritten with the result
 */
__device__ __forceinline__ void G256_MixFunction_quad(uint32_t *r)
{
// Lower 32 bits of the 64-bit value (hi:lo) shifted right by 16.
#define SHIFT64_16(hi, lo)   __byte_perm(lo, hi, 0x5432)
// Word v as held by the quad member u positions after this thread (mod 4).
#define A(v, u)              __shfl((int)r[v], ((threadIdx.x+u)&0x03), 4)
// Word idx of members l and l+1, combined and shifted by 16 bits.
#define S(idx, l)            SHIFT64_16( A(idx, (l+1)), A(idx, l) )

#define DOUBLE_ODD(i, bc)    ( S(i, (bc)) ^ A(i, (bc) + 1) )
#define DOUBLE_EVEN(i, bc)   ( S(i, (bc)) ^ A(i, (bc)    ) )

#define SINGLE_ODD(i, bc)    ( S(i, (bc)) )
#define SINGLE_EVEN(i, bc)   ( A(i, (bc)) )

    uint32_t b[8];

#pragma unroll 8
    for(int i=0;i<8;i++)
        b[i] = DOUBLE_ODD(i, 1) ^ DOUBLE_EVEN(i, 3);

    G256_Mul2(b);

#pragma unroll 8
    for(int i=0;i<8;i++)
        b[i] = b[i] ^ DOUBLE_ODD(i, 3) ^ DOUBLE_ODD(i, 4) ^ SINGLE_ODD(i, 6);

    G256_Mul2(b);

    // r is only overwritten here; every shuffle of this loop iteration reads
    // r[i] before the assignment to r[i] takes place.
#pragma unroll 8
    for(int i=0;i<8;i++)
        r[i] = b[i] ^ DOUBLE_EVEN(i, 2) ^ DOUBLE_EVEN(i, 3) ^ SINGLE_EVEN(i, 5);

#undef SINGLE_EVEN
#undef SINGLE_ODD
#undef DOUBLE_EVEN
#undef DOUBLE_ODD
#undef S
#undef A
#undef SHIFT64_16
// `t` and `X` are not defined in this function; the two #undefs below are
// kept from the original so that the preprocessor state seen by later code
// stays exactly as before.
#undef t
#undef X
}
|
||||
|
||||
/**
 * Quad variant of the "P" permutation: 14 rounds, each consisting of the
 * P round-constant addition, the S-box, the P byte shift and the mix step.
 *
 * @param r  the eight state words held by this thread, modified in place
 */
__device__ __forceinline__ void groestl512_perm_P_quad(uint32_t *r)
{
    int round = 0;
    while (round < 14)
    {
        G256_AddRoundConstantP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], round);
        sbox_quad(r);
        G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]);
        G256_MixFunction_quad(r);
        ++round;
    }
}
|
||||
|
||||
/**
 * Quad variant of the "Q" permutation: 14 rounds, each consisting of the
 * Q round-constant addition, the S-box, the Q byte shift and the mix step.
 *
 * @param r  the eight state words held by this thread, modified in place
 */
__device__ __forceinline__ void groestl512_perm_Q_quad(uint32_t *r)
{
    int round = 0;
    while (round < 14)
    {
        G256_AddRoundConstantQ_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], round);
        sbox_quad(r);
        G256_ShiftBytesQ_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]);
        G256_MixFunction_quad(r);
        ++round;
    }
}
|
||||
|
||||
/**
 * Processes one message block held in the quad layout and leaves the result
 * in `state`.
 *
 * Steps:
 *   1. state = message, with 0x00008000 XORed into state[1] on the thread at
 *      quad position 3
 *   2. state = P(state); the same XOR is applied again afterwards
 *   3. message = Q(message)
 *   4. state ^= message
 *   5. message = P(copy of state); state ^= message
 *
 * @param state    eight words per thread, receives the result
 * @param message  eight words per thread; used as scratch space and
 *                 overwritten
 */
__device__ __forceinline__ void groestl512_progressMessage_quad(uint32_t *state, uint32_t *message)
{
    const bool lastOfQuad = (threadIdx.x & 0x03) == 3;

#pragma unroll 8
    for (int i = 0; i < 8; ++i)
        state[i] = message[i];

    if (lastOfQuad)
        state[1] ^= 0x00008000;
    groestl512_perm_P_quad(state);
    if (lastOfQuad)
        state[1] ^= 0x00008000;

    groestl512_perm_Q_quad(message);

    // Combine both permutation outputs and keep a copy for the final pass.
#pragma unroll 8
    for (int i = 0; i < 8; ++i)
    {
        state[i] ^= message[i];
        message[i] = state[i];
    }

    groestl512_perm_P_quad(message);

#pragma unroll 8
    for (int i = 0; i < 8; ++i)
        state[i] ^= message[i];
}
|
209
groestlcoin.cpp
209
groestlcoin.cpp
@ -15,163 +15,118 @@
|
||||
|
||||
void sha256func(unsigned char *hash, const unsigned char *data, int len)
|
||||
{
|
||||
uint32_t S[16], T[16];
|
||||
int i, r;
|
||||
uint32_t S[16], T[16];
|
||||
int i, r;
|
||||
|
||||
sha256_init(S);
|
||||
for (r = len; r > -9; r -= 64) {
|
||||
if (r < 64)
|
||||
memset(T, 0, 64);
|
||||
memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r));
|
||||
if (r >= 0 && r < 64)
|
||||
((unsigned char *)T)[r] = 0x80;
|
||||
for (i = 0; i < 16; i++)
|
||||
T[i] = be32dec(T + i);
|
||||
if (r < 56)
|
||||
T[15] = 8 * len;
|
||||
sha256_transform(S, T, 0);
|
||||
}
|
||||
/*
|
||||
memcpy(S + 8, sha256d_hash1 + 8, 32);
|
||||
sha256_init(T);
|
||||
sha256_transform(T, S, 0);
|
||||
*/
|
||||
for (i = 0; i < 8; i++)
|
||||
be32enc((uint32_t *)hash + i, T[i]);
|
||||
sha256_init(S);
|
||||
for (r = len; r > -9; r -= 64) {
|
||||
if (r < 64)
|
||||
memset(T, 0, 64);
|
||||
memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r));
|
||||
if (r >= 0 && r < 64)
|
||||
((unsigned char *)T)[r] = 0x80;
|
||||
for (i = 0; i < 16; i++)
|
||||
T[i] = be32dec(T + i);
|
||||
if (r < 56)
|
||||
T[15] = 8 * len;
|
||||
sha256_transform(S, T, 0);
|
||||
}
|
||||
/*
|
||||
memcpy(S + 8, sha256d_hash1 + 8, 32);
|
||||
sha256_init(T);
|
||||
sha256_transform(T, S, 0);
|
||||
*/
|
||||
for (i = 0; i < 8; i++)
|
||||
be32enc((uint32_t *)hash + i, T[i]);
|
||||
}
|
||||
|
||||
static void groestlhash(void *state, const void *input)
|
||||
{
|
||||
// Tryout GPU-groestl
|
||||
// Tryout GPU-groestl
|
||||
|
||||
sph_groestl512_context ctx_groestl[2];
|
||||
static unsigned char pblank[1];
|
||||
int ii;
|
||||
uint32_t mask = 8;
|
||||
uint32_t zero = 0;
|
||||
|
||||
|
||||
//these uint512 in the c++ source of the client are backed by an array of uint32
|
||||
uint32_t hashA[16], hashB[16];
|
||||
//these uint512 in the c++ source of the client are backed by an array of uint32
|
||||
uint32_t hashA[16], hashB[16];
|
||||
|
||||
|
||||
sph_groestl512_init(&ctx_groestl[0]);
|
||||
sph_groestl512 (&ctx_groestl[0], input, 80); //6
|
||||
sph_groestl512_close(&ctx_groestl[0], hashA); //7
|
||||
sph_groestl512_close(&ctx_groestl[0], hashA); //7
|
||||
|
||||
sph_groestl512_init(&ctx_groestl[1]);
|
||||
sph_groestl512 (&ctx_groestl[1], hashA, 64); //6
|
||||
sph_groestl512_init(&ctx_groestl[1]);
|
||||
sph_groestl512 (&ctx_groestl[1], hashA, 64); //6
|
||||
sph_groestl512_close(&ctx_groestl[1], hashB); //7
|
||||
|
||||
memcpy(state, hashB, 32);
|
||||
memcpy(state, hashB, 32);
|
||||
}
|
||||
|
||||
|
||||
extern bool opt_benchmark;
|
||||
|
||||
extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
|
||||
uint32_t max_nonce, unsigned long *hashes_done)
|
||||
{
|
||||
uint32_t start_nonce = pdata[19]++;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t throughPut = 4096 * 128;
|
||||
//const uint32_t throughPut = 1;
|
||||
int i;
|
||||
uint32_t *outputHash = (uint32_t*)malloc(throughPut * 16 * sizeof(uint32_t));
|
||||
uint32_t max_nonce, unsigned long *hashes_done)
|
||||
{
|
||||
if (opt_benchmark)
|
||||
((uint32_t*)ptarget)[7] = 0x000000ff;
|
||||
|
||||
// init
|
||||
static bool init[8] = { false, false, false, false, false, false, false, false };
|
||||
if(!init[thr_id])
|
||||
{
|
||||
groestlcoin_cpu_init(thr_id, throughPut);
|
||||
init[thr_id] = true;
|
||||
}
|
||||
|
||||
// Endian Drehung ist notwendig
|
||||
//char testdata[] = {"\x70\x00\x00\x00\x5d\x38\x5b\xa1\x14\xd0\x79\x97\x0b\x29\xa9\x41\x8f\xd0\x54\x9e\x7d\x68\xa9\x5c\x7f\x16\x86\x21\xa3\x14\x20\x10\x00\x00\x00\x00\x57\x85\x86\xd1\x49\xfd\x07\xb2\x2f\x3a\x8a\x34\x7c\x51\x6d\xe7\x05\x2f\x03\x4d\x2b\x76\xff\x68\xe0\xd6\xec\xff\x9b\x77\xa4\x54\x89\xe3\xfd\x51\x17\x32\x01\x1d\xf0\x73\x10\x00"};
|
||||
//pdata = (uint32_t*)testdata;
|
||||
uint32_t endiandata[32];
|
||||
for (int kk=0; kk < 32; kk++)
|
||||
be32enc(&endiandata[kk], pdata[kk]);
|
||||
uint32_t start_nonce = pdata[19]++;
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
const uint32_t throughPut = 4096 * 128;
|
||||
//const uint32_t throughPut = 1;
|
||||
uint32_t *outputHash = (uint32_t*)malloc(throughPut * 16 * sizeof(uint32_t));
|
||||
|
||||
// Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt)
|
||||
groestlcoin_cpu_setBlock(thr_id, endiandata, (void*)ptarget);
|
||||
|
||||
do {
|
||||
// GPU
|
||||
uint32_t foundNounce = 0xFFFFFFFF;
|
||||
// init
|
||||
static bool init[8] = { false, false, false, false, false, false, false, false };
|
||||
if(!init[thr_id])
|
||||
{
|
||||
groestlcoin_cpu_init(thr_id, throughPut);
|
||||
init[thr_id] = true;
|
||||
}
|
||||
|
||||
// Endian Drehung ist notwendig
|
||||
uint32_t endiandata[32];
|
||||
for (int kk=0; kk < 32; kk++)
|
||||
be32enc(&endiandata[kk], pdata[kk]);
|
||||
|
||||
groestlcoin_cpu_hash(thr_id, throughPut, pdata[19], outputHash, &foundNounce);
|
||||
// Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt)
|
||||
groestlcoin_cpu_setBlock(thr_id, endiandata, (void*)ptarget);
|
||||
|
||||
do {
|
||||
// GPU
|
||||
uint32_t foundNounce = 0xFFFFFFFF;
|
||||
|
||||
/*
|
||||
{
|
||||
for(i=0;i<throughPut;i++)
|
||||
{
|
||||
uint32_t tmpHash[8];
|
||||
endiandata[19] = SWAP32(pdata[19]);
|
||||
groestlhash(tmpHash, endiandata);
|
||||
|
||||
int ii;
|
||||
printf("result GPU: ");
|
||||
for (ii=0; ii < 32; ii++)
|
||||
{
|
||||
printf ("%.2x",((uint8_t*)&outputHash[8*i])[ii]);
|
||||
};
|
||||
printf ("\n");
|
||||
|
||||
groestlcoin_cpu_hash(thr_id, throughPut, pdata[19], outputHash, &foundNounce);
|
||||
|
||||
groestlhash(tmpHash, endiandata);
|
||||
printf("result CPU: ");
|
||||
for (ii=0; ii < 32; ii++)
|
||||
{
|
||||
printf ("%.2x",((uint8_t*)tmpHash)[ii]);
|
||||
};
|
||||
|
||||
|
||||
}
|
||||
exit(0);
|
||||
}
|
||||
*/
|
||||
if(foundNounce < 0xffffffff)
|
||||
{
|
||||
uint32_t tmpHash[8];
|
||||
endiandata[19] = SWAP32(foundNounce);
|
||||
groestlhash(tmpHash, endiandata);
|
||||
if (tmpHash[7] <= Htarg &&
|
||||
fulltest(tmpHash, ptarget)) {
|
||||
pdata[19] = foundNounce;
|
||||
*hashes_done = foundNounce - start_nonce;
|
||||
free(outputHash);
|
||||
return true;
|
||||
} else {
|
||||
applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNounce);
|
||||
}
|
||||
if(foundNounce < 0xffffffff)
|
||||
{
|
||||
uint32_t tmpHash[8];
|
||||
endiandata[19] = SWAP32(foundNounce);
|
||||
groestlhash(tmpHash, endiandata);
|
||||
if (tmpHash[7] <= Htarg &&
|
||||
fulltest(tmpHash, ptarget)) {
|
||||
pdata[19] = foundNounce;
|
||||
*hashes_done = foundNounce - start_nonce;
|
||||
free(outputHash);
|
||||
return true;
|
||||
} else {
|
||||
applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNounce);
|
||||
}
|
||||
|
||||
foundNounce = 0xffffffff;
|
||||
/*
|
||||
int ii;
|
||||
printf("result GPU: ");
|
||||
for (ii=0; ii < 32; ii++)
|
||||
{
|
||||
printf ("%.2x",((uint8_t*)&outputHash[0])[ii]);
|
||||
};
|
||||
printf ("\n");
|
||||
printf("result CPU: ");
|
||||
for (ii=0; ii < 32; ii++)
|
||||
{
|
||||
printf ("%.2x",((uint8_t*)tmpHash)[ii]);
|
||||
};
|
||||
printf ("\n");
|
||||
*/
|
||||
}
|
||||
foundNounce = 0xffffffff;
|
||||
}
|
||||
|
||||
if (pdata[19] + throughPut < pdata[19])
|
||||
pdata[19] = max_nonce;
|
||||
else pdata[19] += throughPut;
|
||||
if (pdata[19] + throughPut < pdata[19])
|
||||
pdata[19] = max_nonce;
|
||||
else pdata[19] += throughPut;
|
||||
|
||||
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = pdata[19] - start_nonce;
|
||||
free(outputHash);
|
||||
return 0;
|
||||
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
|
||||
|
||||
*hashes_done = pdata[19] - start_nonce;
|
||||
free(outputHash);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -269,8 +269,6 @@ __host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 0;
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
|
||||
if (BLOCKSIZE == 80)
|
||||
blake512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
|
||||
else if (BLOCKSIZE == 84)
|
||||
|
@ -141,8 +141,6 @@ void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *h
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 0;
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
|
||||
combine_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], d_nonceVector[thr_id]);
|
||||
|
||||
// da die Hash Auswertung noch auf der CPU erfolgt, müssen die Ergebnisse auf jeden Fall zum Host kopiert werden
|
||||
|
@ -824,8 +824,6 @@ __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 0;
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
|
||||
if (BLOCKSIZE == 84)
|
||||
groestl512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
|
||||
else if (BLOCKSIZE == 80)
|
||||
|
@ -416,8 +416,6 @@ __host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
|
||||
size_t shared_size = 0;
|
||||
#endif
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
|
||||
hefty_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (void*)d_heftyHashes[thr_id]);
|
||||
|
||||
// Strategisches Sleep Kommando zur Senkung der CPU Last
|
||||
|
@ -279,7 +279,6 @@ __host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 0;
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
if (BLOCKSIZE==84)
|
||||
keccak512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
|
||||
else if (BLOCKSIZE==80)
|
||||
|
@ -271,7 +271,6 @@ __host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce)
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 0;
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
if (BLOCKSIZE == 84)
|
||||
sha256_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
|
||||
else if (BLOCKSIZE == 80) {
|
||||
|
@ -35,17 +35,19 @@ static void myriadhash(void *state, const void *input)
|
||||
memcpy(state, hashB, 32);
|
||||
}
|
||||
|
||||
|
||||
extern bool opt_benchmark;
|
||||
|
||||
extern "C" int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
|
||||
uint32_t max_nonce, unsigned long *hashes_done)
|
||||
{
|
||||
uint32_t start_nonce = pdata[19]++;
|
||||
if (opt_benchmark)
|
||||
((uint32_t*)ptarget)[7] = 0x000000ff;
|
||||
|
||||
uint32_t start_nonce = pdata[19]++;
|
||||
const uint32_t throughPut = 128 * 1024;
|
||||
// const uint32_t throughPut = 1;
|
||||
|
||||
uint32_t *outputHash = (uint32_t*)malloc(throughPut * 16 * sizeof(uint32_t));
|
||||
|
||||
// TODO: entfernen für eine Release! Ist nur zum Testen!
|
||||
if (opt_benchmark)
|
||||
((uint32_t*)ptarget)[7] = 0x0000ff;
|
||||
|
||||
|
@ -175,7 +175,6 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
|
||||
{
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
// TODO: entfernen für eine Release! Ist nur zum Testen!
|
||||
if (opt_benchmark)
|
||||
((uint32_t*)ptarget)[7] = 0x00000f;
|
||||
|
||||
|
@ -447,8 +447,6 @@ __host__ void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNo
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 0;
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
|
||||
quark_bmw512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
||||
MyStreamSynchronize(NULL, order, thr_id);
|
||||
}
|
||||
@ -464,8 +462,6 @@ __host__ void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNo
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 0;
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
|
||||
quark_bmw512_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash);
|
||||
MyStreamSynchronize(NULL, order, thr_id);
|
||||
}
|
||||
|
@ -350,8 +350,6 @@ __host__ void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNou
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 0;
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
|
||||
quark_jh512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
||||
MyStreamSynchronize(NULL, order, thr_id);
|
||||
}
|
||||
|
@ -406,8 +406,6 @@ __host__ void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t start
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 0;
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
|
||||
quark_blake512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_nonceVector, (uint64_t*)d_outputHash);
|
||||
|
||||
// Strategisches Sleep Kommando zur Senkung der CPU Last
|
||||
@ -425,8 +423,6 @@ __host__ void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t start
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 0;
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
|
||||
quark_blake512_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);
|
||||
|
||||
// Strategisches Sleep Kommando zur Senkung der CPU Last
|
||||
|
@ -89,8 +89,6 @@ __host__ uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t star
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 0;
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
|
||||
quark_check_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);
|
||||
|
||||
// Strategisches Sleep Kommando zur Senkung der CPU Last
|
||||
|
@ -1,4 +1,4 @@
|
||||
// Auf QuarkCoin spezialisierte Version von Groestl
|
||||
// Auf QuarkCoin spezialisierte Version von Groestl inkl. Bitslice
|
||||
|
||||
#include <cuda.h>
|
||||
#include "cuda_runtime.h"
|
||||
@ -7,9 +7,6 @@
|
||||
#include <stdio.h>
|
||||
#include <memory.h>
|
||||
|
||||
// it's unfortunate that this is a compile time constant.
|
||||
#define MAXWELL_OR_FERMI 1
|
||||
|
||||
// aus cpu-miner.c
|
||||
extern int device_map[8];
|
||||
|
||||
@ -18,353 +15,137 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
|
||||
|
||||
// Folgende Definitionen später durch header ersetzen
|
||||
typedef unsigned char uint8_t;
|
||||
typedef unsigned short uint16_t;
|
||||
typedef unsigned int uint32_t;
|
||||
typedef unsigned long long uint64_t;
|
||||
|
||||
// diese Struktur wird in der Init Funktion angefordert
|
||||
static cudaDeviceProp props[8];
|
||||
|
||||
#define SPH_C32(x) ((uint32_t)(x ## U))
|
||||
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
|
||||
// 64 Register Variante für Compute 3.0
|
||||
#include "groestl_functions_quad.cu"
|
||||
#include "bitslice_transformations_quad.cu"
|
||||
|
||||
#define PC32up(j, r) ((uint32_t)((j) + (r)))
|
||||
#define PC32dn(j, r) 0
|
||||
#define QC32up(j, r) 0xFFFFFFFF
|
||||
#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))
|
||||
|
||||
#define B32_0(x) __byte_perm(x, 0, 0x4440)
|
||||
//((x) & 0xFF)
|
||||
#define B32_1(x) __byte_perm(x, 0, 0x4441)
|
||||
//(((x) >> 8) & 0xFF)
|
||||
#define B32_2(x) __byte_perm(x, 0, 0x4442)
|
||||
//(((x) >> 16) & 0xFF)
|
||||
#define B32_3(x) __byte_perm(x, 0, 0x4443)
|
||||
//((x) >> 24)
|
||||
|
||||
#if MAXWELL_OR_FERMI
|
||||
#define USE_SHARED 1
|
||||
// Maxwell and Fermi cards get the best speed with SHARED access it seems.
|
||||
#if USE_SHARED
|
||||
#define T0up(x) (*((uint32_t*)mixtabs + ( (x))))
|
||||
#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x))))
|
||||
#define T1up(x) (*((uint32_t*)mixtabs + (512+(x))))
|
||||
#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
|
||||
#define T2up(x) (*((uint32_t*)mixtabs + (1024+(x))))
|
||||
#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
|
||||
#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
|
||||
#define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x))))
|
||||
#else
|
||||
#define T0up(x) tex1Dfetch(t0up1, x)
|
||||
#define T0dn(x) tex1Dfetch(t0dn1, x)
|
||||
#define T1up(x) tex1Dfetch(t1up1, x)
|
||||
#define T1dn(x) tex1Dfetch(t1dn1, x)
|
||||
#define T2up(x) tex1Dfetch(t2up1, x)
|
||||
#define T2dn(x) tex1Dfetch(t2dn1, x)
|
||||
#define T3up(x) tex1Dfetch(t3up1, x)
|
||||
#define T3dn(x) tex1Dfetch(t3dn1, x)
|
||||
#endif
|
||||
#else
|
||||
#define USE_SHARED 1
|
||||
// a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5!
|
||||
#define T0up(x) (*((uint32_t*)mixtabs + ( (x))))
|
||||
#define T0dn(x) tex1Dfetch(t0dn1, x)
|
||||
#define T1up(x) tex1Dfetch(t1up1, x)
|
||||
#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
|
||||
#define T2up(x) tex1Dfetch(t2up1, x)
|
||||
#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
|
||||
#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
|
||||
#define T3dn(x) tex1Dfetch(t3dn1, x)
|
||||
#endif
|
||||
|
||||
texture<unsigned int, 1, cudaReadModeElementType> t0up1;
|
||||
texture<unsigned int, 1, cudaReadModeElementType> t0dn1;
|
||||
texture<unsigned int, 1, cudaReadModeElementType> t1up1;
|
||||
texture<unsigned int, 1, cudaReadModeElementType> t1dn1;
|
||||
texture<unsigned int, 1, cudaReadModeElementType> t2up1;
|
||||
texture<unsigned int, 1, cudaReadModeElementType> t2dn1;
|
||||
texture<unsigned int, 1, cudaReadModeElementType> t3up1;
|
||||
texture<unsigned int, 1, cudaReadModeElementType> t3dn1;
|
||||
|
||||
extern uint32_t T0up_cpu[];
|
||||
extern uint32_t T0dn_cpu[];
|
||||
extern uint32_t T1up_cpu[];
|
||||
extern uint32_t T1dn_cpu[];
|
||||
extern uint32_t T2up_cpu[];
|
||||
extern uint32_t T2dn_cpu[];
|
||||
extern uint32_t T3up_cpu[];
|
||||
extern uint32_t T3dn_cpu[];
|
||||
|
||||
__device__ __forceinline__ void quark_groestl512_perm_P(uint32_t *a, char *mixtabs)
|
||||
__global__ void __launch_bounds__(256, 4)
|
||||
quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
|
||||
{
|
||||
uint32_t t[32];
|
||||
|
||||
//#pragma unroll 14
|
||||
for(int r=0;r<14;r++)
|
||||
{
|
||||
switch(r)
|
||||
{
|
||||
case 0:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 0); break;
|
||||
case 1:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 1); break;
|
||||
case 2:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 2); break;
|
||||
case 3:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 3); break;
|
||||
case 4:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 4); break;
|
||||
case 5:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 5); break;
|
||||
case 6:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 6); break;
|
||||
case 7:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 7); break;
|
||||
case 8:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 8); break;
|
||||
case 9:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 9); break;
|
||||
case 10:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 10); break;
|
||||
case 11:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 11); break;
|
||||
case 12:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 12); break;
|
||||
case 13:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 13); break;
|
||||
}
|
||||
|
||||
// RBTT
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<32;k+=2)
|
||||
{
|
||||
uint32_t t0_0 = B32_0(a[(k ) & 0x1f]), t9_0 = B32_0(a[(k + 9) & 0x1f]);
|
||||
uint32_t t2_1 = B32_1(a[(k + 2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]);
|
||||
uint32_t t4_2 = B32_2(a[(k + 4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]);
|
||||
uint32_t t6_3 = B32_3(a[(k + 6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]);
|
||||
|
||||
t[k + 0] = T0up( t0_0 ) ^ T1up( t2_1 ) ^ T2up( t4_2 ) ^ T3up( t6_3 ) ^
|
||||
T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 );
|
||||
|
||||
t[k + 1] = T0dn( t0_0 ) ^ T1dn( t2_1 ) ^ T2dn( t4_2 ) ^ T3dn( t6_3 ) ^
|
||||
T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 );
|
||||
}
|
||||
#pragma unroll 32
|
||||
for(int k=0;k<32;k++)
|
||||
a[k] = t[k];
|
||||
}
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void quark_groestl512_perm_Q(uint32_t *a, char *mixtabs)
|
||||
{
|
||||
//#pragma unroll 14
|
||||
for(int r=0;r<14;r++)
|
||||
{
|
||||
uint32_t t[32];
|
||||
|
||||
switch(r)
|
||||
{
|
||||
case 0:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 0); a[(k*2)+1] ^= QC32dn(k<< 4, 0);} break;
|
||||
case 1:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 1); a[(k*2)+1] ^= QC32dn(k<< 4, 1);} break;
|
||||
case 2:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 2); a[(k*2)+1] ^= QC32dn(k<< 4, 2);} break;
|
||||
case 3:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 3); a[(k*2)+1] ^= QC32dn(k<< 4, 3);} break;
|
||||
case 4:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 4); a[(k*2)+1] ^= QC32dn(k<< 4, 4);} break;
|
||||
case 5:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 5); a[(k*2)+1] ^= QC32dn(k<< 4, 5);} break;
|
||||
case 6:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 6); a[(k*2)+1] ^= QC32dn(k<< 4, 6);} break;
|
||||
case 7:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 7); a[(k*2)+1] ^= QC32dn(k<< 4, 7);} break;
|
||||
case 8:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 8); a[(k*2)+1] ^= QC32dn(k<< 4, 8);} break;
|
||||
case 9:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 9); a[(k*2)+1] ^= QC32dn(k<< 4, 9);} break;
|
||||
case 10:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 10); a[(k*2)+1] ^= QC32dn(k<< 4, 10);} break;
|
||||
case 11:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 11); a[(k*2)+1] ^= QC32dn(k<< 4, 11);} break;
|
||||
case 12:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 12); a[(k*2)+1] ^= QC32dn(k<< 4, 12);} break;
|
||||
case 13:
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 13); a[(k*2)+1] ^= QC32dn(k<< 4, 13);} break;
|
||||
}
|
||||
|
||||
// RBTT
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<32;k+=2)
|
||||
{
|
||||
uint32_t t2_0 = B32_0(a[(k + 2) & 0x1f]), t1_0 = B32_0(a[(k + 1) & 0x1f]);
|
||||
uint32_t t6_1 = B32_1(a[(k + 6) & 0x1f]), t5_1 = B32_1(a[(k + 5) & 0x1f]);
|
||||
uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2 = B32_2(a[(k + 9) & 0x1f]);
|
||||
uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]);
|
||||
|
||||
t[k + 0] = T0up( t2_0 ) ^ T1up( t6_1 ) ^ T2up( t10_2 ) ^ T3up( t22_3 ) ^
|
||||
T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn( t9_2 ) ^ T3dn( t13_3 );
|
||||
|
||||
t[k + 1] = T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^
|
||||
T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up( t9_2 ) ^ T3up( t13_3 );
|
||||
}
|
||||
#pragma unroll 32
|
||||
for(int k=0;k<32;k++)
|
||||
a[k] = t[k];
|
||||
}
|
||||
}
|
||||
__global__ void quark_groestl512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
|
||||
{
|
||||
#if USE_SHARED
|
||||
extern __shared__ char mixtabs[];
|
||||
|
||||
if (threadIdx.x < 256)
|
||||
{
|
||||
*((uint32_t*)mixtabs + ( threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x);
|
||||
*((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x);
|
||||
*((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x);
|
||||
*((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x);
|
||||
*((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x);
|
||||
*((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x);
|
||||
*((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x);
|
||||
*((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
#endif
|
||||
|
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
// durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
|
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
|
||||
if (thread < threads)
|
||||
{
|
||||
// GROESTL
|
||||
uint32_t message[32];
|
||||
uint32_t state[32];
|
||||
uint32_t message[8];
|
||||
uint32_t state[8];
|
||||
|
||||
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
|
||||
|
||||
int hashPosition = nounce - startNounce;
|
||||
uint32_t *inpHash = &g_hash[16 * hashPosition];
|
||||
uint32_t *inpHash = &g_hash[hashPosition<<4];
|
||||
|
||||
#pragma unroll 4
|
||||
for(int k=0;k<4;k++) message[k] = inpHash[(k<<2) + (threadIdx.x&0x03)];
|
||||
#pragma unroll 4
|
||||
for(int k=4;k<8;k++) message[k] = 0;
|
||||
|
||||
if ((threadIdx.x&0x03) == 0) message[4] = 0x80;
|
||||
if ((threadIdx.x&0x03) == 3) message[7] = 0x01000000;
|
||||
|
||||
uint32_t msgBitsliced[8];
|
||||
to_bitslice_quad(message, msgBitsliced);
|
||||
|
||||
groestl512_progressMessage_quad(state, msgBitsliced);
|
||||
|
||||
// Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash
|
||||
uint32_t *outpHash = &g_hash[hashPosition<<4];
|
||||
uint32_t hash[16];
|
||||
from_bitslice_quad(state, hash);
|
||||
|
||||
if ((threadIdx.x & 0x03) == 0)
|
||||
{
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) message[k] = inpHash[k];
|
||||
#pragma unroll 14
|
||||
for(int k=1;k<15;k++)
|
||||
message[k+16] = 0;
|
||||
|
||||
message[16] = 0x80;
|
||||
message[31] = 0x01000000;
|
||||
|
||||
#pragma unroll 32
|
||||
for(int u=0;u<32;u++) state[u] = message[u];
|
||||
state[31] ^= 0x20000;
|
||||
|
||||
// Perm
|
||||
#if USE_SHARED
|
||||
quark_groestl512_perm_P(state, mixtabs);
|
||||
state[31] ^= 0x20000;
|
||||
quark_groestl512_perm_Q(message, mixtabs);
|
||||
#else
|
||||
quark_groestl512_perm_P(state, NULL);
|
||||
state[31] ^= 0x20000;
|
||||
quark_groestl512_perm_Q(message, NULL);
|
||||
#endif
|
||||
#pragma unroll 32
|
||||
for(int u=0;u<32;u++) state[u] ^= message[u];
|
||||
|
||||
#pragma unroll 32
|
||||
for(int u=0;u<32;u++) message[u] = state[u];
|
||||
|
||||
#if USE_SHARED
|
||||
quark_groestl512_perm_P(message, mixtabs);
|
||||
#else
|
||||
quark_groestl512_perm_P(message, NULL);
|
||||
#endif
|
||||
|
||||
#pragma unroll 32
|
||||
for(int u=0;u<32;u++) state[u] ^= message[u];
|
||||
// Erzeugten Hash rausschreiben
|
||||
|
||||
uint32_t *outpHash = &g_hash[16 * hashPosition];
|
||||
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) outpHash[k] = state[k+16];
|
||||
for(int k=0;k<16;k++) outpHash[k] = hash[k];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#define texDef(texname, texmem, texsource, texsize) \
|
||||
unsigned int *texmem; \
|
||||
cudaMalloc(&texmem, texsize); \
|
||||
cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
|
||||
texname.normalized = 0; \
|
||||
texname.filterMode = cudaFilterModePoint; \
|
||||
texname.addressMode[0] = cudaAddressModeClamp; \
|
||||
{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
|
||||
cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
|
||||
__global__ void __launch_bounds__(256, 4)
|
||||
quark_doublegroestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
|
||||
{
|
||||
int thread = (blockDim.x * blockIdx.x + threadIdx.x)>>2;
|
||||
if (thread < threads)
|
||||
{
|
||||
// GROESTL
|
||||
uint32_t message[8];
|
||||
uint32_t state[8];
|
||||
|
||||
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
|
||||
|
||||
int hashPosition = nounce - startNounce;
|
||||
uint32_t *inpHash = &g_hash[hashPosition<<4];
|
||||
|
||||
#pragma unroll 4
|
||||
for(int k=0;k<4;k++) message[k] = inpHash[(k<<2)+(threadIdx.x&0x03)];
|
||||
#pragma unroll 4
|
||||
for(int k=4;k<8;k++) message[k] = 0;
|
||||
|
||||
if ((threadIdx.x&0x03) == 0) message[4] = 0x80;
|
||||
if ((threadIdx.x&0x03) == 3) message[7] = 0x01000000;
|
||||
|
||||
uint32_t msgBitsliced[8];
|
||||
to_bitslice_quad(message, msgBitsliced);
|
||||
|
||||
for (int round=0; round<2; round++)
|
||||
{
|
||||
groestl512_progressMessage_quad(state, msgBitsliced);
|
||||
|
||||
if (round < 1)
|
||||
{
|
||||
// Verkettung zweier Runden inclusive Padding.
|
||||
msgBitsliced[ 0] = __byte_perm(state[ 0], 0x00800100, 0x4341 + (((threadIdx.x%4)==3)<<13));
|
||||
msgBitsliced[ 1] = __byte_perm(state[ 1], 0x00800100, 0x4341);
|
||||
msgBitsliced[ 2] = __byte_perm(state[ 2], 0x00800100, 0x4341);
|
||||
msgBitsliced[ 3] = __byte_perm(state[ 3], 0x00800100, 0x4341);
|
||||
msgBitsliced[ 4] = __byte_perm(state[ 4], 0x00800100, 0x4341);
|
||||
msgBitsliced[ 5] = __byte_perm(state[ 5], 0x00800100, 0x4341);
|
||||
msgBitsliced[ 6] = __byte_perm(state[ 6], 0x00800100, 0x4341);
|
||||
msgBitsliced[ 7] = __byte_perm(state[ 7], 0x00800100, 0x4341 + (((threadIdx.x%4)==0)<<4));
|
||||
}
|
||||
}
|
||||
|
||||
// Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash
|
||||
uint32_t *outpHash = &g_hash[hashPosition<<4];
|
||||
uint32_t hash[16];
|
||||
from_bitslice_quad(state, hash);
|
||||
|
||||
if ((threadIdx.x & 0x03) == 0)
|
||||
{
|
||||
#pragma unroll 16
|
||||
for(int k=0;k<16;k++) outpHash[k] = hash[k];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Setup-Funktionen
|
||||
__host__ void quark_groestl512_cpu_init(int thr_id, int threads)
|
||||
{
|
||||
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
|
||||
|
||||
// Texturen mit obigem Makro initialisieren
|
||||
texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
|
||||
texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
|
||||
texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
|
||||
texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
|
||||
texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
|
||||
texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
|
||||
texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
|
||||
texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
|
||||
}
|
||||
|
||||
__host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
|
||||
{
|
||||
// Compute 3.5 und 5.x Geräte am besten mit 768 Threads ansteuern,
|
||||
// alle anderen mit 512 Threads.
|
||||
int threadsperblock = ((props[thr_id].major == 3 && props[thr_id].minor == 5) || props[thr_id].major > 3) ? 768 : 512;
|
||||
int threadsperblock = 256;
|
||||
|
||||
// Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
|
||||
// mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
|
||||
const int factor = 4;
|
||||
|
||||
// berechne wie viele Thread Blocks wir brauchen
|
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||
dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
|
||||
dim3 block(threadsperblock);
|
||||
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
#if USE_SHARED
|
||||
size_t shared_size = 8 * 256 * sizeof(uint32_t);
|
||||
#else
|
||||
size_t shared_size = 0;
|
||||
#endif
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
//fprintf(stderr, "ThrID: %d\n", thr_id);
|
||||
quark_groestl512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
|
||||
quark_groestl512_gpu_hash_64_quad<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
|
||||
|
||||
// Strategisches Sleep Kommando zur Senkung der CPU Last
|
||||
MyStreamSynchronize(NULL, order, thr_id);
|
||||
@ -372,25 +153,20 @@ __host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t sta
|
||||
|
||||
__host__ void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
|
||||
{
|
||||
// Compute 3.5 und 5.x Geräte am besten mit 768 Threads ansteuern,
|
||||
// alle anderen mit 512 Threads.
|
||||
int threadsperblock = ((props[thr_id].major == 3 && props[thr_id].minor == 5) || props[thr_id].major > 3) ? 768 : 512;
|
||||
int threadsperblock = 256;
|
||||
|
||||
// Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
|
||||
// mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
|
||||
const int factor = 4;
|
||||
|
||||
// berechne wie viele Thread Blocks wir brauchen
|
||||
dim3 grid((threads + threadsperblock-1)/threadsperblock);
|
||||
dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
|
||||
dim3 block(threadsperblock);
|
||||
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
#if USE_SHARED
|
||||
size_t shared_size = 8 * 256 * sizeof(uint32_t);
|
||||
#else
|
||||
size_t shared_size = 0;
|
||||
#endif
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
//fprintf(stderr, "ThrID: %d\n", thr_id);
|
||||
quark_groestl512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
|
||||
quark_groestl512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
|
||||
quark_doublegroestl512_gpu_hash_64_quad<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
|
||||
|
||||
// Strategisches Sleep Kommando zur Senkung der CPU Last
|
||||
MyStreamSynchronize(NULL, order, thr_id);
|
||||
|
@ -175,8 +175,6 @@ __host__ void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t star
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 0;
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
|
||||
quark_keccak512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
||||
MyStreamSynchronize(NULL, order, thr_id);
|
||||
}
|
||||
|
@ -442,7 +442,6 @@ __host__ void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t start
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 0;
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
quark_skein512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
||||
|
||||
// Strategisches Sleep Kommando zur Senkung der CPU Last
|
||||
|
@ -157,7 +157,6 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
|
||||
{
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
// TODO: entfernen für eine Release! Ist nur zum Testen!
|
||||
if (opt_benchmark)
|
||||
((uint32_t*)ptarget)[7] = 0x0000ff;
|
||||
|
||||
|
@ -307,8 +307,6 @@ __host__ void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t star
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 0;
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
|
||||
x11_cubehash512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
||||
MyStreamSynchronize(NULL, order, thr_id);
|
||||
}
|
||||
|
@ -225,8 +225,6 @@ __host__ void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNou
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 0;
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
|
||||
x11_echo512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
||||
MyStreamSynchronize(NULL, order, thr_id);
|
||||
}
|
||||
|
@ -376,8 +376,6 @@ __host__ void x11_luffa512_cpu_hash_64(int thr_id, int threads, uint32_t startNo
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 0;
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
|
||||
x11_luffa512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
||||
MyStreamSynchronize(NULL, order, thr_id);
|
||||
}
|
||||
|
@ -1372,8 +1372,6 @@ __host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t start
|
||||
// Größe des dynamischen Shared Memory Bereichs
|
||||
size_t shared_size = 0;
|
||||
|
||||
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
|
||||
|
||||
x11_shavite512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
|
||||
MyStreamSynchronize(NULL, order, thr_id);
|
||||
}
|
||||
|
@ -162,11 +162,8 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
|
||||
{
|
||||
const uint32_t first_nonce = pdata[19];
|
||||
|
||||
// TODO: entfernen für eine Release! Ist nur zum Testen!
|
||||
if (opt_benchmark) {
|
||||
if (opt_benchmark)
|
||||
((uint32_t*)ptarget)[7] = 0x0000ff;
|
||||
pdata[17] = 0;
|
||||
}
|
||||
|
||||
const uint32_t Htarg = ptarget[7];
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user