Browse Source

bump to revision V1.1 with Killer Groestl

master v1.1
Christian Buchner 11 years ago
parent
commit
3b21069504
  1. 2
      JHA/cuda_jha_keccak512.cu
  2. 16
      JHA/jackpotcoin.cu
  3. 13
      README.txt
  4. 418
      bitslice_transformations_quad.cu
  5. 12
      ccminer.vcxproj
  6. 6
      ccminer.vcxproj.filters
  7. 2
      configure.ac
  8. 6
      cpu-miner.c
  9. 4
      cpuminer-config.h
  10. 405
      cuda_groestlcoin.cu
  11. 449
      cuda_myriadgroestl.cu
  12. 1
      cuda_nist5.cu
  13. 315
      groestl_functions_quad.cu
  14. 53
      groestlcoin.cpp
  15. 2
      heavy/cuda_blake512.cu
  16. 2
      heavy/cuda_combine.cu
  17. 2
      heavy/cuda_groestl512.cu
  18. 2
      heavy/cuda_hefty1.cu
  19. 1
      heavy/cuda_keccak512.cu
  20. 1
      heavy/cuda_sha256.cu
  21. 8
      myriadgroestl.cpp
  22. 1
      quark/animecoin.cu
  23. 4
      quark/cuda_bmw512.cu
  24. 2
      quark/cuda_jh512.cu
  25. 4
      quark/cuda_quark_blake512.cu
  26. 2
      quark/cuda_quark_checkhash.cu
  27. 404
      quark/cuda_quark_groestl512.cu
  28. 2
      quark/cuda_quark_keccak512.cu
  29. 1
      quark/cuda_skein512.cu
  30. 1
      quark/quarkcoin.cu
  31. 2
      x11/cuda_x11_cubehash512.cu
  32. 2
      x11/cuda_x11_echo.cu
  33. 2
      x11/cuda_x11_luffa512.cu
  34. 2
      x11/cuda_x11_shavite512.cu
  35. 5
      x11/x11.cu

2
JHA/cuda_jha_keccak512.cu

@ -567,8 +567,6 @@ __host__ void jackpot_keccak512_cpu_hash(int thr_id, int threads, uint32_t start
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
jackpot_keccak512_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash); jackpot_keccak512_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash);
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
} }

16
JHA/jackpotcoin.cu

@ -101,14 +101,12 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
{ {
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
// TODO: entfernen für eine Release! Ist nur zum Testen!
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff; ((uint32_t*)ptarget)[7] = 0x0000ff;
const uint32_t Htarg = ptarget[7]; const uint32_t Htarg = ptarget[7];
const int throughput = 256*4096*4; // 100; const int throughput = 256*4096*4; // 100;
//const int throughput = 256*256*2+100; // 100;
static bool init[8] = {0,0,0,0,0,0,0,0}; static bool init[8] = {0,0,0,0,0,0,0,0};
if (!init[thr_id]) if (!init[thr_id])
@ -167,16 +165,18 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++); quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
} }
// Runde 2 (ohne Gröstl) // Runde 3 (komplett)
// jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01) // jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01)
jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
d_branch1Nonces[thr_id], &nrm1, d_branch1Nonces[thr_id], &nrm1,
d_branch3Nonces[thr_id], &nrm3, d_branch2Nonces[thr_id], &nrm2,
order++); order++);
// verfolge den skein-pfad weiter if (nrm1+nrm2 == nrm3) {
quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); quark_groestl512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
}
// jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01) // jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01)
jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id], jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
@ -226,7 +226,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) { if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {
pdata[19] = foundNonce; pdata[19] = foundNonce;
*hashes_done = (foundNonce - first_nonce + 1)/4; *hashes_done = (foundNonce - first_nonce + 1)/2;
//applog(LOG_INFO, "GPU #%d: result for nonce $%08X does validate on CPU (%d rounds)!", thr_id, foundNonce, rounds); //applog(LOG_INFO, "GPU #%d: result for nonce $%08X does validate on CPU (%d rounds)!", thr_id, foundNonce, rounds);
return 1; return 1;
} else { } else {
@ -238,6 +238,6 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart); } while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
*hashes_done = (pdata[19] - first_nonce + 1)/4; *hashes_done = (pdata[19] - first_nonce + 1)/2;
return 0; return 0;
} }

13
README.txt

@ -1,5 +1,5 @@
ccMiner release 1.0 (May 10th 2014) - "Did anyone say X11?" ccMiner release 1.1 (June 14th 2014) - "Killer Groestl!"
------------------------------------------------------------- -------------------------------------------------------------
*************************************************************** ***************************************************************
@ -30,13 +30,12 @@ FugueCoin
GroestlCoin & Myriad-Groestl GroestlCoin & Myriad-Groestl
JackpotCoin JackpotCoin
QuarkCoin family & AnimeCoin QuarkCoin family & AnimeCoin
TalkCoin
DarkCoin and other X11 coins
where some of these coins have a VERY NOTABLE nVidia advantage where some of these coins have a VERY NOTABLE nVidia advantage
over competing AMD (OpenCL) implementations. over competing AMD (OpenCL) implementations.
X11 algo is being worked on. It will be released when we
have achieved a nice nVidia advantage.
We did not take a big effort on improving usability, so please set We did not take a big effort on improving usability, so please set
your parameters carefully. your parameters carefully.
@ -140,6 +139,12 @@ features.
>>> RELEASE HISTORY <<< >>> RELEASE HISTORY <<<
June 14th 2014 released Killer Groestl quad version which I deem
sufficiently hard to port over to AMD. It isn't
the fastest option for Compute 3.5 and 5.0 cards,
but it is still much faster than the table based
versions.
May 10th 2014 added X11, but without the bells & whistles May 10th 2014 added X11, but without the bells & whistles
(no killer Groestl, SIMD hash quite slow still) (no killer Groestl, SIMD hash quite slow still)

418
bitslice_transformations_quad.cu

@ -0,0 +1,418 @@
// Packs the words held by a group of four neighbouring lanes into eight
// bit-plane words.
//
//   input  : eight 32-bit words per lane. Overwritten in place with the
//            shuffled (and, on odd lanes, half-swapped) values.
//   output : eight 32-bit words per lane. output[b] collects bit b of the
//            sixteen words input[0], other[0], input[1], other[1], ...
//            into its bits 0..15 and bit (b + 8) of the same words into its
//            bits 16..31. Bits 16..31 of the shuffled words are not read.
//
// All lane exchanges use a shuffle width of 4, i.e. they stay inside the
// aligned group of four lanes the calling thread belongs to.
// NOTE(review): presumably all four lanes of a group must call this together
// for the shuffles to return meaningful data - confirm against the callers.
// NOTE(review): this uses the legacy mask-less __shfl; toolchains that target
// Volta or newer only provide the *_sync variants.
__device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *output)
{
    const int quadLane = threadIdx.x % 4;
    // Lanes 1 and 2 read from each other, lanes 0 and 3 read from themselves.
    const int swapLane = quadLane ^ (3 * (quadLane >= 1 && quadLane <= 2));
    const int nextLane = (threadIdx.x + 1) % 4;
    // Lane 0 for the lower lane pair of the group, lane 2 for the upper pair.
    const int pairLane = threadIdx.x & 2;
    const bool oddLane = (threadIdx.x & 1) != 0;

    uint32_t other[8];

#pragma unroll 8
    for (int w = 0; w < 8; w++) {
        input[w] = __shfl((int)input[w], swapLane, 4);
        other[w] = __shfl((int)input[w], nextLane, 4);
        input[w] = __shfl((int)input[w], pairLane, 4);
        other[w] = __shfl((int)other[w], pairLane, 4);

        // Odd lanes work on the upper 16 bits: swap the two halves so that
        // the packing below can always look at bits 0..15.
        if (oddLane) {
            input[w] = __byte_perm(input[w], 0, 0x1032);
            other[w] = __byte_perm(other[w], 0, 0x1032);
        }

        output[w] = 0;
    }

#pragma unroll 8
    for (int bit = 0; bit < 8; bit++) {
        // Bit `bit` of byte 0 of every word lands in output bits 0..15,
        // interleaved as input[0], other[0], input[1], other[1], ...
#pragma unroll 8
        for (int w = 0; w < 8; w++) {
            output[bit] |= ((input[w] >> bit) & 1u) << (2 * w);
            output[bit] |= ((other[w] >> bit) & 1u) << (2 * w + 1);
        }

        // Bit `bit` of byte 1 of every word lands in output bits 16..31,
        // using the same interleaving.
#pragma unroll 8
        for (int w = 0; w < 8; w++) {
            output[bit] |= ((input[w] >> (bit + 8)) & 1u) << (2 * w + 16);
            output[bit] |= ((other[w] >> (bit + 8)) & 1u) << (2 * w + 17);
        }
    }
}
// Unpacks eight bit-plane words into 32-bit words and gathers the result of
// a four-lane group in the first lane of that group.
//
//   input  : eight 32-bit words per lane; only bits 8..15 and 24..31 of each
//            word are read.
//   output : sixteen 32-bit words per lane. For b = 0..7, bit (8 + b) of
//            input[k] becomes bit k and bit (24 + b) of input[k] becomes
//            bit (8 + k) of output[2*b]. The 16-bit values produced this way
//            are then merged across the group with width-4 shuffles: the
//            even slot takes its upper half from the next lane, and the odd
//            slot is a copy of the merged word of the lane two positions on.
//            Every lane whose index within the group is not 0 ends up with
//            all sixteen output words set to zero.
//
// NOTE(review): presumably all four lanes of a group must call this together
// for the shuffles to return meaningful data - confirm against the callers.
// NOTE(review): this uses the legacy mask-less __shfl; toolchains that target
// Volta or newer only provide the *_sync variants.
__device__ __forceinline__ void from_bitslice_quad(uint32_t *input, uint32_t *output)
{
#pragma unroll 8
    for (int slot = 0; slot < 16; slot += 2)
        output[slot] = 0;

#pragma unroll 8
    for (int bit = 0; bit < 8; bit++) {
        const int slot = 2 * bit;

        // Plane bits 8..15 supply the low byte of the unpacked value.
#pragma unroll 8
        for (int plane = 0; plane < 8; plane++)
            output[slot] |= ((input[plane] >> (bit + 8)) & 1u) << plane;

        // Plane bits 24..31 supply the second byte of the unpacked value.
#pragma unroll 8
        for (int plane = 0; plane < 8; plane++)
            output[slot] |= ((input[plane] >> (bit + 24)) & 1u) << (plane + 8);
    }

    const bool oddLane     = (threadIdx.x & 1) != 0;
    const bool groupLeader = (threadIdx.x % 4) == 0;
    const int  nextLane    = (threadIdx.x + 1) % 4;
    const int  acrossLane  = (threadIdx.x + 2) % 4;

#pragma unroll 8
    for (int slot = 0; slot < 16; slot += 2) {
        // Odd lanes move their 16-bit value into the upper half of the word.
        if (oddLane)
            output[slot] = __byte_perm(output[slot], 0, 0x1032);

        // Keep the own lower half and take the upper half from the next lane.
        const uint32_t fromNext = __shfl((int)output[slot], nextLane, 4);
        output[slot] = __byte_perm(output[slot], fromNext, 0x7610);

        // The odd slot receives the merged word of the lane two positions on.
        output[slot + 1] = __shfl((int)output[slot], acrossLane, 4);

        // Only the first lane of each group keeps the assembled words.
        if (!groupLeader) {
            output[slot + 1] = 0;
            output[slot] = 0;
        }
    }
}

12
ccminer.vcxproj

@ -287,6 +287,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
<ClInclude Include="uint256.h" /> <ClInclude Include="uint256.h" />
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<CudaCompile Include="bitslice_transformations_quad.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</CudaCompile>
<CudaCompile Include="cuda_fugue256.cu"> <CudaCompile Include="cuda_fugue256.cu">
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions> <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions> <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@ -311,6 +317,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions> <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions> <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
</CudaCompile> </CudaCompile>
<CudaCompile Include="groestl_functions_quad.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</CudaCompile>
<CudaCompile Include="heavy\cuda_blake512.cu"> <CudaCompile Include="heavy\cuda_blake512.cu">
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions> <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions> <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>

6
ccminer.vcxproj.filters

@ -340,5 +340,11 @@
<CudaCompile Include="x11\simd_functions.cu"> <CudaCompile Include="x11\simd_functions.cu">
<Filter>Source Files\CUDA\x11</Filter> <Filter>Source Files\CUDA\x11</Filter>
</CudaCompile> </CudaCompile>
<CudaCompile Include="bitslice_transformations_quad.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="groestl_functions_quad.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
</ItemGroup> </ItemGroup>
</Project> </Project>

2
configure.ac

@ -1,4 +1,4 @@
AC_INIT([ccminer], [2014.05.10]) AC_INIT([ccminer], [2014.06.14])
AC_PREREQ([2.59c]) AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM AC_CANONICAL_SYSTEM

6
cpu-miner.c

@ -917,8 +917,8 @@ static void *miner_thread(void *userdata)
goto out; goto out;
} }
if (opt_benchmark) // if (opt_benchmark)
if (++rounds == 1) exit(0); // if (++rounds == 1) exit(0);
/* record scanhash elapsed time */ /* record scanhash elapsed time */
gettimeofday(&tv_end, NULL); gettimeofday(&tv_end, NULL);
@ -1469,7 +1469,7 @@ static void signal_handler(int sig)
} }
#endif #endif
#define PROGRAM_VERSION "1.0" #define PROGRAM_VERSION "1.1"
int main(int argc, char *argv[]) int main(int argc, char *argv[])
{ {
struct thr_info *thr; struct thr_info *thr;

4
cpuminer-config.h

@ -152,7 +152,7 @@
#define PACKAGE_NAME "ccminer" #define PACKAGE_NAME "ccminer"
/* Define to the full name and version of this package. */ /* Define to the full name and version of this package. */
#define PACKAGE_STRING "ccminer 2014.05.10" #define PACKAGE_STRING "ccminer 2014.06.14"
/* Define to the one symbol short name of this package. */ /* Define to the one symbol short name of this package. */
#undef PACKAGE_TARNAME #undef PACKAGE_TARNAME
@ -161,7 +161,7 @@
#undef PACKAGE_URL #undef PACKAGE_URL
/* Define to the version of this package. */ /* Define to the version of this package. */
#define PACKAGE_VERSION "2014.05.10" #define PACKAGE_VERSION "2014.06.14"
/* If using the C implementation of alloca, define if you know the /* If using the C implementation of alloca, define if you know the
direction of stack growth for your system; otherwise it will be direction of stack growth for your system; otherwise it will be

405
cuda_groestlcoin.cu

@ -1,4 +1,4 @@
// Auf Groestlcoin spezialisierte Version von Groestl // Auf Groestlcoin spezialisierte Version von Groestl inkl. Bitslice
#include <cuda.h> #include <cuda.h>
#include "cuda_runtime.h" #include "cuda_runtime.h"
@ -7,9 +7,6 @@
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
// it's unfortunate that this is a compile time constant.
#define MAXWELL_OR_FERMI 1
// aus cpu-miner.c // aus cpu-miner.c
extern int device_map[8]; extern int device_map[8];
@ -18,361 +15,80 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
// Folgende Definitionen später durch header ersetzen // Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t; typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t; typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
// diese Struktur wird in der Init Funktion angefordert // diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props; static cudaDeviceProp props[8];
// globaler Speicher für alle HeftyHashes aller Threads
__constant__ uint32_t pTarget[8]; // Single GPU __constant__ uint32_t pTarget[8]; // Single GPU
extern uint32_t *d_resultNonce[8]; extern uint32_t *d_resultNonce[8];
__constant__ uint32_t groestlcoin_gpu_msg[32]; __constant__ uint32_t groestlcoin_gpu_msg[32];
#define SPH_C32(x) ((uint32_t)(x ## U)) // 64 Register Variante für Compute 3.0
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) #include "groestl_functions_quad.cu"
#include "bitslice_transformations_quad.cu"
#define PC32up(j, r) ((uint32_t)((j) + (r)))
#define PC32dn(j, r) 0
#define QC32up(j, r) 0xFFFFFFFF
#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))
#define B32_0(x) __byte_perm(x, 0, 0x4440)
//((x) & 0xFF)
#define B32_1(x) __byte_perm(x, 0, 0x4441)
//(((x) >> 8) & 0xFF)
#define B32_2(x) __byte_perm(x, 0, 0x4442)
//(((x) >> 16) & 0xFF)
#define B32_3(x) __byte_perm(x, 0, 0x4443)
//((x) >> 24)
#if MAXWELL_OR_FERMI
#define USE_SHARED 1
// Maxwell and Fermi cards get the best speed with SHARED access it seems.
#if USE_SHARED
#define T0up(x) (*((uint32_t*)mixtabs + ( (x))))
#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x))))
#define T1up(x) (*((uint32_t*)mixtabs + (512+(x))))
#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
#define T2up(x) (*((uint32_t*)mixtabs + (1024+(x))))
#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
#define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x))))
#else
#define T0up(x) tex1Dfetch(t0up1, x)
#define T0dn(x) tex1Dfetch(t0dn1, x)
#define T1up(x) tex1Dfetch(t1up1, x)
#define T1dn(x) tex1Dfetch(t1dn1, x)
#define T2up(x) tex1Dfetch(t2up1, x)
#define T2dn(x) tex1Dfetch(t2dn1, x)
#define T3up(x) tex1Dfetch(t3up1, x)
#define T3dn(x) tex1Dfetch(t3dn1, x)
#endif
#else
#define USE_SHARED 1
// a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5!
#define T0up(x) (*((uint32_t*)mixtabs + ( (x))))
#define T0dn(x) tex1Dfetch(t0dn1, x)
#define T1up(x) tex1Dfetch(t1up1, x)
#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
#define T2up(x) tex1Dfetch(t2up1, x)
#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
#define T3dn(x) tex1Dfetch(t3dn1, x)
#endif
texture<unsigned int, 1, cudaReadModeElementType> t0up1;
texture<unsigned int, 1, cudaReadModeElementType> t0dn1;
texture<unsigned int, 1, cudaReadModeElementType> t1up1;
texture<unsigned int, 1, cudaReadModeElementType> t1dn1;
texture<unsigned int, 1, cudaReadModeElementType> t2up1;
texture<unsigned int, 1, cudaReadModeElementType> t2dn1;
texture<unsigned int, 1, cudaReadModeElementType> t3up1;
texture<unsigned int, 1, cudaReadModeElementType> t3dn1;
extern uint32_t T0up_cpu[];
extern uint32_t T0dn_cpu[];
extern uint32_t T1up_cpu[];
extern uint32_t T1dn_cpu[];
extern uint32_t T2up_cpu[];
extern uint32_t T2dn_cpu[];
extern uint32_t T3up_cpu[];
extern uint32_t T3dn_cpu[];
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) ) #define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
__global__ void __launch_bounds__(256, 4)
__device__ __forceinline__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs) groestlcoin_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *resNounce)
{
uint32_t t[32];
//#pragma unroll 14
for(int r=0;r<14;r++)
{
switch(r)
{ {
case 0: // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
#pragma unroll 16 int thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4;
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 0); break; if (thread < threads)
case 1:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break;
case 2:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break;
case 3:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break;
case 4:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break;
case 5:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break;
case 6:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break;
case 7:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break;
case 8:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break;
case 9:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break;
case 10:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break;
case 11:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break;
case 12:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break;
case 13:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break;
}
// RBTT
#pragma unroll 16
for(int k=0;k<32;k+=2)
{ {
uint32_t t0_0 = B32_0(a[(k ) & 0x1f]), t9_0 = B32_0(a[(k + 9) & 0x1f]); // GROESTL
uint32_t t2_1 = B32_1(a[(k + 2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]); uint32_t paddedInput[8];
uint32_t t4_2 = B32_2(a[(k + 4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]); #pragma unroll 8
uint32_t t6_3 = B32_3(a[(k + 6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]); for(int k=0;k<8;k++) paddedInput[k] = groestlcoin_gpu_msg[4*k+threadIdx.x%4];
t[k + 0] = T0up( t0_0 ) ^ T1up( t2_1 ) ^ T2up( t4_2 ) ^ T3up( t6_3 ) ^ uint32_t nounce = startNounce + thread;
T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 ); if ((threadIdx.x % 4) == 3)
paddedInput[4] = SWAB32(nounce); // 4*4+3 = 19
t[k + 1] = T0dn( t0_0 ) ^ T1dn( t2_1 ) ^ T2dn( t4_2 ) ^ T3dn( t6_3 ) ^ uint32_t msgBitsliced[8];
T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 ); to_bitslice_quad(paddedInput, msgBitsliced);
}
#pragma unroll 32
for(int k=0;k<32;k++)
a[k] = t[k];
}
}
__device__ __forceinline__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs) uint32_t state[8];
for (int round=0; round<2; round++)
{ {
//#pragma unroll 14 groestl512_progressMessage_quad(state, msgBitsliced);
for(int r=0;r<14;r++)
{
uint32_t t[32];
switch(r) if (round < 1)
{ {
case 0: // Verkettung zweier Runden inclusive Padding.
#pragma unroll 16 msgBitsliced[ 0] = __byte_perm(state[ 0], 0x00800100, 0x4341 + ((threadIdx.x%4)==3)*0x2000);
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 0); a[(k*2)+1] ^= QC32dn(k * 0x10, 0);} break; msgBitsliced[ 1] = __byte_perm(state[ 1], 0x00800100, 0x4341);
case 1: msgBitsliced[ 2] = __byte_perm(state[ 2], 0x00800100, 0x4341);
#pragma unroll 16 msgBitsliced[ 3] = __byte_perm(state[ 3], 0x00800100, 0x4341);
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 1); a[(k*2)+1] ^= QC32dn(k * 0x10, 1);} break; msgBitsliced[ 4] = __byte_perm(state[ 4], 0x00800100, 0x4341);
case 2: msgBitsliced[ 5] = __byte_perm(state[ 5], 0x00800100, 0x4341);
#pragma unroll 16 msgBitsliced[ 6] = __byte_perm(state[ 6], 0x00800100, 0x4341);
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 2); a[(k*2)+1] ^= QC32dn(k * 0x10, 2);} break; msgBitsliced[ 7] = __byte_perm(state[ 7], 0x00800100, 0x4341 + ((threadIdx.x%4)==0)*0x0010);
case 3:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 3); a[(k*2)+1] ^= QC32dn(k * 0x10, 3);} break;
case 4:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 4); a[(k*2)+1] ^= QC32dn(k * 0x10, 4);} break;
case 5:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 5); a[(k*2)+1] ^= QC32dn(k * 0x10, 5);} break;
case 6:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 6); a[(k*2)+1] ^= QC32dn(k * 0x10, 6);} break;
case 7:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 7); a[(k*2)+1] ^= QC32dn(k * 0x10, 7);} break;
case 8:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 8); a[(k*2)+1] ^= QC32dn(k * 0x10, 8);} break;
case 9:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 9); a[(k*2)+1] ^= QC32dn(k * 0x10, 9);} break;
case 10:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 10); a[(k*2)+1] ^= QC32dn(k * 0x10, 10);} break;
case 11:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 11); a[(k*2)+1] ^= QC32dn(k * 0x10, 11);} break;
case 12:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 12); a[(k*2)+1] ^= QC32dn(k * 0x10, 12);} break;
case 13:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13);} break;
}
// RBTT
#pragma unroll 16
for(int k=0;k<32;k+=2)
{
uint32_t t2_0 = B32_0(a[(k + 2) & 0x1f]), t1_0 = B32_0(a[(k + 1) & 0x1f]);
uint32_t t6_1 = B32_1(a[(k + 6) & 0x1f]), t5_1 = B32_1(a[(k + 5) & 0x1f]);
uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2 = B32_2(a[(k + 9) & 0x1f]);
uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]);
t[k + 0] = T0up( t2_0 ) ^ T1up( t6_1 ) ^ T2up( t10_2 ) ^ T3up( t22_3 ) ^
T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn( t9_2 ) ^ T3dn( t13_3 );
t[k + 1] = T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^
T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up( t9_2 ) ^ T3up( t13_3 );
}
#pragma unroll 32
for(int k=0;k<32;k++)
a[k] = t[k];
} }
} }
#if USE_SHARED
__global__ void /* __launch_bounds__(256) */
#else
__global__ void
#endif
groestlcoin_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce) // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash
{ uint32_t out_state[16];
#if USE_SHARED from_bitslice_quad(state, out_state);
extern __shared__ char mixtabs[];
if (threadIdx.x < 256) if (threadIdx.x % 4 == 0)
{
*((uint32_t*)mixtabs + ( threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x);
*((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x);
*((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x);
*((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x);
*((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x);
*((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x);
*((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x);
*((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x);
}
__syncthreads();
#endif
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{ {
// GROESTL
uint32_t message[32];
uint32_t state[32];
#pragma unroll 32
for(int k=0;k<32;k++) message[k] = groestlcoin_gpu_msg[k];
uint32_t nounce = startNounce + thread;
message[19] = SWAB32(nounce);
#pragma unroll 32
for(int u=0;u<32;u++) state[u] = message[u];
state[31] ^= 0x20000;
// Perm
#if USE_SHARED
groestlcoin_perm_P(state, mixtabs);
state[31] ^= 0x20000;
groestlcoin_perm_Q(message, mixtabs);
#else
groestlcoin_perm_P(state, NULL);
state[31] ^= 0x20000;
groestlcoin_perm_Q(message, NULL);
#endif
#pragma unroll 32
for(int u=0;u<32;u++) state[u] ^= message[u];
#pragma unroll 32
for(int u=0;u<32;u++) message[u] = state[u];
#if USE_SHARED
groestlcoin_perm_P(message, mixtabs);
#else
groestlcoin_perm_P(message, NULL);
#endif
#pragma unroll 32
for(int u=0;u<32;u++) state[u] ^= message[u];
////
//// 2. Runde groestl
////
#pragma unroll 16
for(int k=0;k<16;k++) message[k] = state[k + 16];
#pragma unroll 14
for(int k=1;k<15;k++)
message[k+16] = 0;
message[16] = 0x80;
message[31] = 0x01000000;
#pragma unroll 32
for(int u=0;u<32;u++)
state[u] = message[u];
state[31] ^= 0x20000;
// Perm
#if USE_SHARED
groestlcoin_perm_P(state, mixtabs);
state[31] ^= 0x20000;
groestlcoin_perm_Q(message, mixtabs);
#else
groestlcoin_perm_P(state, NULL);
state[31] ^= 0x20000;
groestlcoin_perm_Q(message, NULL);
#endif
#pragma unroll 32
for(int u=0;u<32;u++) state[u] ^= message[u];
#pragma unroll 32
for(int u=0;u<32;u++) message[u] = state[u];
#if USE_SHARED
groestlcoin_perm_P(message, mixtabs);
#else
groestlcoin_perm_P(message, NULL);
#endif
#pragma unroll 32
for(int u=0;u<32;u++) state[u] ^= message[u];
// kopiere Ergebnis
int i, position = -1; int i, position = -1;
bool rc = true; bool rc = true;
#pragma unroll 8 #pragma unroll 8
for (i = 7; i >= 0; i--) { for (i = 7; i >= 0; i--) {
if (state[i+16] > pTarget[i]) { if (out_state[i] > pTarget[i]) {
if(position < i) { if(position < i) {
position = i; position = i;
rc = false; rc = false;
} }
} }
if (state[i+16] < pTarget[i]) { if (out_state[i] < pTarget[i]) {
if(position < i) { if(position < i) {
position = i; position = i;
rc = true; rc = true;
@ -385,33 +101,14 @@ __global__ void
resNounce[0] = nounce; resNounce[0] = nounce;
} }
} }
}
#define texDef(texname, texmem, texsource, texsize) \
unsigned int *texmem; \
cudaMalloc(&texmem, texsize); \
cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
texname.normalized = 0; \
texname.filterMode = cudaFilterModePoint; \
texname.addressMode[0] = cudaAddressModeClamp; \
{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
// Setup-Funktionen // Setup-Funktionen
__host__ void groestlcoin_cpu_init(int thr_id, int threads) __host__ void groestlcoin_cpu_init(int thr_id, int threads)
{ {
cudaSetDevice(device_map[thr_id]); cudaSetDevice(device_map[thr_id]);
cudaGetDeviceProperties(&props, device_map[thr_id]); cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
// Texturen mit obigem Makro initialisieren
texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
// Speicher für Gewinner-Nonce belegen // Speicher für Gewinner-Nonce belegen
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
@ -446,25 +143,21 @@ __host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
__host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce) __host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
{ {
// Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern, int threadsperblock = 256;
// alle anderen mit 512 Threads.
int threadsperblock = (props.major >= 3) ? 768 : 512; // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
// mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
int factor = 4;
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
dim3 block(threadsperblock); dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
#if USE_SHARED
size_t shared_size = 8 * 256 * sizeof(uint32_t);
#else
size_t shared_size = 0; size_t shared_size = 0;
#endif
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
//fprintf(stderr, "ThrID: %d\n", thr_id);
cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
groestlcoin_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]); groestlcoin_gpu_hash_quad<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
// Strategisches Sleep Kommando zur Senkung der CPU Last // Strategisches Sleep Kommando zur Senkung der CPU Last
MyStreamSynchronize(NULL, 0, thr_id); MyStreamSynchronize(NULL, 0, thr_id);

449
cuda_myriadgroestl.cu

@ -1,4 +1,4 @@
// Auf Myriadcoin spezialisierte Version von Groestl // Auf Myriadcoin spezialisierte Version von Groestl inkl. Bitslice
#include <cuda.h> #include <cuda.h>
#include "cuda_runtime.h" #include "cuda_runtime.h"
@ -7,9 +7,6 @@
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
// it's unfortunate that this is a compile time constant.
#define MAXWELL_OR_FERMI 1
// aus cpu-miner.c // aus cpu-miner.c
extern int device_map[8]; extern int device_map[8];
@ -22,15 +19,18 @@ typedef unsigned short uint16_t;
typedef unsigned int uint32_t; typedef unsigned int uint32_t;
// diese Struktur wird in der Init Funktion angefordert // diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props; static cudaDeviceProp props[8];
// globaler Speicher für alle HeftyHashes aller Threads
__constant__ uint32_t pTarget[8]; // Single GPU __constant__ uint32_t pTarget[8]; // Single GPU
uint32_t *d_outputHashes[8];
extern uint32_t *d_resultNonce[8]; extern uint32_t *d_resultNonce[8];
__constant__ uint32_t myriadgroestl_gpu_msg[32]; __constant__ uint32_t myriadgroestl_gpu_msg[32];
// muss expandiert werden // muss expandiert werden
__constant__ uint32_t myr_sha256_gpu_constantTable[64]; __constant__ uint32_t myr_sha256_gpu_constantTable[64];
__constant__ uint32_t myr_sha256_gpu_constantTable2[64];
__constant__ uint32_t myr_sha256_gpu_hashTable[8]; __constant__ uint32_t myr_sha256_gpu_hashTable[8];
uint32_t myr_sha256_cpu_hashTable[] = { uint32_t myr_sha256_cpu_hashTable[] = {
@ -46,6 +46,22 @@ uint32_t myr_sha256_cpu_constantTable[] = {
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
}; };
uint32_t myr_sha256_cpu_w2Table[] = {
0x80000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000200,
0x80000000, 0x01400000, 0x00205000, 0x00005088, 0x22000800, 0x22550014, 0x05089742, 0xa0000020,
0x5a880000, 0x005c9400, 0x0016d49d, 0xfa801f00, 0xd33225d0, 0x11675959, 0xf6e6bfda, 0xb30c1549,
0x08b2b050, 0x9d7c4c27, 0x0ce2a393, 0x88e6e1ea, 0xa52b4335, 0x67a16f49, 0xd732016f, 0x4eeb2e91,
0x5dbf55e5, 0x8eee2335, 0xe2bc5ec2, 0xa83f4394, 0x45ad78f7, 0x36f3d0cd, 0xd99c05e8, 0xb0511dc7,
0x69bc7ac4, 0xbd11375b, 0xe3ba71e5, 0x3b209ff2, 0x18feee17, 0xe25ad9e7, 0x13375046, 0x0515089d,
0x4f0d0f04, 0x2627484e, 0x310128d2, 0xc668b434, 0x420841cc, 0x62d311b8, 0xe59ba771, 0x85a7a484 };
// 64 Register Variante für Compute 3.0
#include "groestl_functions_quad.cu"
#include "bitslice_transformations_quad.cu"
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
#if __CUDA_ARCH__ < 350 #if __CUDA_ARCH__ < 350
// Kepler (Compute 3.0) // Kepler (Compute 3.0)
#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
@ -61,8 +77,6 @@ uint32_t myr_sha256_cpu_constantTable[] = {
#define s0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ R(x, 3)) #define s0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ R(x, 3))
#define s1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ R(x, 10)) #define s1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ R(x, 10))
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
__device__ void myriadgroestl_gpu_sha256(uint32_t *message) __device__ void myriadgroestl_gpu_sha256(uint32_t *message)
{ {
uint32_t W1[16]; uint32_t W1[16];
@ -99,9 +113,7 @@ __device__ void myriadgroestl_gpu_sha256(uint32_t *message)
} }
// Progress W2...W3 // Progress W2...W3
#pragma unroll 3 ////// PART 1
for(int k=0;k<3;k++)
{
#pragma unroll 2 #pragma unroll 2
for(int j=0;j<2;j++) for(int j=0;j<2;j++)
W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
@ -120,7 +132,7 @@ __device__ void myriadgroestl_gpu_sha256(uint32_t *message)
for(int j=0;j<16;j++) for(int j=0;j<16;j++)
{ {
uint32_t T1, T2; uint32_t T1, T2;
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j]; T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16] + W2[j];
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
#pragma unroll 7 #pragma unroll 7
@ -129,46 +141,35 @@ __device__ void myriadgroestl_gpu_sha256(uint32_t *message)
regs[4] += T1; regs[4] += T1;
} }
#pragma unroll 16 ////// PART 2
for(int j=0;j<16;j++) #pragma unroll 2
W1[j] = W2[j]; for(int j=0;j<2;j++)
} W1[j] = s1(W2[14+j]) + W2[9+j] + s0(W2[1+j]) + W2[j];
#pragma unroll 5
#pragma unroll 8 for(int j=2;j<7;j++)
for(int k=0;k<8;k++) W1[j] = s1(W1[j-2]) + W2[9+j] + s0(W2[1+j]) + W2[j];
hash[k] += regs[k];
/////
///// Zweite Runde (wegen Msg-Padding)
/////
#pragma unroll 8 #pragma unroll 8
for(int k=0;k<8;k++) for(int j=7;j<15;j++)
regs[k] = hash[k]; W1[j] = s1(W1[j-2]) + W1[j-7] + s0(W2[1+j]) + W2[j];
W1[0] = SWAB32(0x80); W1[15] = s1(W1[13]) + W1[8] + s0(W1[0]) + W2[15];
#pragma unroll 14
for(int k=1;k<15;k++)
W1[k] = 0;
W1[15] = 512;
// Progress W1 // Rundenfunktion
#pragma unroll 16 #pragma unroll 16
for(int j=0;j<16;j++) for(int j=0;j<16;j++)
{ {
uint32_t T1, T2; uint32_t T1, T2;
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j] + W1[j]; T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 32] + W1[j];
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
#pragma unroll 7 #pragma unroll 7
for (int k=6; k >= 0; k--) regs[k+1] = regs[k]; for (int l=6; l >= 0; l--) regs[l+1] = regs[l];
regs[0] = T1 + T2; regs[0] = T1 + T2;
regs[4] += T1; regs[4] += T1;
} }
// Progress W2...W3 ////// PART 3
#pragma unroll 3
for(int k=0;k<3;k++)
{
#pragma unroll 2 #pragma unroll 2
for(int j=0;j<2;j++) for(int j=0;j<2;j++)
W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j]; W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
@ -187,7 +188,7 @@ __device__ void myriadgroestl_gpu_sha256(uint32_t *message)
for(int j=0;j<16;j++) for(int j=0;j<16;j++)
{ {
uint32_t T1, T2; uint32_t T1, T2;
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j]; T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 48] + W2[j];
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
#pragma unroll 7 #pragma unroll 7
@ -196,9 +197,29 @@ __device__ void myriadgroestl_gpu_sha256(uint32_t *message)
regs[4] += T1; regs[4] += T1;
} }
#pragma unroll 16 #pragma unroll 8
for(int j=0;j<16;j++) for(int k=0;k<8;k++)
W1[j] = W2[j]; hash[k] += regs[k];
/////
///// Zweite Runde (wegen Msg-Padding)
/////
#pragma unroll 8
for(int k=0;k<8;k++)
regs[k] = hash[k];
// Progress W1
#pragma unroll 64
for(int j=0;j<64;j++)
{
uint32_t T1, T2;
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable2[j];
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
#pragma unroll 7
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
regs[0] = T1 + T2;
regs[4] += T1;
} }
#pragma unroll 8 #pragma unroll 8
@ -212,293 +233,55 @@ __device__ void myriadgroestl_gpu_sha256(uint32_t *message)
message[k] = SWAB32(hash[k]); message[k] = SWAB32(hash[k]);
} }
#define SPH_C32(x) ((uint32_t)(x ## U)) __global__ void __launch_bounds__(256, 4)
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) myriadgroestl_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *hashBuffer)
#define PC32up(j, r) ((uint32_t)((j) + (r)))
#define PC32dn(j, r) 0
#define QC32up(j, r) 0xFFFFFFFF
#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))
#define B32_0(x) __byte_perm(x, 0, 0x4440)
//((x) & 0xFF)
#define B32_1(x) __byte_perm(x, 0, 0x4441)
//(((x) >> 8) & 0xFF)
#define B32_2(x) __byte_perm(x, 0, 0x4442)
//(((x) >> 16) & 0xFF)
#define B32_3(x) __byte_perm(x, 0, 0x4443)
//((x) >> 24)
#if MAXWELL_OR_FERMI
#define USE_SHARED 1
// Maxwell and Fermi cards get the best speed with SHARED access it seems.
#if USE_SHARED
#define T0up(x) (*((uint32_t*)mixtabs + ( (x))))
#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x))))
#define T1up(x) (*((uint32_t*)mixtabs + (512+(x))))
#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
#define T2up(x) (*((uint32_t*)mixtabs + (1024+(x))))
#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
#define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x))))
#else
#define T0up(x) tex1Dfetch(t0up1, x)
#define T0dn(x) tex1Dfetch(t0dn1, x)
#define T1up(x) tex1Dfetch(t1up1, x)
#define T1dn(x) tex1Dfetch(t1dn1, x)
#define T2up(x) tex1Dfetch(t2up1, x)
#define T2dn(x) tex1Dfetch(t2dn1, x)
#define T3up(x) tex1Dfetch(t3up1, x)
#define T3dn(x) tex1Dfetch(t3dn1, x)
#endif
#else
#define USE_SHARED 1
// a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5!
#define T0up(x) (*((uint32_t*)mixtabs + ( (x))))
#define T0dn(x) tex1Dfetch(t0dn1, x)
#define T1up(x) tex1Dfetch(t1up1, x)
#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
#define T2up(x) tex1Dfetch(t2up1, x)
#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
#define T3dn(x) tex1Dfetch(t3dn1, x)
#endif
texture<unsigned int, 1, cudaReadModeElementType> t0up1;
texture<unsigned int, 1, cudaReadModeElementType> t0dn1;
texture<unsigned int, 1, cudaReadModeElementType> t1up1;
texture<unsigned int, 1, cudaReadModeElementType> t1dn1;
texture<unsigned int, 1, cudaReadModeElementType> t2up1;
texture<unsigned int, 1, cudaReadModeElementType> t2dn1;
texture<unsigned int, 1, cudaReadModeElementType> t3up1;
texture<unsigned int, 1, cudaReadModeElementType> t3dn1;
extern uint32_t T0up_cpu[];
extern uint32_t T0dn_cpu[];
extern uint32_t T1up_cpu[];
extern uint32_t T1dn_cpu[];
extern uint32_t T2up_cpu[];
extern uint32_t T2dn_cpu[];
extern uint32_t T3up_cpu[];
extern uint32_t T3dn_cpu[];
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
__device__ __forceinline__ void myriadgroestl_perm_P(uint32_t *a, char *mixtabs)
{ {
uint32_t t[32]; // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
int thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4;
//#pragma unroll 14 if (thread < threads)
for(int r=0;r<14;r++)
{
switch(r)
{ {
case 0: // GROESTL
#pragma unroll 16 uint32_t paddedInput[8];
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 0); break; #pragma unroll 8
case 1: for(int k=0;k<8;k++) paddedInput[k] = myriadgroestl_gpu_msg[4*k+threadIdx.x%4];
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break;
case 2:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break;
case 3:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break;
case 4:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break;
case 5:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break;
case 6:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break;
case 7:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break;
case 8:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break;
case 9:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break;
case 10:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break;
case 11:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break;
case 12:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break;
case 13:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break;
}
// RBTT uint32_t nounce = startNounce + thread;
#pragma unroll 16 if ((threadIdx.x % 4) == 3)
for(int k=0;k<32;k+=2) paddedInput[4] = SWAB32(nounce); // 4*4+3 = 19
{
uint32_t t0_0 = B32_0(a[(k ) & 0x1f]), t9_0 = B32_0(a[(k + 9) & 0x1f]);
uint32_t t2_1 = B32_1(a[(k + 2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]);
uint32_t t4_2 = B32_2(a[(k + 4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]);
uint32_t t6_3 = B32_3(a[(k + 6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]);
t[k + 0] = T0up( t0_0 ) ^ T1up( t2_1 ) ^ T2up( t4_2 ) ^ T3up( t6_3 ) ^ uint32_t msgBitsliced[8];
T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 ); to_bitslice_quad(paddedInput, msgBitsliced);
t[k + 1] = T0dn( t0_0 ) ^ T1dn( t2_1 ) ^ T2dn( t4_2 ) ^ T3dn( t6_3 ) ^ uint32_t state[8];
T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 );
}
#pragma unroll 32
for(int k=0;k<32;k++)
a[k] = t[k];
}
}
__device__ __forceinline__ void myriadgroestl_perm_Q(uint32_t *a, char *mixtabs) groestl512_progressMessage_quad(state, msgBitsliced);
{
//#pragma unroll 14
for(int r=0;r<14;r++)
{
uint32_t t[32];
switch(r) uint32_t out_state[16];
{ from_bitslice_quad(state, out_state);
case 0:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 0); a[(k*2)+1] ^= QC32dn(k * 0x10, 0);} break;
case 1:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 1); a[(k*2)+1] ^= QC32dn(k * 0x10, 1);} break;
case 2:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 2); a[(k*2)+1] ^= QC32dn(k * 0x10, 2);} break;
case 3:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 3); a[(k*2)+1] ^= QC32dn(k * 0x10, 3);} break;
case 4:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 4); a[(k*2)+1] ^= QC32dn(k * 0x10, 4);} break;
case 5:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 5); a[(k*2)+1] ^= QC32dn(k * 0x10, 5);} break;
case 6:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 6); a[(k*2)+1] ^= QC32dn(k * 0x10, 6);} break;
case 7:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 7); a[(k*2)+1] ^= QC32dn(k * 0x10, 7);} break;
case 8:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 8); a[(k*2)+1] ^= QC32dn(k * 0x10, 8);} break;
case 9:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 9); a[(k*2)+1] ^= QC32dn(k * 0x10, 9);} break;
case 10:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 10); a[(k*2)+1] ^= QC32dn(k * 0x10, 10);} break;
case 11:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 11); a[(k*2)+1] ^= QC32dn(k * 0x10, 11);} break;
case 12:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 12); a[(k*2)+1] ^= QC32dn(k * 0x10, 12);} break;
case 13:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13);} break;
}
// RBTT if ((threadIdx.x & 0x03) == 0)
#pragma unroll 16
for(int k=0;k<32;k+=2)
{ {
uint32_t t2_0 = B32_0(a[(k + 2) & 0x1f]), t1_0 = B32_0(a[(k + 1) & 0x1f]); uint32_t *outpHash = &hashBuffer[16 * thread];
uint32_t t6_1 = B32_1(a[(k + 6) & 0x1f]), t5_1 = B32_1(a[(k + 5) & 0x1f]); #pragma unroll 16
uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2 = B32_2(a[(k + 9) & 0x1f]); for(int k=0;k<16;k++) outpHash[k] = out_state[k];
uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]);
t[k + 0] = T0up( t2_0 ) ^ T1up( t6_1 ) ^ T2up( t10_2 ) ^ T3up( t22_3 ) ^
T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn( t9_2 ) ^ T3dn( t13_3 );
t[k + 1] = T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^
T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up( t9_2 ) ^ T3up( t13_3 );
} }
#pragma unroll 32
for(int k=0;k<32;k++)
a[k] = t[k];
} }
} }
__global__ void __global__ void
myriadgroestl_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce) myriadgroestl_gpu_hash_quad2(int threads, uint32_t startNounce, uint32_t *resNounce, uint32_t *hashBuffer)
{ {
#if USE_SHARED
extern __shared__ char mixtabs[];
if (threadIdx.x < 256)
{
*((uint32_t*)mixtabs + ( threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x);
*((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x);
*((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x);
*((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x);
*((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x);
*((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x);
*((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x);
*((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x);
}
__syncthreads();
#endif
int thread = (blockDim.x * blockIdx.x + threadIdx.x); int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
// GROESTL
uint32_t message[32];
uint32_t state[32];
#pragma unroll 32
for(int k=0;k<32;k++) message[k] = myriadgroestl_gpu_msg[k];
uint32_t nounce = startNounce + thread; uint32_t nounce = startNounce + thread;
message[19] = SWAB32(nounce);
#pragma unroll 32
for(int u=0;u<32;u++) state[u] = message[u];
state[31] ^= 0x20000;
// Perm
#if USE_SHARED
myriadgroestl_perm_P(state, mixtabs);
state[31] ^= 0x20000;
myriadgroestl_perm_Q(message, mixtabs);
#else
myriadgroestl_perm_P(state, NULL);
state[31] ^= 0x20000;
myriadgroestl_perm_Q(message, NULL);
#endif
#pragma unroll 32
for(int u=0;u<32;u++) state[u] ^= message[u];
#pragma unroll 32
for(int u=0;u<32;u++) message[u] = state[u];
#if USE_SHARED
myriadgroestl_perm_P(message, mixtabs);
#else
myriadgroestl_perm_P(message, NULL);
#endif
#pragma unroll 32
for(int u=0;u<32;u++) state[u] ^= message[u];
uint32_t out_state[16]; uint32_t out_state[16];
uint32_t *inpHash = &hashBuffer[16 * thread];
#pragma unroll 16 #pragma unroll 16
for(int u=0;u<16;u++) out_state[u] = state[u+16]; for (int i=0; i < 16; i++)
out_state[i] = inpHash[i];
myriadgroestl_gpu_sha256(out_state); myriadgroestl_gpu_sha256(out_state);
int i, position = -1; int i, position = -1;
@ -526,16 +309,6 @@ myriadgroestl_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce)
} }
} }
#define texDef(texname, texmem, texsource, texsize) \
unsigned int *texmem; \
cudaMalloc(&texmem, texsize); \
cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
texname.normalized = 0; \
texname.filterMode = cudaFilterModePoint; \
texname.addressMode[0] = cudaAddressModeClamp; \
{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
// Setup-Funktionen // Setup-Funktionen
__host__ void myriadgroestl_cpu_init(int thr_id, int threads) __host__ void myriadgroestl_cpu_init(int thr_id, int threads)
{ {
@ -549,20 +322,22 @@ __host__ void myriadgroestl_cpu_init(int thr_id, int threads)
myr_sha256_cpu_constantTable, myr_sha256_cpu_constantTable,
sizeof(uint32_t) * 64 ); sizeof(uint32_t) * 64 );
cudaGetDeviceProperties(&props, device_map[thr_id]); // zweite CPU-Tabelle bauen und auf die GPU laden
uint32_t temp[64];
for(int i=0;i<64;i++)
temp[i] = myr_sha256_cpu_w2Table[i] + myr_sha256_cpu_constantTable[i];
cudaMemcpyToSymbol( myr_sha256_gpu_constantTable2,
temp,
sizeof(uint32_t) * 64 );
// Texturen mit obigem Makro initialisieren cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
// Speicher für Gewinner-Nonce belegen // Speicher für Gewinner-Nonce belegen
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
// Speicher für temporäreHashes
cudaMalloc(&d_outputHashes[thr_id], 16*sizeof(uint32_t)*threads);
} }
__host__ void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn) __host__ void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
@ -594,25 +369,23 @@ __host__ void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn
__host__ void myriadgroestl_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce) __host__ void myriadgroestl_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
{ {
// Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern, int threadsperblock = 256;
// alle anderen mit 512 Threads.
int threadsperblock = (props.major >= 3) ? 768 : 512;
// berechne wie viele Thread Blocks wir brauchen // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
dim3 grid((threads + threadsperblock-1)/threadsperblock); // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
dim3 block(threadsperblock); const int factor=4;
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
#if USE_SHARED
size_t shared_size = 8 * 256 * sizeof(uint32_t);
#else
size_t shared_size = 0; size_t shared_size = 0;
#endif
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
//fprintf(stderr, "ThrID: %d\n", thr_id);
cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t)); cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
myriadgroestl_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]); // berechne wie viele Thread Blocks wir brauchen
dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
dim3 block(threadsperblock);
myriadgroestl_gpu_hash_quad<<<grid, block, shared_size>>>(threads, startNounce, d_outputHashes[thr_id]);
dim3 grid2((threads + threadsperblock-1)/threadsperblock);
myriadgroestl_gpu_hash_quad2<<<grid2, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id], d_outputHashes[thr_id]);
// Strategisches Sleep Kommando zur Senkung der CPU Last // Strategisches Sleep Kommando zur Senkung der CPU Last
MyStreamSynchronize(NULL, 0, thr_id); MyStreamSynchronize(NULL, 0, thr_id);

1
cuda_nist5.cu

@ -85,7 +85,6 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
{ {
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
// TODO: entfernen für eine Release! Ist nur zum Testen!
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff; ((uint32_t*)ptarget)[7] = 0x0000ff;

315
groestl_functions_quad.cu

@ -0,0 +1,315 @@
// In-place update of the eight words regs[0..7]: every word moves up one
// position (regs[i] receives the old regs[i-1]) and the old top word regs[7]
// wraps around into regs[0] and is additionally XORed into positions 1, 3
// and 4.
// NOTE(review): presumably a bitsliced "multiply by 2" in GF(2^8), with the
// feedback taps coming from the reduction polynomial -- confirm against the
// Groestl reference before relying on this description.
__device__ __forceinline__ void G256_Mul2(uint32_t *regs)
{
    // Snapshot the inputs so the stores below cannot clobber a value that is
    // still needed.
    const uint32_t in0 = regs[0];
    const uint32_t in1 = regs[1];
    const uint32_t in2 = regs[2];
    const uint32_t in3 = regs[3];
    const uint32_t in4 = regs[4];
    const uint32_t in5 = regs[5];
    const uint32_t in6 = regs[6];
    const uint32_t top = regs[7];

    regs[0] = top;
    regs[1] = in0 ^ top;
    regs[2] = in1;
    regs[3] = in2 ^ top;
    regs[4] = in3 ^ top;
    regs[5] = in4;
    regs[6] = in5;
    regs[7] = in6;
}
// Round-constant step used by groestl512_perm_Q_quad.
//
// All lanes complement every one of the eight words. In addition, the lane
// with (threadIdx.x & 3) == 3 XORs constants into the upper 16 bits:
//   x0..x3 : 0xFFFF0000 if bit 0..3 of `round` is set, otherwise nothing
//   x4..x7 : the fixed patterns 0xAAAA0000, 0xCCCC0000, 0xF0F00000, 0xFF000000
//
// Note the parameter order: the words are passed from x7 down to x0.
__device__ __forceinline__ void G256_AddRoundConstantQ_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, int round)
{
    const uint32_t allOnes  = 0xFFFFFFFFu;
    const uint32_t highHalf = 0xFFFF0000u;

    // Bitwise complement of all eight words, on every lane.
    x0 ^= allOnes;
    x1 ^= allOnes;
    x2 ^= allOnes;
    x3 ^= allOnes;
    x4 ^= allOnes;
    x5 ^= allOnes;
    x6 ^= allOnes;
    x7 ^= allOnes;

    const int lane = threadIdx.x & 0x03;
    if (lane == 3) {
        // One bit of the round counter per word, broadcast over the upper half.
        x0 ^= (round & 0x01) ? highHalf : 0u;
        x1 ^= (round & 0x02) ? highHalf : 0u;
        x2 ^= (round & 0x04) ? highHalf : 0u;
        x3 ^= (round & 0x08) ? highHalf : 0u;

        // Round-independent patterns.
        x4 ^= 0xAAAA0000;
        x5 ^= 0xCCCC0000;
        x6 ^= 0xF0F00000;
        x7 ^= 0xFF000000;
    }
}
// Round-constant step used by groestl512_perm_P_quad.
//
// Only the lane with (threadIdx.x & 3) == 0 changes anything; it XORs
// constants into the lower 16 bits:
//   x0..x3 : 0xFFFF if bit 0..3 of `round` is set, otherwise nothing
//   x4..x7 : the fixed patterns 0xAAAA, 0xCCCC, 0xF0F0, 0xFF00
// All other lanes leave their words untouched.
//
// Note the parameter order: the words are passed from x7 down to x0.
__device__ __forceinline__ void G256_AddRoundConstantP_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, int round)
{
    const int lane = threadIdx.x & 0x03;
    if (lane != 0)
        return;

    const uint32_t lowHalf = 0xFFFF;

    // Round-independent patterns.
    x4 ^= 0xAAAA;
    x5 ^= 0xCCCC;
    x6 ^= 0xF0F0;
    x7 ^= 0xFF00;

    // One bit of the round counter per word, broadcast over the lower half.
    x0 ^= (round & 0x01) ? lowHalf : 0u;
    x1 ^= (round & 0x02) ? lowHalf : 0u;
    x2 ^= (round & 0x04) ? lowHalf : 0u;
    x3 ^= (round & 0x08) ? lowHalf : 0u;
}
// Combines the four words (x3, x2, x1, x0) with (y3, y2, y1, y0) using only
// AND and XOR, and writes the result back into x3..x0. The y words are read
// but never written.
// NOTE(review): presumably a bitsliced multiplication in GF(2^4) (hence
// "G16mul") -- confirm against the reference implementation.
__device__ __forceinline__ void G16mul_quad(uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0,
                                            uint32_t &y3, uint32_t &y2, uint32_t &y1, uint32_t &y0)
{
    // Sums of the even-indexed and odd-indexed words of each operand.
    const uint32_t xEven = x2 ^ x0;
    const uint32_t xOdd  = x3 ^ x1;
    const uint32_t yEven = y2 ^ y0;
    const uint32_t yOdd  = y3 ^ y1;

    // Terms shared between the upper pair (x3, x2) and the lower pair (x1, x0).
    const uint32_t cross   = (xEven ^ xOdd) & (yEven ^ yOdd);
    const uint32_t oddMix  = (xEven & yEven) ^ cross;
    const uint32_t evenMix = (xOdd & yOdd) ^ cross ^ oddMix;

    // Upper pair.
    const uint32_t upper = (x2 ^ x3) & (y2 ^ y3);
    x3 = (x3 & y3) ^ upper ^ oddMix;
    x2 = (x2 & y2) ^ upper ^ evenMix;

    // Lower pair.
    const uint32_t lower = (x0 ^ x1) & (y0 ^ y1);
    x1 = (x1 & y1) ^ lower ^ oddMix;
    x0 = (x0 & y0) ^ lower ^ evenMix;
}
// Transforms the eight words x7..x0 in place using G16mul_quad plus AND/XOR
// logic. The words are treated as two groups of four: (x7, x6, x5, x4) and
// (x3, x2, x1, x0).
// NOTE(review): presumably a bitsliced inversion in GF(2^8) built on GF(2^4)
// arithmetic (hence "G256_inv") -- confirm against the reference code.
// The statement order matters: t0..t4 and a are reused several times.
__device__ __forceinline__ void G256_inv_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0)
{
uint32_t t0,t1,t2,t3,t4,t5,t6,a,b;
// Step 1: t3..t0 = G16mul(upper group, lower group). The inputs x7..x0 are
// left untouched because G16mul_quad only writes its first four arguments.
t3 = x7;
t2 = x6;
t1 = x5;
t0 = x4;
G16mul_quad(t3, t2, t1, t0, x3, x2, x1, x0);
// Step 2: fold XOR combinations of (upper ^ lower) into the product.
a = (x4 ^ x0);
t0 ^= a;
t2 ^= (x7 ^ x3) ^ (x5 ^ x1);
t1 ^= (x5 ^ x1) ^ a;
t3 ^= (x6 ^ x2) ^ a;
// Step 3: derive a new four-word value, written back into t3..t0, from the
// current t3..t0 using AND/XOR only (t4, t5, t6, a, b are scratch).
b = t0 ^ t1;
t4 = (t2 ^ t3) & b;
a = t4 ^ t3 ^ t1;
t5 = (t3 & t1) ^ a;
t6 = (t2 & t0) ^ a ^ (t2 ^ t0);
t4 = (t5 ^ t6) & b;
t1 = (t6 & t1) ^ t4;
t0 = (t5 & t0) ^ t4;
t4 = (t5 ^ t6) & (t2^t3);
t3 = (t6 & t3) ^ t4;
t2 = (t5 & t2) ^ t4;
// Step 4: multiply both input groups by the value from step 3. Note the
// argument order (t1, t0, t3, t2): the two word pairs are passed swapped.
G16mul_quad(x3, x2, x1, x0, t1, t0, t3, t2);
G16mul_quad(x7, x6, x5, x4, t1, t0, t3, t2);
}
// In-place XOR-only (linear) transformation of the eight words x0..x7.
// x0 is read but never written; every other word is replaced by an XOR of
// the original inputs.
// NOTE(review): presumably the basis change applied before G256_inv_quad in
// the bitsliced S-box (see sbox_quad) -- confirm against the reference.
//
// Statement order matters: each assignment may only read words that have not
// been overwritten yet, apart from the shared partial sums t0 and t1.
// The references are expected to designate eight distinct variables.
__device__ __forceinline__ void transAtoX_quad(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &x4, uint32_t &x5, uint32_t &x6, uint32_t &x7)
{
    // Partial sums of original inputs that are shared by several outputs.
    const uint32_t t0 = x0 ^ x1 ^ x2;
    const uint32_t t1 = x5 ^ x6;

    // x2 was previously stored twice with the same value (the operands t0, t1
    // and x7 do not change in between and x2 is not read); one store suffices.
    x2 = t0 ^ t1 ^ x7;
    x6 = t0 ^ x3 ^ x6;
    x3 = x0 ^ x1 ^ x3 ^ x4 ^ x7;
    x4 = x0 ^ x4 ^ t1;
    x1 = x0 ^ x1 ^ t1;
    x7 = x0 ^ t1 ^ x7;
    x5 = x0 ^ t1;
}
// In-place XOR-only (linear) transformation of the eight words x0..x7.
// Every output is an XOR of a subset of the original inputs; the subsets are
// spelled out below.
// NOTE(review): presumably the basis change applied after G256_inv_quad in
// the bitsliced S-box (see sbox_quad) -- confirm against the reference.
// The references are expected to designate eight distinct variables.
__device__ __forceinline__ void transXtoA_quad(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &x4, uint32_t &x5, uint32_t &x6, uint32_t &x7)
{
    // Snapshot the inputs so that every output can be written as a direct
    // function of the original values.
    const uint32_t i0 = x0, i1 = x1, i2 = x2, i3 = x3;
    const uint32_t i4 = x4, i5 = x5, i6 = x6, i7 = x7;

    // Sub-expressions used by more than one output.
    const uint32_t i35 = i3 ^ i5;
    const uint32_t i14 = i1 ^ i4;

    x0 = i35;
    x1 = i14 ^ i5;
    x2 = i35 ^ i7;
    x3 = i0 ^ i2 ^ i35 ^ i6;
    x4 = i3 ^ i7;
    x5 = i0 ^ i6;
    x6 = i14 ^ i6;
    x7 = i4 ^ i35 ^ i7 ^ i6;
}
// Applies three transformations in sequence to the eight words r[0..7]:
// transAtoX_quad, G256_inv_quad and transXtoA_quad, each with its own
// permutation of the word indices, and finally complements r[0], r[1], r[5]
// and r[6].
// NOTE(review): presumably the bitsliced Groestl/AES S-box; the final
// complements would then correspond to the affine constant -- confirm.
__device__ __forceinline__ void sbox_quad(uint32_t *r)
{
    // The index permutations below are part of the algorithm; keep them as is.
    transAtoX_quad(r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7]);
    G256_inv_quad(r[2], r[4], r[1], r[7], r[3], r[0], r[5], r[6]);
    transXtoA_quad(r[7], r[1], r[4], r[2], r[6], r[5], r[0], r[3]);

    // Bitwise complement of words 0, 1, 5 and 6.
    const uint32_t allOnes = 0xFFFFFFFFu;
    r[0] ^= allOnes;
    r[1] ^= allOnes;
    r[5] ^= allOnes;
    r[6] ^= allOnes;
}
// Applies the same lane-dependent transformation to each of the eight words.
// For a word w, with lane = threadIdx.x & 3:
//   - __byte_perm(w, 0, 0x1010) duplicates the low  16 bits of w into both halves
//   - __byte_perm(w, 0, 0x3232) duplicates the high 16 bits of w into both halves
//   - each duplicate is shifted right (low half by 2*lane, high half by
//     2*lane + 1, or by 11 on lane 3)
//   - selector 0x5410 reassembles the word from the low 16 bits of both results
// Because every shift amount is below 16, this rotates each 16-bit half of w
// right by its own amount.
// NOTE(review): presumably the ShiftBytes step of Groestl's P permutation in
// bitsliced form -- confirm against the reference.
__device__ __forceinline__ void G256_ShiftBytesP_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0)
{
    const int lane    = threadIdx.x & 0x03;
    const int loShift = lane << 1;
    const int hiShift = loShift + 1 + ((lane == 3) << 2);

// Rotate both 16-bit halves of one word (see the description above).
#define G256_SHIFT_WORD_P(w)                                        \
    do {                                                            \
        const uint32_t lo = __byte_perm((w), 0, 0x1010) >> loShift; \
        const uint32_t hi = __byte_perm((w), 0, 0x3232) >> hiShift; \
        (w) = __byte_perm(lo, hi, 0x5410);                          \
    } while (0)

    G256_SHIFT_WORD_P(x0);
    G256_SHIFT_WORD_P(x1);
    G256_SHIFT_WORD_P(x2);
    G256_SHIFT_WORD_P(x3);
    G256_SHIFT_WORD_P(x4);
    G256_SHIFT_WORD_P(x5);
    G256_SHIFT_WORD_P(x6);
    G256_SHIFT_WORD_P(x7);

#undef G256_SHIFT_WORD_P
}
// Applies the same lane-dependent transformation to each of the eight words.
// For a word w, with lane = threadIdx.x & 3:
//   - __byte_perm(w, 0, 0x1010) duplicates the low  16 bits of w into both halves
//   - __byte_perm(w, 0, 0x3232) duplicates the high 16 bits of w into both halves
//   - each duplicate is shifted right by a lane-dependent amount:
//       lane      : 0   1   2   3
//       low half  : 1   5   0   4
//       high half : 3  11   2   6
//   - selector 0x5410 reassembles the word from the low 16 bits of both results
// Because every shift amount is below 16, this rotates each 16-bit half of w
// right by its own amount.
// NOTE(review): presumably the ShiftBytes step of Groestl's Q permutation in
// bitsliced form -- confirm against the reference.
__device__ __forceinline__ void G256_ShiftBytesQ_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0)
{
    const int lane    = threadIdx.x & 0x03;
    const int loShift = (1 - (lane >> 1)) + ((lane & 0x01) << 2);
    const int hiShift = loShift + 2 + ((lane == 1) << 2);

// Rotate both 16-bit halves of one word (see the description above).
#define G256_SHIFT_WORD_Q(w)                                        \
    do {                                                            \
        const uint32_t lo = __byte_perm((w), 0, 0x1010) >> loShift; \
        const uint32_t hi = __byte_perm((w), 0, 0x3232) >> hiShift; \
        (w) = __byte_perm(lo, hi, 0x5410);                          \
    } while (0)

    G256_SHIFT_WORD_Q(x0);
    G256_SHIFT_WORD_Q(x1);
    G256_SHIFT_WORD_Q(x2);
    G256_SHIFT_WORD_Q(x3);
    G256_SHIFT_WORD_Q(x4);
    G256_SHIFT_WORD_Q(x5);
    G256_SHIFT_WORD_Q(x6);
    G256_SHIFT_WORD_Q(x7);

#undef G256_SHIFT_WORD_Q
}
// Replaces r[0..7] with a mix of words fetched from the other lanes of the
// same 4-lane group, combined by XOR with two G256_Mul2 steps in between.
// Every lane of a group must call this together: the A() macro exchanges
// registers between lanes with a width-4 warp shuffle.
// NOTE(review): presumably the MixBytes step of Groestl in bitsliced form --
// confirm against the reference.
// NOTE(review): __shfl is the legacy, mask-less shuffle intrinsic (needs
// compute capability 3.0+ and is not available when targeting Volta or
// newer); a port to __shfl_sync needs a correct participation mask.
__device__ __forceinline__ void G256_MixFunction_quad(uint32_t *r)
{
// SHIFT64_16: bytes 2..3 of lo followed by bytes 0..1 of hi, i.e. the low
// 32 bits of the 64-bit value hi:lo shifted right by 16.
#define SHIFT64_16(hi, lo) __byte_perm(lo, hi, 0x5432)
// A(v, u): word r[v] as held by the lane u positions further on in this
// 4-lane group (wrapping around).
#define A(v, u) __shfl((int)r[v], ((threadIdx.x+u)&0x03), 4)
// S(idx, l): SHIFT64_16 applied to the words of lanes l+1 (high) and l (low).
#define S(idx, l) SHIFT64_16( A(idx, (l+1)), A(idx, l) )
// The four selectors below combine S() and A() fetches at lane offset bc.
#define DOUBLE_ODD(i, bc) ( S(i, (bc)) ^ A(i, (bc) + 1) )
#define DOUBLE_EVEN(i, bc) ( S(i, (bc)) ^ A(i, (bc) ) )
#define SINGLE_ODD(i, bc) ( S(i, (bc)) )
#define SINGLE_EVEN(i, bc) ( A(i, (bc)) )
uint32_t b[8];
// Stage 1: initial combination, then one G256_Mul2 over all eight words.
#pragma unroll 8
for(int i=0;i<8;i++)
b[i] = DOUBLE_ODD(i, 1) ^ DOUBLE_EVEN(i, 3);
G256_Mul2(b);
// Stage 2: fold in three more terms, then a second G256_Mul2.
#pragma unroll 8
for(int i=0;i<8;i++)
b[i] = b[i] ^ DOUBLE_ODD(i, 3) ^ DOUBLE_ODD(i, 4) ^ SINGLE_ODD(i, 6);
G256_Mul2(b);
// Stage 3: fold in the remaining terms and write the result back to r.
// r is only overwritten here, after stages 1 and 2 have read it via A().
#pragma unroll 8
for(int i=0;i<8;i++)
r[i] = b[i] ^ DOUBLE_EVEN(i, 2) ^ DOUBLE_EVEN(i, 3) ^ SINGLE_EVEN(i, 5);
// NOTE(review): DOUBLE_ODD, DOUBLE_EVEN, SINGLE_ODD and SINGLE_EVEN are not
// undefined here and stay visible to everything that follows, while
// "#undef t" and "#undef X" have no matching #define in this function.
// Left unchanged because other files that include this one may rely on it.
#undef S
#undef A
#undef SHIFT64_16
#undef t
#undef X
}
// Runs 14 rounds over the eight words r[0..7], in place. Each round applies,
// in order: G256_AddRoundConstantP_quad (with the round index),
// sbox_quad, G256_ShiftBytesP_quad and G256_MixFunction_quad.
// All four lanes of a group must call this together because
// G256_MixFunction_quad exchanges data between lanes.
__device__ __forceinline__ void groestl512_perm_P_quad(uint32_t *r)
{
    const int numRounds = 14;

    int rnd = 0;
    while (rnd < numRounds)
    {
        // The helpers take the words from r[7] down to r[0].
        G256_AddRoundConstantP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], rnd);
        sbox_quad(r);
        G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]);
        G256_MixFunction_quad(r);
        ++rnd;
    }
}
// Runs 14 rounds over the eight words r[0..7], in place. Each round applies,
// in order: G256_AddRoundConstantQ_quad (with the round index),
// sbox_quad, G256_ShiftBytesQ_quad and G256_MixFunction_quad.
// All four lanes of a group must call this together because
// G256_MixFunction_quad exchanges data between lanes.
__device__ __forceinline__ void groestl512_perm_Q_quad(uint32_t *r)
{
    const int numRounds = 14;

    int rnd = 0;
    while (rnd < numRounds)
    {
        // The helpers take the words from r[7] down to r[0].
        G256_AddRoundConstantQ_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], rnd);
        sbox_quad(r);
        G256_ShiftBytesQ_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]);
        G256_MixFunction_quad(r);
        ++rnd;
    }
}
// Processes one eight-word message share per lane and leaves the result in
// state[0..7]. message[0..7] is used as scratch space and is overwritten.
//
// Sequence:
//   1. state = message, with bit 0x00008000 of word 1 flipped on the lane
//      where (threadIdx.x & 3) == 3
//   2. state = P(state); the same bit is flipped again afterwards
//   3. message = Q(message)
//   4. state ^= message; message = state
//   5. message = P(message); state ^= message
// NOTE(review): presumably the Groestl-512 compression of a single block,
// with the initial value folded into the flipped bit, followed by the output
// transformation -- confirm against the specification.
// All four lanes of a group must call this together (the permutations
// exchange data between lanes).
__device__ __forceinline__ void groestl512_progressMessage_quad(uint32_t *state, uint32_t *message)
{
    const bool isLane3 = ((threadIdx.x & 0x03) == 3);

    // Step 1: start from a copy of the message.
#pragma unroll 8
    for (int i = 0; i < 8; ++i)
        state[i] = message[i];
    if (isLane3)
        state[1] ^= 0x00008000;

    // Steps 2 and 3: P on the state, Q on the message.
    groestl512_perm_P_quad(state);
    if (isLane3)
        state[1] ^= 0x00008000;
    groestl512_perm_Q_quad(message);

    // Step 4: combine both results and keep a copy in message for step 5.
#pragma unroll 8
    for (int i = 0; i < 8; ++i)
    {
        const uint32_t combined = state[i] ^ message[i];
        state[i] = combined;
        message[i] = combined;
    }

    // Step 5: one more P pass on the copy, folded back into the state.
    groestl512_perm_P_quad(message);
#pragma unroll 8
    for (int i = 0; i < 8; ++i)
        state[i] ^= message[i];
}

53
groestlcoin.cpp

@ -46,7 +46,6 @@ static void groestlhash(void *state, const void *input)
sph_groestl512_context ctx_groestl[2]; sph_groestl512_context ctx_groestl[2];
static unsigned char pblank[1]; static unsigned char pblank[1];
int ii;
uint32_t mask = 8; uint32_t mask = 8;
uint32_t zero = 0; uint32_t zero = 0;
@ -66,16 +65,18 @@ static void groestlhash(void *state, const void *input)
memcpy(state, hashB, 32); memcpy(state, hashB, 32);
} }
extern bool opt_benchmark;
extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t *ptarget, extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
uint32_t max_nonce, unsigned long *hashes_done) uint32_t max_nonce, unsigned long *hashes_done)
{ {
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x000000ff;
uint32_t start_nonce = pdata[19]++; uint32_t start_nonce = pdata[19]++;
const uint32_t Htarg = ptarget[7]; const uint32_t Htarg = ptarget[7];
const uint32_t throughPut = 4096 * 128; const uint32_t throughPut = 4096 * 128;
//const uint32_t throughPut = 1; //const uint32_t throughPut = 1;
int i;
uint32_t *outputHash = (uint32_t*)malloc(throughPut * 16 * sizeof(uint32_t)); uint32_t *outputHash = (uint32_t*)malloc(throughPut * 16 * sizeof(uint32_t));
// init // init
@ -87,8 +88,6 @@ extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t
} }
// Endian Drehung ist notwendig // Endian Drehung ist notwendig
//char testdata[] = {"\x70\x00\x00\x00\x5d\x38\x5b\xa1\x14\xd0\x79\x97\x0b\x29\xa9\x41\x8f\xd0\x54\x9e\x7d\x68\xa9\x5c\x7f\x16\x86\x21\xa3\x14\x20\x10\x00\x00\x00\x00\x57\x85\x86\xd1\x49\xfd\x07\xb2\x2f\x3a\x8a\x34\x7c\x51\x6d\xe7\x05\x2f\x03\x4d\x2b\x76\xff\x68\xe0\xd6\xec\xff\x9b\x77\xa4\x54\x89\xe3\xfd\x51\x17\x32\x01\x1d\xf0\x73\x10\x00"};
//pdata = (uint32_t*)testdata;
uint32_t endiandata[32]; uint32_t endiandata[32];
for (int kk=0; kk < 32; kk++) for (int kk=0; kk < 32; kk++)
be32enc(&endiandata[kk], pdata[kk]); be32enc(&endiandata[kk], pdata[kk]);
@ -102,35 +101,6 @@ extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t
groestlcoin_cpu_hash(thr_id, throughPut, pdata[19], outputHash, &foundNounce); groestlcoin_cpu_hash(thr_id, throughPut, pdata[19], outputHash, &foundNounce);
/*
{
for(i=0;i<throughPut;i++)
{
uint32_t tmpHash[8];
endiandata[19] = SWAP32(pdata[19]);
groestlhash(tmpHash, endiandata);
int ii;
printf("result GPU: ");
for (ii=0; ii < 32; ii++)
{
printf ("%.2x",((uint8_t*)&outputHash[8*i])[ii]);
};
printf ("\n");
groestlhash(tmpHash, endiandata);
printf("result CPU: ");
for (ii=0; ii < 32; ii++)
{
printf ("%.2x",((uint8_t*)tmpHash)[ii]);
};
}
exit(0);
}
*/
if(foundNounce < 0xffffffff) if(foundNounce < 0xffffffff)
{ {
uint32_t tmpHash[8]; uint32_t tmpHash[8];
@ -147,21 +117,6 @@ extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t
} }
foundNounce = 0xffffffff; foundNounce = 0xffffffff;
/*
int ii;
printf("result GPU: ");
for (ii=0; ii < 32; ii++)
{
printf ("%.2x",((uint8_t*)&outputHash[0])[ii]);
};
printf ("\n");
printf("result CPU: ");
for (ii=0; ii < 32; ii++)
{
printf ("%.2x",((uint8_t*)tmpHash)[ii]);
};
printf ("\n");
*/
} }
if (pdata[19] + throughPut < pdata[19]) if (pdata[19] + throughPut < pdata[19])

2
heavy/cuda_blake512.cu

@ -269,8 +269,6 @@ __host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
if (BLOCKSIZE == 80) if (BLOCKSIZE == 80)
blake512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); blake512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
else if (BLOCKSIZE == 84) else if (BLOCKSIZE == 84)

2
heavy/cuda_combine.cu

@ -141,8 +141,6 @@ void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *h
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
combine_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], d_nonceVector[thr_id]); combine_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], d_nonceVector[thr_id]);
// da die Hash Auswertung noch auf der CPU erfolgt, müssen die Ergebnisse auf jeden Fall zum Host kopiert werden // da die Hash Auswertung noch auf der CPU erfolgt, müssen die Ergebnisse auf jeden Fall zum Host kopiert werden

2
heavy/cuda_groestl512.cu

@ -824,8 +824,6 @@ __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
if (BLOCKSIZE == 84) if (BLOCKSIZE == 84)
groestl512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); groestl512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
else if (BLOCKSIZE == 80) else if (BLOCKSIZE == 80)

2
heavy/cuda_hefty1.cu

@ -416,8 +416,6 @@ __host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
size_t shared_size = 0; size_t shared_size = 0;
#endif #endif
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
hefty_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (void*)d_heftyHashes[thr_id]); hefty_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (void*)d_heftyHashes[thr_id]);
// Strategisches Sleep Kommando zur Senkung der CPU Last // Strategisches Sleep Kommando zur Senkung der CPU Last

1
heavy/cuda_keccak512.cu

@ -279,7 +279,6 @@ __host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
if (BLOCKSIZE==84) if (BLOCKSIZE==84)
keccak512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); keccak512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
else if (BLOCKSIZE==80) else if (BLOCKSIZE==80)

1
heavy/cuda_sha256.cu

@ -271,7 +271,6 @@ __host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce)
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
if (BLOCKSIZE == 84) if (BLOCKSIZE == 84)
sha256_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]); sha256_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
else if (BLOCKSIZE == 80) { else if (BLOCKSIZE == 80) {

8
myriadgroestl.cpp

@ -35,17 +35,19 @@ static void myriadhash(void *state, const void *input)
memcpy(state, hashB, 32); memcpy(state, hashB, 32);
} }
extern bool opt_benchmark;
extern "C" int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptarget, extern "C" int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
uint32_t max_nonce, unsigned long *hashes_done) uint32_t max_nonce, unsigned long *hashes_done)
{ {
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x000000ff;
uint32_t start_nonce = pdata[19]++; uint32_t start_nonce = pdata[19]++;
const uint32_t throughPut = 128 * 1024; const uint32_t throughPut = 128 * 1024;
// const uint32_t throughPut = 1;
uint32_t *outputHash = (uint32_t*)malloc(throughPut * 16 * sizeof(uint32_t)); uint32_t *outputHash = (uint32_t*)malloc(throughPut * 16 * sizeof(uint32_t));
// TODO: entfernen für eine Release! Ist nur zum Testen!
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff; ((uint32_t*)ptarget)[7] = 0x0000ff;

1
quark/animecoin.cu

@ -175,7 +175,6 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
{ {
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
// TODO: entfernen für eine Release! Ist nur zum Testen!
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x00000f; ((uint32_t*)ptarget)[7] = 0x00000f;

4
quark/cuda_bmw512.cu

@ -447,8 +447,6 @@ __host__ void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNo
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
quark_bmw512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); quark_bmw512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
} }
@ -464,8 +462,6 @@ __host__ void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNo
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
quark_bmw512_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash); quark_bmw512_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash);
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
} }

2
quark/cuda_jh512.cu

@ -350,8 +350,6 @@ __host__ void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNou
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
quark_jh512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); quark_jh512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
} }

4
quark/cuda_quark_blake512.cu

@ -406,8 +406,6 @@ __host__ void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t start
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
quark_blake512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_nonceVector, (uint64_t*)d_outputHash); quark_blake512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_nonceVector, (uint64_t*)d_outputHash);
// Strategisches Sleep Kommando zur Senkung der CPU Last // Strategisches Sleep Kommando zur Senkung der CPU Last
@ -425,8 +423,6 @@ __host__ void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t start
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
quark_blake512_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash); quark_blake512_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);
// Strategisches Sleep Kommando zur Senkung der CPU Last // Strategisches Sleep Kommando zur Senkung der CPU Last

2
quark/cuda_quark_checkhash.cu

@ -89,8 +89,6 @@ __host__ uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t star
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
quark_check_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]); quark_check_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);
// Strategisches Sleep Kommando zur Senkung der CPU Last // Strategisches Sleep Kommando zur Senkung der CPU Last

404
quark/cuda_quark_groestl512.cu

@ -1,4 +1,4 @@
// Auf QuarkCoin spezialisierte Version von Groestl // Auf QuarkCoin spezialisierte Version von Groestl inkl. Bitslice
#include <cuda.h> #include <cuda.h>
#include "cuda_runtime.h" #include "cuda_runtime.h"
@ -7,9 +7,6 @@
#include <stdio.h> #include <stdio.h>
#include <memory.h> #include <memory.h>
// it's unfortunate that this is a compile time constant.
#define MAXWELL_OR_FERMI 1
// aus cpu-miner.c // aus cpu-miner.c
extern int device_map[8]; extern int device_map[8];
@ -18,353 +15,137 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
// Folgende Definitionen später durch header ersetzen // Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t; typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t; typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
// diese Struktur wird in der Init Funktion angefordert // diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8]; static cudaDeviceProp props[8];
#define SPH_C32(x) ((uint32_t)(x ## U)) // 64 Register Variante für Compute 3.0
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF)) #include "groestl_functions_quad.cu"
#include "bitslice_transformations_quad.cu"
#define PC32up(j, r) ((uint32_t)((j) + (r)))
#define PC32dn(j, r) 0
#define QC32up(j, r) 0xFFFFFFFF
#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))
#define B32_0(x) __byte_perm(x, 0, 0x4440)
//((x) & 0xFF)
#define B32_1(x) __byte_perm(x, 0, 0x4441)
//(((x) >> 8) & 0xFF)
#define B32_2(x) __byte_perm(x, 0, 0x4442)
//(((x) >> 16) & 0xFF)
#define B32_3(x) __byte_perm(x, 0, 0x4443)
//((x) >> 24)
#if MAXWELL_OR_FERMI
#define USE_SHARED 1
// Maxwell and Fermi cards get the best speed with SHARED access it seems.
#if USE_SHARED
#define T0up(x) (*((uint32_t*)mixtabs + ( (x))))
#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x))))
#define T1up(x) (*((uint32_t*)mixtabs + (512+(x))))
#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
#define T2up(x) (*((uint32_t*)mixtabs + (1024+(x))))
#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
#define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x))))
#else
#define T0up(x) tex1Dfetch(t0up1, x)
#define T0dn(x) tex1Dfetch(t0dn1, x)
#define T1up(x) tex1Dfetch(t1up1, x)
#define T1dn(x) tex1Dfetch(t1dn1, x)
#define T2up(x) tex1Dfetch(t2up1, x)
#define T2dn(x) tex1Dfetch(t2dn1, x)
#define T3up(x) tex1Dfetch(t3up1, x)
#define T3dn(x) tex1Dfetch(t3dn1, x)
#endif
#else
#define USE_SHARED 1
// a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5!
#define T0up(x) (*((uint32_t*)mixtabs + ( (x))))
#define T0dn(x) tex1Dfetch(t0dn1, x)
#define T1up(x) tex1Dfetch(t1up1, x)
#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
#define T2up(x) tex1Dfetch(t2up1, x)
#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
#define T3dn(x) tex1Dfetch(t3dn1, x)
#endif
texture<unsigned int, 1, cudaReadModeElementType> t0up1;
texture<unsigned int, 1, cudaReadModeElementType> t0dn1;
texture<unsigned int, 1, cudaReadModeElementType> t1up1;
texture<unsigned int, 1, cudaReadModeElementType> t1dn1;
texture<unsigned int, 1, cudaReadModeElementType> t2up1;
texture<unsigned int, 1, cudaReadModeElementType> t2dn1;
texture<unsigned int, 1, cudaReadModeElementType> t3up1;
texture<unsigned int, 1, cudaReadModeElementType> t3dn1;
extern uint32_t T0up_cpu[];
extern uint32_t T0dn_cpu[];
extern uint32_t T1up_cpu[];
extern uint32_t T1dn_cpu[];
extern uint32_t T2up_cpu[];
extern uint32_t T2dn_cpu[];
extern uint32_t T3up_cpu[];
extern uint32_t T3dn_cpu[];
__device__ __forceinline__ void quark_groestl512_perm_P(uint32_t *a, char *mixtabs)
{
uint32_t t[32];
//#pragma unroll 14 __global__ void __launch_bounds__(256, 4)
for(int r=0;r<14;r++) quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
{ {
switch(r) // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
if (thread < threads)
{ {
case 0: // GROESTL
#pragma unroll 16 uint32_t message[8];
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 0); break; uint32_t state[8];
case 1:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 1); break;
case 2:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 2); break;
case 3:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 3); break;
case 4:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 4); break;
case 5:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 5); break;
case 6:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 6); break;
case 7:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 7); break;
case 8:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 8); break;
case 9:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 9); break;
case 10:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 10); break;
case 11:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 11); break;
case 12:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 12); break;
case 13:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 13); break;
}
// RBTT uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
#pragma unroll 16
for(int k=0;k<32;k+=2)
{
uint32_t t0_0 = B32_0(a[(k ) & 0x1f]), t9_0 = B32_0(a[(k + 9) & 0x1f]);
uint32_t t2_1 = B32_1(a[(k + 2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]);
uint32_t t4_2 = B32_2(a[(k + 4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]);
uint32_t t6_3 = B32_3(a[(k + 6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]);
t[k + 0] = T0up( t0_0 ) ^ T1up( t2_1 ) ^ T2up( t4_2 ) ^ T3up( t6_3 ) ^ int hashPosition = nounce - startNounce;
T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 ); uint32_t *inpHash = &g_hash[hashPosition<<4];
t[k + 1] = T0dn( t0_0 ) ^ T1dn( t2_1 ) ^ T2dn( t4_2 ) ^ T3dn( t6_3 ) ^ #pragma unroll 4
T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 ); for(int k=0;k<4;k++) message[k] = inpHash[(k<<2) + (threadIdx.x&0x03)];
} #pragma unroll 4
#pragma unroll 32 for(int k=4;k<8;k++) message[k] = 0;
for(int k=0;k<32;k++)
a[k] = t[k];
}
}
__device__ __forceinline__ void quark_groestl512_perm_Q(uint32_t *a, char *mixtabs) if ((threadIdx.x&0x03) == 0) message[4] = 0x80;
{ if ((threadIdx.x&0x03) == 3) message[7] = 0x01000000;
//#pragma unroll 14
for(int r=0;r<14;r++)
{
uint32_t t[32];
switch(r) uint32_t msgBitsliced[8];
{ to_bitslice_quad(message, msgBitsliced);
case 0:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 0); a[(k*2)+1] ^= QC32dn(k<< 4, 0);} break;
case 1:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 1); a[(k*2)+1] ^= QC32dn(k<< 4, 1);} break;
case 2:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 2); a[(k*2)+1] ^= QC32dn(k<< 4, 2);} break;
case 3:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 3); a[(k*2)+1] ^= QC32dn(k<< 4, 3);} break;
case 4:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 4); a[(k*2)+1] ^= QC32dn(k<< 4, 4);} break;
case 5:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 5); a[(k*2)+1] ^= QC32dn(k<< 4, 5);} break;
case 6:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 6); a[(k*2)+1] ^= QC32dn(k<< 4, 6);} break;
case 7:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 7); a[(k*2)+1] ^= QC32dn(k<< 4, 7);} break;
case 8:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 8); a[(k*2)+1] ^= QC32dn(k<< 4, 8);} break;
case 9:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 9); a[(k*2)+1] ^= QC32dn(k<< 4, 9);} break;
case 10:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 10); a[(k*2)+1] ^= QC32dn(k<< 4, 10);} break;
case 11:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 11); a[(k*2)+1] ^= QC32dn(k<< 4, 11);} break;
case 12:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 12); a[(k*2)+1] ^= QC32dn(k<< 4, 12);} break;
case 13:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 13); a[(k*2)+1] ^= QC32dn(k<< 4, 13);} break;
}
// RBTT groestl512_progressMessage_quad(state, msgBitsliced);
#pragma unroll 16
for(int k=0;k<32;k+=2)
{
uint32_t t2_0 = B32_0(a[(k + 2) & 0x1f]), t1_0 = B32_0(a[(k + 1) & 0x1f]);
uint32_t t6_1 = B32_1(a[(k + 6) & 0x1f]), t5_1 = B32_1(a[(k + 5) & 0x1f]);
uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2 = B32_2(a[(k + 9) & 0x1f]);
uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]);
t[k + 0] = T0up( t2_0 ) ^ T1up( t6_1 ) ^ T2up( t10_2 ) ^ T3up( t22_3 ) ^ // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash
T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn( t9_2 ) ^ T3dn( t13_3 ); uint32_t *outpHash = &g_hash[hashPosition<<4];
uint32_t hash[16];
from_bitslice_quad(state, hash);
t[k + 1] = T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^ if ((threadIdx.x & 0x03) == 0)
T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up( t9_2 ) ^ T3up( t13_3 ); {
#pragma unroll 16
for(int k=0;k<16;k++) outpHash[k] = hash[k];
} }
#pragma unroll 32
for(int k=0;k<32;k++)
a[k] = t[k];
} }
} }
__global__ void quark_groestl512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
{
#if USE_SHARED
extern __shared__ char mixtabs[];
if (threadIdx.x < 256) __global__ void __launch_bounds__(256, 4)
quark_doublegroestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
{ {
*((uint32_t*)mixtabs + ( threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x); int thread = (blockDim.x * blockIdx.x + threadIdx.x)>>2;
*((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x);
*((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x);
*((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x);
*((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x);
*((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x);
*((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x);
*((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x);
}
__syncthreads();
#endif
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
{ {
// GROESTL // GROESTL
uint32_t message[32]; uint32_t message[8];
uint32_t state[32]; uint32_t state[8];
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce; int hashPosition = nounce - startNounce;
uint32_t *inpHash = &g_hash[16 * hashPosition]; uint32_t *inpHash = &g_hash[hashPosition<<4];
#pragma unroll 16 #pragma unroll 4
for(int k=0;k<16;k++) message[k] = inpHash[k]; for(int k=0;k<4;k++) message[k] = inpHash[(k<<2)+(threadIdx.x&0x03)];
#pragma unroll 14 #pragma unroll 4
for(int k=1;k<15;k++) for(int k=4;k<8;k++) message[k] = 0;
message[k+16] = 0;
message[16] = 0x80;
message[31] = 0x01000000;
#pragma unroll 32
for(int u=0;u<32;u++) state[u] = message[u];
state[31] ^= 0x20000;
// Perm
#if USE_SHARED
quark_groestl512_perm_P(state, mixtabs);
state[31] ^= 0x20000;
quark_groestl512_perm_Q(message, mixtabs);
#else
quark_groestl512_perm_P(state, NULL);
state[31] ^= 0x20000;
quark_groestl512_perm_Q(message, NULL);
#endif
#pragma unroll 32
for(int u=0;u<32;u++) state[u] ^= message[u];
#pragma unroll 32
for(int u=0;u<32;u++) message[u] = state[u];
#if USE_SHARED
quark_groestl512_perm_P(message, mixtabs);
#else
quark_groestl512_perm_P(message, NULL);
#endif
#pragma unroll 32
for(int u=0;u<32;u++) state[u] ^= message[u];
// Erzeugten Hash rausschreiben
uint32_t *outpHash = &g_hash[16 * hashPosition];
#pragma unroll 16 if ((threadIdx.x&0x03) == 0) message[4] = 0x80;
for(int k=0;k<16;k++) outpHash[k] = state[k+16]; if ((threadIdx.x&0x03) == 3) message[7] = 0x01000000;
uint32_t msgBitsliced[8];
to_bitslice_quad(message, msgBitsliced);
for (int round=0; round<2; round++)
{
groestl512_progressMessage_quad(state, msgBitsliced);
if (round < 1)
{
// Verkettung zweier Runden inclusive Padding.
msgBitsliced[ 0] = __byte_perm(state[ 0], 0x00800100, 0x4341 + (((threadIdx.x%4)==3)<<13));
msgBitsliced[ 1] = __byte_perm(state[ 1], 0x00800100, 0x4341);
msgBitsliced[ 2] = __byte_perm(state[ 2], 0x00800100, 0x4341);
msgBitsliced[ 3] = __byte_perm(state[ 3], 0x00800100, 0x4341);
msgBitsliced[ 4] = __byte_perm(state[ 4], 0x00800100, 0x4341);
msgBitsliced[ 5] = __byte_perm(state[ 5], 0x00800100, 0x4341);
msgBitsliced[ 6] = __byte_perm(state[ 6], 0x00800100, 0x4341);
msgBitsliced[ 7] = __byte_perm(state[ 7], 0x00800100, 0x4341 + (((threadIdx.x%4)==0)<<4));
} }
} }
#define texDef(texname, texmem, texsource, texsize) \ // Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash
unsigned int *texmem; \ uint32_t *outpHash = &g_hash[hashPosition<<4];
cudaMalloc(&texmem, texsize); \ uint32_t hash[16];
cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ from_bitslice_quad(state, hash);
texname.normalized = 0; \
texname.filterMode = cudaFilterModePoint; \ if ((threadIdx.x & 0x03) == 0)
texname.addressMode[0] = cudaAddressModeClamp; \ {
{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \ #pragma unroll 16
cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \ for(int k=0;k<16;k++) outpHash[k] = hash[k];
}
}
}
// Setup-Funktionen // Setup-Funktionen
__host__ void quark_groestl512_cpu_init(int thr_id, int threads) __host__ void quark_groestl512_cpu_init(int thr_id, int threads)
{ {
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]); cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
// Texturen mit obigem Makro initialisieren
texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
} }
__host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) __host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{ {
// Compute 3.5 und 5.x Geräte am besten mit 768 Threads ansteuern, int threadsperblock = 256;
// alle anderen mit 512 Threads.
int threadsperblock = ((props[thr_id].major == 3 && props[thr_id].minor == 5) || props[thr_id].major > 3) ? 768 : 512; // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
// mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
const int factor = 4;
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
dim3 block(threadsperblock); dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
#if USE_SHARED
size_t shared_size = 8 * 256 * sizeof(uint32_t);
#else
size_t shared_size = 0; size_t shared_size = 0;
#endif
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); quark_groestl512_gpu_hash_64_quad<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
//fprintf(stderr, "ThrID: %d\n", thr_id);
quark_groestl512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
// Strategisches Sleep Kommando zur Senkung der CPU Last // Strategisches Sleep Kommando zur Senkung der CPU Last
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
@ -372,25 +153,20 @@ __host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t sta
__host__ void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) __host__ void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{ {
// Compute 3.5 und 5.x Geräte am besten mit 768 Threads ansteuern, int threadsperblock = 256;
// alle anderen mit 512 Threads.
int threadsperblock = ((props[thr_id].major == 3 && props[thr_id].minor == 5) || props[thr_id].major > 3) ? 768 : 512; // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
// mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
const int factor = 4;
// berechne wie viele Thread Blocks wir brauchen // berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
dim3 block(threadsperblock); dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
#if USE_SHARED
size_t shared_size = 8 * 256 * sizeof(uint32_t);
#else
size_t shared_size = 0; size_t shared_size = 0;
#endif
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size); quark_doublegroestl512_gpu_hash_64_quad<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
//fprintf(stderr, "ThrID: %d\n", thr_id);
quark_groestl512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
quark_groestl512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
// Strategisches Sleep Kommando zur Senkung der CPU Last // Strategisches Sleep Kommando zur Senkung der CPU Last
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);

2
quark/cuda_quark_keccak512.cu

@ -175,8 +175,6 @@ __host__ void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t star
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
quark_keccak512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); quark_keccak512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
} }

1
quark/cuda_skein512.cu

@ -442,7 +442,6 @@ __host__ void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t start
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
quark_skein512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); quark_skein512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
// Strategisches Sleep Kommando zur Senkung der CPU Last // Strategisches Sleep Kommando zur Senkung der CPU Last

1
quark/quarkcoin.cu

@ -157,7 +157,6 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
{ {
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
// TODO: entfernen für eine Release! Ist nur zum Testen!
if (opt_benchmark) if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff; ((uint32_t*)ptarget)[7] = 0x0000ff;

2
x11/cuda_x11_cubehash512.cu

@ -307,8 +307,6 @@ __host__ void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t star
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
x11_cubehash512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); x11_cubehash512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
} }

2
x11/cuda_x11_echo.cu

@ -225,8 +225,6 @@ __host__ void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNou
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
x11_echo512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); x11_echo512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
} }

2
x11/cuda_x11_luffa512.cu

@ -376,8 +376,6 @@ __host__ void x11_luffa512_cpu_hash_64(int thr_id, int threads, uint32_t startNo
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
x11_luffa512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); x11_luffa512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
} }

2
x11/cuda_x11_shavite512.cu

@ -1372,8 +1372,6 @@ __host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t start
// Größe des dynamischen Shared Memory Bereichs // Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0; size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
x11_shavite512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); x11_shavite512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id); MyStreamSynchronize(NULL, order, thr_id);
} }

5
x11/x11.cu

@ -162,11 +162,8 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
{ {
const uint32_t first_nonce = pdata[19]; const uint32_t first_nonce = pdata[19];
// TODO: entfernen für eine Release! Ist nur zum Testen! if (opt_benchmark)
if (opt_benchmark) {
((uint32_t*)ptarget)[7] = 0x0000ff; ((uint32_t*)ptarget)[7] = 0x0000ff;
pdata[17] = 0;
}
const uint32_t Htarg = ptarget[7]; const uint32_t Htarg = ptarget[7];

Loading…
Cancel
Save