Browse Source

bump to revision V1.1 with Killer Groestl

2upstream v1.1
Christian Buchner 10 years ago
parent
commit
3b21069504
  1. 2
      JHA/cuda_jha_keccak512.cu
  2. 16
      JHA/jackpotcoin.cu
  3. 13
      README.txt
  4. 418
      bitslice_transformations_quad.cu
  5. 12
      ccminer.vcxproj
  6. 6
      ccminer.vcxproj.filters
  7. 2
      configure.ac
  8. 6
      cpu-miner.c
  9. 4
      cpuminer-config.h
  10. 537
      cuda_groestlcoin.cu
  11. 665
      cuda_myriadgroestl.cu
  12. 1
      cuda_nist5.cu
  13. 315
      groestl_functions_quad.cu
  14. 227
      groestlcoin.cpp
  15. 2
      heavy/cuda_blake512.cu
  16. 2
      heavy/cuda_combine.cu
  17. 2
      heavy/cuda_groestl512.cu
  18. 2
      heavy/cuda_hefty1.cu
  19. 1
      heavy/cuda_keccak512.cu
  20. 1
      heavy/cuda_sha256.cu
  21. 10
      myriadgroestl.cpp
  22. 1
      quark/animecoin.cu
  23. 4
      quark/cuda_bmw512.cu
  24. 2
      quark/cuda_jh512.cu
  25. 4
      quark/cuda_quark_blake512.cu
  26. 2
      quark/cuda_quark_checkhash.cu
  27. 414
      quark/cuda_quark_groestl512.cu
  28. 2
      quark/cuda_quark_keccak512.cu
  29. 1
      quark/cuda_skein512.cu
  30. 1
      quark/quarkcoin.cu
  31. 2
      x11/cuda_x11_cubehash512.cu
  32. 2
      x11/cuda_x11_echo.cu
  33. 2
      x11/cuda_x11_luffa512.cu
  34. 2
      x11/cuda_x11_shavite512.cu
  35. 5
      x11/x11.cu

2
JHA/cuda_jha_keccak512.cu

@ -567,8 +567,6 @@ __host__ void jackpot_keccak512_cpu_hash(int thr_id, int threads, uint32_t start @@ -567,8 +567,6 @@ __host__ void jackpot_keccak512_cpu_hash(int thr_id, int threads, uint32_t start
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
jackpot_keccak512_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash);
MyStreamSynchronize(NULL, order, thr_id);
}

16
JHA/jackpotcoin.cu

@ -101,14 +101,12 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, @@ -101,14 +101,12 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
{
const uint32_t first_nonce = pdata[19];
// TODO: entfernen für eine Release! Ist nur zum Testen!
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;
const uint32_t Htarg = ptarget[7];
const int throughput = 256*4096*4; // 100;
//const int throughput = 256*256*2+100; // 100;
static bool init[8] = {0,0,0,0,0,0,0,0};
if (!init[thr_id])
@ -167,16 +165,18 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, @@ -167,16 +165,18 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
}
// Runde 2 (ohne Gröstl)
// Runde 3 (komplett)
// jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01)
jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
d_branch1Nonces[thr_id], &nrm1,
d_branch3Nonces[thr_id], &nrm3,
d_branch2Nonces[thr_id], &nrm2,
order++);
// verfolge den skein-pfad weiter
quark_skein512_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
if (nrm1+nrm2 == nrm3) {
quark_groestl512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
}
// jackpotNonces in branch1/2 aufsplitten gemäss if (hash[0] & 0x01)
jackpot_compactTest_cpu_hash_64(thr_id, nrm3, pdata[19], d_hash[thr_id], d_branch3Nonces[thr_id],
@ -226,7 +226,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, @@ -226,7 +226,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {
pdata[19] = foundNonce;
*hashes_done = (foundNonce - first_nonce + 1)/4;
*hashes_done = (foundNonce - first_nonce + 1)/2;
//applog(LOG_INFO, "GPU #%d: result for nonce $%08X does validate on CPU (%d rounds)!", thr_id, foundNonce, rounds);
return 1;
} else {
@ -238,6 +238,6 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata, @@ -238,6 +238,6 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
*hashes_done = (pdata[19] - first_nonce + 1)/4;
*hashes_done = (pdata[19] - first_nonce + 1)/2;
return 0;
}

13
README.txt

@ -1,5 +1,5 @@ @@ -1,5 +1,5 @@
ccMiner release 1.0 (May 10th 2014) - "Did anyone say X11?"
ccMiner release 1.1 (June 14th 2014) - "Killer Groestl!"
-------------------------------------------------------------
***************************************************************
@ -30,13 +30,12 @@ FugueCoin @@ -30,13 +30,12 @@ FugueCoin
GroestlCoin & Myriad-Groestl
JackpotCoin
QuarkCoin family & AnimeCoin
TalkCoin
DarkCoin and other X11 coins
where some of these coins have a VERY NOTABLE nVidia advantage
over competing AMD (OpenCL) implementations.
X11 algo is being worked on. It will be released when we
have achieved a nice nVidia advantage.
We did not take a big effort on improving usability, so please set
your parameters carefuly.
@ -140,6 +139,12 @@ features. @@ -140,6 +139,12 @@ features.
>>> RELEASE HISTORY <<<
June 14th 2014 released Killer Groestl quad version which I deem
sufficiently hard to port over to AMD. It isn't
the fastest option for Compute 3.5 and 5.0 cards,
but it is still much faster than the table based
versions.
May 10th 2014 added X11, but without the bells & whistles
(no killer Groestl, SIMD hash quite slow still)

418
bitslice_transformations_quad.cu

@ -0,0 +1,418 @@ @@ -0,0 +1,418 @@
__device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *output)
{
int n = threadIdx.x % 4;
uint32_t other[8];
#pragma unroll 8
for (int i = 0; i < 8; i++) {
input[i] =__shfl((int)input[i], n ^ (3*(n >=1 && n <=2)), 4);
other[i] =__shfl((int)input[i], (threadIdx.x + 1) % 4, 4);
input[i] = __shfl((int)input[i], threadIdx.x & 2, 4);
other[i] = __shfl((int)other[i], threadIdx.x & 2, 4);
if (threadIdx.x & 1) {
input[i] = __byte_perm(input[i], 0, 0x1032);
other[i] = __byte_perm(other[i], 0, 0x1032);
}
output[i] = 0;
}
output[ 0] |= (input[ 0] & 0x00000001);
output[ 0] |= ((other[ 0] & 0x00000001) << 1);
output[ 0] |= ((input[ 1] & 0x00000001) << 2);
output[ 0] |= ((other[ 1] & 0x00000001) << 3);
output[ 0] |= ((input[ 2] & 0x00000001) << 4);
output[ 0] |= ((other[ 2] & 0x00000001) << 5);
output[ 0] |= ((input[ 3] & 0x00000001) << 6);
output[ 0] |= ((other[ 3] & 0x00000001) << 7);
output[ 0] |= ((input[ 4] & 0x00000001) << 8);
output[ 0] |= ((other[ 4] & 0x00000001) << 9);
output[ 0] |= ((input[ 5] & 0x00000001) <<10);
output[ 0] |= ((other[ 5] & 0x00000001) <<11);
output[ 0] |= ((input[ 6] & 0x00000001) <<12);
output[ 0] |= ((other[ 6] & 0x00000001) <<13);
output[ 0] |= ((input[ 7] & 0x00000001) <<14);
output[ 0] |= ((other[ 7] & 0x00000001) <<15);
output[ 0] |= ((input[ 0] & 0x00000100) << 8);
output[ 0] |= ((other[ 0] & 0x00000100) << 9);
output[ 0] |= ((input[ 1] & 0x00000100) <<10);
output[ 0] |= ((other[ 1] & 0x00000100) <<11);
output[ 0] |= ((input[ 2] & 0x00000100) <<12);
output[ 0] |= ((other[ 2] & 0x00000100) <<13);
output[ 0] |= ((input[ 3] & 0x00000100) <<14);
output[ 0] |= ((other[ 3] & 0x00000100) <<15);
output[ 0] |= ((input[ 4] & 0x00000100) <<16);
output[ 0] |= ((other[ 4] & 0x00000100) <<17);
output[ 0] |= ((input[ 5] & 0x00000100) <<18);
output[ 0] |= ((other[ 5] & 0x00000100) <<19);
output[ 0] |= ((input[ 6] & 0x00000100) <<20);
output[ 0] |= ((other[ 6] & 0x00000100) <<21);
output[ 0] |= ((input[ 7] & 0x00000100) <<22);
output[ 0] |= ((other[ 7] & 0x00000100) <<23);
output[ 1] |= ((input[ 0] & 0x00000002) >> 1);
output[ 1] |= (other[ 0] & 0x00000002);
output[ 1] |= ((input[ 1] & 0x00000002) << 1);
output[ 1] |= ((other[ 1] & 0x00000002) << 2);
output[ 1] |= ((input[ 2] & 0x00000002) << 3);
output[ 1] |= ((other[ 2] & 0x00000002) << 4);
output[ 1] |= ((input[ 3] & 0x00000002) << 5);
output[ 1] |= ((other[ 3] & 0x00000002) << 6);
output[ 1] |= ((input[ 4] & 0x00000002) << 7);
output[ 1] |= ((other[ 4] & 0x00000002) << 8);
output[ 1] |= ((input[ 5] & 0x00000002) << 9);
output[ 1] |= ((other[ 5] & 0x00000002) <<10);
output[ 1] |= ((input[ 6] & 0x00000002) <<11);
output[ 1] |= ((other[ 6] & 0x00000002) <<12);
output[ 1] |= ((input[ 7] & 0x00000002) <<13);
output[ 1] |= ((other[ 7] & 0x00000002) <<14);
output[ 1] |= ((input[ 0] & 0x00000200) << 7);
output[ 1] |= ((other[ 0] & 0x00000200) << 8);
output[ 1] |= ((input[ 1] & 0x00000200) << 9);
output[ 1] |= ((other[ 1] & 0x00000200) <<10);
output[ 1] |= ((input[ 2] & 0x00000200) <<11);
output[ 1] |= ((other[ 2] & 0x00000200) <<12);
output[ 1] |= ((input[ 3] & 0x00000200) <<13);
output[ 1] |= ((other[ 3] & 0x00000200) <<14);
output[ 1] |= ((input[ 4] & 0x00000200) <<15);
output[ 1] |= ((other[ 4] & 0x00000200) <<16);
output[ 1] |= ((input[ 5] & 0x00000200) <<17);
output[ 1] |= ((other[ 5] & 0x00000200) <<18);
output[ 1] |= ((input[ 6] & 0x00000200) <<19);
output[ 1] |= ((other[ 6] & 0x00000200) <<20);
output[ 1] |= ((input[ 7] & 0x00000200) <<21);
output[ 1] |= ((other[ 7] & 0x00000200) <<22);
output[ 2] |= ((input[ 0] & 0x00000004) >> 2);
output[ 2] |= ((other[ 0] & 0x00000004) >> 1);
output[ 2] |= (input[ 1] & 0x00000004);
output[ 2] |= ((other[ 1] & 0x00000004) << 1);
output[ 2] |= ((input[ 2] & 0x00000004) << 2);
output[ 2] |= ((other[ 2] & 0x00000004) << 3);
output[ 2] |= ((input[ 3] & 0x00000004) << 4);
output[ 2] |= ((other[ 3] & 0x00000004) << 5);
output[ 2] |= ((input[ 4] & 0x00000004) << 6);
output[ 2] |= ((other[ 4] & 0x00000004) << 7);
output[ 2] |= ((input[ 5] & 0x00000004) << 8);
output[ 2] |= ((other[ 5] & 0x00000004) << 9);
output[ 2] |= ((input[ 6] & 0x00000004) <<10);
output[ 2] |= ((other[ 6] & 0x00000004) <<11);
output[ 2] |= ((input[ 7] & 0x00000004) <<12);
output[ 2] |= ((other[ 7] & 0x00000004) <<13);
output[ 2] |= ((input[ 0] & 0x00000400) << 6);
output[ 2] |= ((other[ 0] & 0x00000400) << 7);
output[ 2] |= ((input[ 1] & 0x00000400) << 8);
output[ 2] |= ((other[ 1] & 0x00000400) << 9);
output[ 2] |= ((input[ 2] & 0x00000400) <<10);
output[ 2] |= ((other[ 2] & 0x00000400) <<11);
output[ 2] |= ((input[ 3] & 0x00000400) <<12);
output[ 2] |= ((other[ 3] & 0x00000400) <<13);
output[ 2] |= ((input[ 4] & 0x00000400) <<14);
output[ 2] |= ((other[ 4] & 0x00000400) <<15);
output[ 2] |= ((input[ 5] & 0x00000400) <<16);
output[ 2] |= ((other[ 5] & 0x00000400) <<17);
output[ 2] |= ((input[ 6] & 0x00000400) <<18);
output[ 2] |= ((other[ 6] & 0x00000400) <<19);
output[ 2] |= ((input[ 7] & 0x00000400) <<20);
output[ 2] |= ((other[ 7] & 0x00000400) <<21);
output[ 3] |= ((input[ 0] & 0x00000008) >> 3);
output[ 3] |= ((other[ 0] & 0x00000008) >> 2);
output[ 3] |= ((input[ 1] & 0x00000008) >> 1);
output[ 3] |= (other[ 1] & 0x00000008);
output[ 3] |= ((input[ 2] & 0x00000008) << 1);
output[ 3] |= ((other[ 2] & 0x00000008) << 2);
output[ 3] |= ((input[ 3] & 0x00000008) << 3);
output[ 3] |= ((other[ 3] & 0x00000008) << 4);
output[ 3] |= ((input[ 4] & 0x00000008) << 5);
output[ 3] |= ((other[ 4] & 0x00000008) << 6);
output[ 3] |= ((input[ 5] & 0x00000008) << 7);
output[ 3] |= ((other[ 5] & 0x00000008) << 8);
output[ 3] |= ((input[ 6] & 0x00000008) << 9);
output[ 3] |= ((other[ 6] & 0x00000008) <<10);
output[ 3] |= ((input[ 7] & 0x00000008) <<11);
output[ 3] |= ((other[ 7] & 0x00000008) <<12);
output[ 3] |= ((input[ 0] & 0x00000800) << 5);
output[ 3] |= ((other[ 0] & 0x00000800) << 6);
output[ 3] |= ((input[ 1] & 0x00000800) << 7);
output[ 3] |= ((other[ 1] & 0x00000800) << 8);
output[ 3] |= ((input[ 2] & 0x00000800) << 9);
output[ 3] |= ((other[ 2] & 0x00000800) <<10);
output[ 3] |= ((input[ 3] & 0x00000800) <<11);
output[ 3] |= ((other[ 3] & 0x00000800) <<12);
output[ 3] |= ((input[ 4] & 0x00000800) <<13);
output[ 3] |= ((other[ 4] & 0x00000800) <<14);
output[ 3] |= ((input[ 5] & 0x00000800) <<15);
output[ 3] |= ((other[ 5] & 0x00000800) <<16);
output[ 3] |= ((input[ 6] & 0x00000800) <<17);
output[ 3] |= ((other[ 6] & 0x00000800) <<18);
output[ 3] |= ((input[ 7] & 0x00000800) <<19);
output[ 3] |= ((other[ 7] & 0x00000800) <<20);
output[ 4] |= ((input[ 0] & 0x00000010) >> 4);
output[ 4] |= ((other[ 0] & 0x00000010) >> 3);
output[ 4] |= ((input[ 1] & 0x00000010) >> 2);
output[ 4] |= ((other[ 1] & 0x00000010) >> 1);
output[ 4] |= (input[ 2] & 0x00000010);
output[ 4] |= ((other[ 2] & 0x00000010) << 1);
output[ 4] |= ((input[ 3] & 0x00000010) << 2);
output[ 4] |= ((other[ 3] & 0x00000010) << 3);
output[ 4] |= ((input[ 4] & 0x00000010) << 4);
output[ 4] |= ((other[ 4] & 0x00000010) << 5);
output[ 4] |= ((input[ 5] & 0x00000010) << 6);
output[ 4] |= ((other[ 5] & 0x00000010) << 7);
output[ 4] |= ((input[ 6] & 0x00000010) << 8);
output[ 4] |= ((other[ 6] & 0x00000010) << 9);
output[ 4] |= ((input[ 7] & 0x00000010) <<10);
output[ 4] |= ((other[ 7] & 0x00000010) <<11);
output[ 4] |= ((input[ 0] & 0x00001000) << 4);
output[ 4] |= ((other[ 0] & 0x00001000) << 5);
output[ 4] |= ((input[ 1] & 0x00001000) << 6);
output[ 4] |= ((other[ 1] & 0x00001000) << 7);
output[ 4] |= ((input[ 2] & 0x00001000) << 8);
output[ 4] |= ((other[ 2] & 0x00001000) << 9);
output[ 4] |= ((input[ 3] & 0x00001000) <<10);
output[ 4] |= ((other[ 3] & 0x00001000) <<11);
output[ 4] |= ((input[ 4] & 0x00001000) <<12);
output[ 4] |= ((other[ 4] & 0x00001000) <<13);
output[ 4] |= ((input[ 5] & 0x00001000) <<14);
output[ 4] |= ((other[ 5] & 0x00001000) <<15);
output[ 4] |= ((input[ 6] & 0x00001000) <<16);
output[ 4] |= ((other[ 6] & 0x00001000) <<17);
output[ 4] |= ((input[ 7] & 0x00001000) <<18);
output[ 4] |= ((other[ 7] & 0x00001000) <<19);
output[ 5] |= ((input[ 0] & 0x00000020) >> 5);
output[ 5] |= ((other[ 0] & 0x00000020) >> 4);
output[ 5] |= ((input[ 1] & 0x00000020) >> 3);
output[ 5] |= ((other[ 1] & 0x00000020) >> 2);
output[ 5] |= ((input[ 2] & 0x00000020) >> 1);
output[ 5] |= (other[ 2] & 0x00000020);
output[ 5] |= ((input[ 3] & 0x00000020) << 1);
output[ 5] |= ((other[ 3] & 0x00000020) << 2);
output[ 5] |= ((input[ 4] & 0x00000020) << 3);
output[ 5] |= ((other[ 4] & 0x00000020) << 4);
output[ 5] |= ((input[ 5] & 0x00000020) << 5);
output[ 5] |= ((other[ 5] & 0x00000020) << 6);
output[ 5] |= ((input[ 6] & 0x00000020) << 7);
output[ 5] |= ((other[ 6] & 0x00000020) << 8);
output[ 5] |= ((input[ 7] & 0x00000020) << 9);
output[ 5] |= ((other[ 7] & 0x00000020) <<10);
output[ 5] |= ((input[ 0] & 0x00002000) << 3);
output[ 5] |= ((other[ 0] & 0x00002000) << 4);
output[ 5] |= ((input[ 1] & 0x00002000) << 5);
output[ 5] |= ((other[ 1] & 0x00002000) << 6);
output[ 5] |= ((input[ 2] & 0x00002000) << 7);
output[ 5] |= ((other[ 2] & 0x00002000) << 8);
output[ 5] |= ((input[ 3] & 0x00002000) << 9);
output[ 5] |= ((other[ 3] & 0x00002000) <<10);
output[ 5] |= ((input[ 4] & 0x00002000) <<11);
output[ 5] |= ((other[ 4] & 0x00002000) <<12);
output[ 5] |= ((input[ 5] & 0x00002000) <<13);
output[ 5] |= ((other[ 5] & 0x00002000) <<14);
output[ 5] |= ((input[ 6] & 0x00002000) <<15);
output[ 5] |= ((other[ 6] & 0x00002000) <<16);
output[ 5] |= ((input[ 7] & 0x00002000) <<17);
output[ 5] |= ((other[ 7] & 0x00002000) <<18);
output[ 6] |= ((input[ 0] & 0x00000040) >> 6);
output[ 6] |= ((other[ 0] & 0x00000040) >> 5);
output[ 6] |= ((input[ 1] & 0x00000040) >> 4);
output[ 6] |= ((other[ 1] & 0x00000040) >> 3);
output[ 6] |= ((input[ 2] & 0x00000040) >> 2);
output[ 6] |= ((other[ 2] & 0x00000040) >> 1);
output[ 6] |= (input[ 3] & 0x00000040);
output[ 6] |= ((other[ 3] & 0x00000040) << 1);
output[ 6] |= ((input[ 4] & 0x00000040) << 2);
output[ 6] |= ((other[ 4] & 0x00000040) << 3);
output[ 6] |= ((input[ 5] & 0x00000040) << 4);
output[ 6] |= ((other[ 5] & 0x00000040) << 5);
output[ 6] |= ((input[ 6] & 0x00000040) << 6);
output[ 6] |= ((other[ 6] & 0x00000040) << 7);
output[ 6] |= ((input[ 7] & 0x00000040) << 8);
output[ 6] |= ((other[ 7] & 0x00000040) << 9);
output[ 6] |= ((input[ 0] & 0x00004000) << 2);
output[ 6] |= ((other[ 0] & 0x00004000) << 3);
output[ 6] |= ((input[ 1] & 0x00004000) << 4);
output[ 6] |= ((other[ 1] & 0x00004000) << 5);
output[ 6] |= ((input[ 2] & 0x00004000) << 6);
output[ 6] |= ((other[ 2] & 0x00004000) << 7);
output[ 6] |= ((input[ 3] & 0x00004000) << 8);
output[ 6] |= ((other[ 3] & 0x00004000) << 9);
output[ 6] |= ((input[ 4] & 0x00004000) <<10);
output[ 6] |= ((other[ 4] & 0x00004000) <<11);
output[ 6] |= ((input[ 5] & 0x00004000) <<12);
output[ 6] |= ((other[ 5] & 0x00004000) <<13);
output[ 6] |= ((input[ 6] & 0x00004000) <<14);
output[ 6] |= ((other[ 6] & 0x00004000) <<15);
output[ 6] |= ((input[ 7] & 0x00004000) <<16);
output[ 6] |= ((other[ 7] & 0x00004000) <<17);
output[ 7] |= ((input[ 0] & 0x00000080) >> 7);
output[ 7] |= ((other[ 0] & 0x00000080) >> 6);
output[ 7] |= ((input[ 1] & 0x00000080) >> 5);
output[ 7] |= ((other[ 1] & 0x00000080) >> 4);
output[ 7] |= ((input[ 2] & 0x00000080) >> 3);
output[ 7] |= ((other[ 2] & 0x00000080) >> 2);
output[ 7] |= ((input[ 3] & 0x00000080) >> 1);
output[ 7] |= (other[ 3] & 0x00000080);
output[ 7] |= ((input[ 4] & 0x00000080) << 1);
output[ 7] |= ((other[ 4] & 0x00000080) << 2);
output[ 7] |= ((input[ 5] & 0x00000080) << 3);
output[ 7] |= ((other[ 5] & 0x00000080) << 4);
output[ 7] |= ((input[ 6] & 0x00000080) << 5);
output[ 7] |= ((other[ 6] & 0x00000080) << 6);
output[ 7] |= ((input[ 7] & 0x00000080) << 7);
output[ 7] |= ((other[ 7] & 0x00000080) << 8);
output[ 7] |= ((input[ 0] & 0x00008000) << 1);
output[ 7] |= ((other[ 0] & 0x00008000) << 2);
output[ 7] |= ((input[ 1] & 0x00008000) << 3);
output[ 7] |= ((other[ 1] & 0x00008000) << 4);
output[ 7] |= ((input[ 2] & 0x00008000) << 5);
output[ 7] |= ((other[ 2] & 0x00008000) << 6);
output[ 7] |= ((input[ 3] & 0x00008000) << 7);
output[ 7] |= ((other[ 3] & 0x00008000) << 8);
output[ 7] |= ((input[ 4] & 0x00008000) << 9);
output[ 7] |= ((other[ 4] & 0x00008000) <<10);
output[ 7] |= ((input[ 5] & 0x00008000) <<11);
output[ 7] |= ((other[ 5] & 0x00008000) <<12);
output[ 7] |= ((input[ 6] & 0x00008000) <<13);
output[ 7] |= ((other[ 6] & 0x00008000) <<14);
output[ 7] |= ((input[ 7] & 0x00008000) <<15);
output[ 7] |= ((other[ 7] & 0x00008000) <<16);
}
__device__ __forceinline__ void from_bitslice_quad(uint32_t *input, uint32_t *output)
{
#pragma unroll 8
for (int i=0; i < 16; i+=2) output[i] = 0;
output[ 0] |= ((input[ 0] & 0x00000100) >> 8);
output[ 0] |= ((input[ 1] & 0x00000100) >> 7);
output[ 0] |= ((input[ 2] & 0x00000100) >> 6);
output[ 0] |= ((input[ 3] & 0x00000100) >> 5);
output[ 0] |= ((input[ 4] & 0x00000100) >> 4);
output[ 0] |= ((input[ 5] & 0x00000100) >> 3);
output[ 0] |= ((input[ 6] & 0x00000100) >> 2);
output[ 0] |= ((input[ 7] & 0x00000100) >> 1);
output[ 0] |= ((input[ 0] & 0x01000000) >>16);
output[ 0] |= ((input[ 1] & 0x01000000) >>15);
output[ 0] |= ((input[ 2] & 0x01000000) >>14);
output[ 0] |= ((input[ 3] & 0x01000000) >>13);
output[ 0] |= ((input[ 4] & 0x01000000) >>12);
output[ 0] |= ((input[ 5] & 0x01000000) >>11);
output[ 0] |= ((input[ 6] & 0x01000000) >>10);
output[ 0] |= ((input[ 7] & 0x01000000) >> 9);
output[ 2] |= ((input[ 0] & 0x00000200) >> 9);
output[ 2] |= ((input[ 1] & 0x00000200) >> 8);
output[ 2] |= ((input[ 2] & 0x00000200) >> 7);
output[ 2] |= ((input[ 3] & 0x00000200) >> 6);
output[ 2] |= ((input[ 4] & 0x00000200) >> 5);
output[ 2] |= ((input[ 5] & 0x00000200) >> 4);
output[ 2] |= ((input[ 6] & 0x00000200) >> 3);
output[ 2] |= ((input[ 7] & 0x00000200) >> 2);
output[ 2] |= ((input[ 0] & 0x02000000) >>17);
output[ 2] |= ((input[ 1] & 0x02000000) >>16);
output[ 2] |= ((input[ 2] & 0x02000000) >>15);
output[ 2] |= ((input[ 3] & 0x02000000) >>14);
output[ 2] |= ((input[ 4] & 0x02000000) >>13);
output[ 2] |= ((input[ 5] & 0x02000000) >>12);
output[ 2] |= ((input[ 6] & 0x02000000) >>11);
output[ 2] |= ((input[ 7] & 0x02000000) >>10);
output[ 4] |= ((input[ 0] & 0x00000400) >>10);
output[ 4] |= ((input[ 1] & 0x00000400) >> 9);
output[ 4] |= ((input[ 2] & 0x00000400) >> 8);
output[ 4] |= ((input[ 3] & 0x00000400) >> 7);
output[ 4] |= ((input[ 4] & 0x00000400) >> 6);
output[ 4] |= ((input[ 5] & 0x00000400) >> 5);
output[ 4] |= ((input[ 6] & 0x00000400) >> 4);
output[ 4] |= ((input[ 7] & 0x00000400) >> 3);
output[ 4] |= ((input[ 0] & 0x04000000) >>18);
output[ 4] |= ((input[ 1] & 0x04000000) >>17);
output[ 4] |= ((input[ 2] & 0x04000000) >>16);
output[ 4] |= ((input[ 3] & 0x04000000) >>15);
output[ 4] |= ((input[ 4] & 0x04000000) >>14);
output[ 4] |= ((input[ 5] & 0x04000000) >>13);
output[ 4] |= ((input[ 6] & 0x04000000) >>12);
output[ 4] |= ((input[ 7] & 0x04000000) >>11);
output[ 6] |= ((input[ 0] & 0x00000800) >>11);
output[ 6] |= ((input[ 1] & 0x00000800) >>10);
output[ 6] |= ((input[ 2] & 0x00000800) >> 9);
output[ 6] |= ((input[ 3] & 0x00000800) >> 8);
output[ 6] |= ((input[ 4] & 0x00000800) >> 7);
output[ 6] |= ((input[ 5] & 0x00000800) >> 6);
output[ 6] |= ((input[ 6] & 0x00000800) >> 5);
output[ 6] |= ((input[ 7] & 0x00000800) >> 4);
output[ 6] |= ((input[ 0] & 0x08000000) >>19);
output[ 6] |= ((input[ 1] & 0x08000000) >>18);
output[ 6] |= ((input[ 2] & 0x08000000) >>17);
output[ 6] |= ((input[ 3] & 0x08000000) >>16);
output[ 6] |= ((input[ 4] & 0x08000000) >>15);
output[ 6] |= ((input[ 5] & 0x08000000) >>14);
output[ 6] |= ((input[ 6] & 0x08000000) >>13);
output[ 6] |= ((input[ 7] & 0x08000000) >>12);
output[ 8] |= ((input[ 0] & 0x00001000) >>12);
output[ 8] |= ((input[ 1] & 0x00001000) >>11);
output[ 8] |= ((input[ 2] & 0x00001000) >>10);
output[ 8] |= ((input[ 3] & 0x00001000) >> 9);
output[ 8] |= ((input[ 4] & 0x00001000) >> 8);
output[ 8] |= ((input[ 5] & 0x00001000) >> 7);
output[ 8] |= ((input[ 6] & 0x00001000) >> 6);
output[ 8] |= ((input[ 7] & 0x00001000) >> 5);
output[ 8] |= ((input[ 0] & 0x10000000) >>20);
output[ 8] |= ((input[ 1] & 0x10000000) >>19);
output[ 8] |= ((input[ 2] & 0x10000000) >>18);
output[ 8] |= ((input[ 3] & 0x10000000) >>17);
output[ 8] |= ((input[ 4] & 0x10000000) >>16);
output[ 8] |= ((input[ 5] & 0x10000000) >>15);
output[ 8] |= ((input[ 6] & 0x10000000) >>14);
output[ 8] |= ((input[ 7] & 0x10000000) >>13);
output[10] |= ((input[ 0] & 0x00002000) >>13);
output[10] |= ((input[ 1] & 0x00002000) >>12);
output[10] |= ((input[ 2] & 0x00002000) >>11);
output[10] |= ((input[ 3] & 0x00002000) >>10);
output[10] |= ((input[ 4] & 0x00002000) >> 9);
output[10] |= ((input[ 5] & 0x00002000) >> 8);
output[10] |= ((input[ 6] & 0x00002000) >> 7);
output[10] |= ((input[ 7] & 0x00002000) >> 6);
output[10] |= ((input[ 0] & 0x20000000) >>21);
output[10] |= ((input[ 1] & 0x20000000) >>20);
output[10] |= ((input[ 2] & 0x20000000) >>19);
output[10] |= ((input[ 3] & 0x20000000) >>18);
output[10] |= ((input[ 4] & 0x20000000) >>17);
output[10] |= ((input[ 5] & 0x20000000) >>16);
output[10] |= ((input[ 6] & 0x20000000) >>15);
output[10] |= ((input[ 7] & 0x20000000) >>14);
output[12] |= ((input[ 0] & 0x00004000) >>14);
output[12] |= ((input[ 1] & 0x00004000) >>13);
output[12] |= ((input[ 2] & 0x00004000) >>12);
output[12] |= ((input[ 3] & 0x00004000) >>11);
output[12] |= ((input[ 4] & 0x00004000) >>10);
output[12] |= ((input[ 5] & 0x00004000) >> 9);
output[12] |= ((input[ 6] & 0x00004000) >> 8);
output[12] |= ((input[ 7] & 0x00004000) >> 7);
output[12] |= ((input[ 0] & 0x40000000) >>22);
output[12] |= ((input[ 1] & 0x40000000) >>21);
output[12] |= ((input[ 2] & 0x40000000) >>20);
output[12] |= ((input[ 3] & 0x40000000) >>19);
output[12] |= ((input[ 4] & 0x40000000) >>18);
output[12] |= ((input[ 5] & 0x40000000) >>17);
output[12] |= ((input[ 6] & 0x40000000) >>16);
output[12] |= ((input[ 7] & 0x40000000) >>15);
output[14] |= ((input[ 0] & 0x00008000) >>15);
output[14] |= ((input[ 1] & 0x00008000) >>14);
output[14] |= ((input[ 2] & 0x00008000) >>13);
output[14] |= ((input[ 3] & 0x00008000) >>12);
output[14] |= ((input[ 4] & 0x00008000) >>11);
output[14] |= ((input[ 5] & 0x00008000) >>10);
output[14] |= ((input[ 6] & 0x00008000) >> 9);
output[14] |= ((input[ 7] & 0x00008000) >> 8);
output[14] |= ((input[ 0] & 0x80000000) >>23);
output[14] |= ((input[ 1] & 0x80000000) >>22);
output[14] |= ((input[ 2] & 0x80000000) >>21);
output[14] |= ((input[ 3] & 0x80000000) >>20);
output[14] |= ((input[ 4] & 0x80000000) >>19);
output[14] |= ((input[ 5] & 0x80000000) >>18);
output[14] |= ((input[ 6] & 0x80000000) >>17);
output[14] |= ((input[ 7] & 0x80000000) >>16);
#pragma unroll 8
for (int i = 0; i < 16; i+=2) {
if (threadIdx.x & 1) output[i] = __byte_perm(output[i], 0, 0x1032);
output[i] = __byte_perm(output[i], __shfl((int)output[i], (threadIdx.x+1)%4, 4), 0x7610);
output[i+1] = __shfl((int)output[i], (threadIdx.x+2)%4, 4);
if ((threadIdx.x % 4) != 0) output[i] = output[i+1] = 0;
}
}

12
ccminer.vcxproj

@ -287,6 +287,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command> @@ -287,6 +287,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
<ClInclude Include="uint256.h" />
</ItemGroup>
<ItemGroup>
<CudaCompile Include="bitslice_transformations_quad.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</CudaCompile>
<CudaCompile Include="cuda_fugue256.cu">
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
@ -311,6 +317,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command> @@ -311,6 +317,12 @@ copy "$(CudaToolkitBinDir)\cudart*.dll" "$(OutDir)"</Command>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
</CudaCompile>
<CudaCompile Include="groestl_functions_quad.cu">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</CudaCompile>
<CudaCompile Include="heavy\cuda_blake512.cu">
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">-Xptxas "-abi=no -v" %(AdditionalOptions)</AdditionalOptions>

6
ccminer.vcxproj.filters

@ -340,5 +340,11 @@ @@ -340,5 +340,11 @@
<CudaCompile Include="x11\simd_functions.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
<CudaCompile Include="bitslice_transformations_quad.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="groestl_functions_quad.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
</ItemGroup>
</Project>

2
configure.ac

@ -1,4 +1,4 @@ @@ -1,4 +1,4 @@
AC_INIT([ccminer], [2014.05.10])
AC_INIT([ccminer], [2014.06.14])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

6
cpu-miner.c

@ -917,8 +917,8 @@ static void *miner_thread(void *userdata) @@ -917,8 +917,8 @@ static void *miner_thread(void *userdata)
goto out;
}
if (opt_benchmark)
if (++rounds == 1) exit(0);
// if (opt_benchmark)
// if (++rounds == 1) exit(0);
/* record scanhash elapsed time */
gettimeofday(&tv_end, NULL);
@ -1469,7 +1469,7 @@ static void signal_handler(int sig) @@ -1469,7 +1469,7 @@ static void signal_handler(int sig)
}
#endif
#define PROGRAM_VERSION "1.0"
#define PROGRAM_VERSION "1.1"
int main(int argc, char *argv[])
{
struct thr_info *thr;

4
cpuminer-config.h

@ -152,7 +152,7 @@ @@ -152,7 +152,7 @@
#define PACKAGE_NAME "ccminer"
/* Define to the full name and version of this package. */
#define PACKAGE_STRING "ccminer 2014.05.10"
#define PACKAGE_STRING "ccminer 2014.06.14"
/* Define to the one symbol short name of this package. */
#undef PACKAGE_TARNAME
@ -161,7 +161,7 @@ @@ -161,7 +161,7 @@
#undef PACKAGE_URL
/* Define to the version of this package. */
#define PACKAGE_VERSION "2014.05.10"
#define PACKAGE_VERSION "2014.06.14"
/* If using the C implementation of alloca, define if you know the
direction of stack growth for your system; otherwise it will be

537
cuda_groestlcoin.cu

@ -1,4 +1,4 @@ @@ -1,4 +1,4 @@
// Auf Groestlcoin spezialisierte Version von Groestl
// Auf Groestlcoin spezialisierte Version von Groestl inkl. Bitslice
#include <cuda.h>
#include "cuda_runtime.h"
@ -7,9 +7,6 @@ @@ -7,9 +7,6 @@
#include <stdio.h>
#include <memory.h>
// it's unfortunate that this is a compile time constant.
#define MAXWELL_OR_FERMI 1
// aus cpu-miner.c
extern int device_map[8];
@ -18,456 +15,152 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t @@ -18,456 +15,152 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props;
static cudaDeviceProp props[8];
// globaler Speicher für alle HeftyHashes aller Threads
__constant__ uint32_t pTarget[8]; // Single GPU
extern uint32_t *d_resultNonce[8];
__constant__ uint32_t groestlcoin_gpu_msg[32];
#define SPH_C32(x) ((uint32_t)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
#define PC32up(j, r) ((uint32_t)((j) + (r)))
#define PC32dn(j, r) 0
#define QC32up(j, r) 0xFFFFFFFF
#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))
#define B32_0(x) __byte_perm(x, 0, 0x4440)
//((x) & 0xFF)
#define B32_1(x) __byte_perm(x, 0, 0x4441)
//(((x) >> 8) & 0xFF)
#define B32_2(x) __byte_perm(x, 0, 0x4442)
//(((x) >> 16) & 0xFF)
#define B32_3(x) __byte_perm(x, 0, 0x4443)
//((x) >> 24)
#if MAXWELL_OR_FERMI
#define USE_SHARED 1
// Maxwell and Fermi cards get the best speed with SHARED access it seems.
#if USE_SHARED
#define T0up(x) (*((uint32_t*)mixtabs + ( (x))))
#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x))))
#define T1up(x) (*((uint32_t*)mixtabs + (512+(x))))
#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
#define T2up(x) (*((uint32_t*)mixtabs + (1024+(x))))
#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
#define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x))))
#else
#define T0up(x) tex1Dfetch(t0up1, x)
#define T0dn(x) tex1Dfetch(t0dn1, x)
#define T1up(x) tex1Dfetch(t1up1, x)
#define T1dn(x) tex1Dfetch(t1dn1, x)
#define T2up(x) tex1Dfetch(t2up1, x)
#define T2dn(x) tex1Dfetch(t2dn1, x)
#define T3up(x) tex1Dfetch(t3up1, x)
#define T3dn(x) tex1Dfetch(t3dn1, x)
#endif
#else
#define USE_SHARED 1
// a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5!
#define T0up(x) (*((uint32_t*)mixtabs + ( (x))))
#define T0dn(x) tex1Dfetch(t0dn1, x)
#define T1up(x) tex1Dfetch(t1up1, x)
#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
#define T2up(x) tex1Dfetch(t2up1, x)
#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
#define T3dn(x) tex1Dfetch(t3dn1, x)
#endif
texture<unsigned int, 1, cudaReadModeElementType> t0up1;
texture<unsigned int, 1, cudaReadModeElementType> t0dn1;
texture<unsigned int, 1, cudaReadModeElementType> t1up1;
texture<unsigned int, 1, cudaReadModeElementType> t1dn1;
texture<unsigned int, 1, cudaReadModeElementType> t2up1;
texture<unsigned int, 1, cudaReadModeElementType> t2dn1;
texture<unsigned int, 1, cudaReadModeElementType> t3up1;
texture<unsigned int, 1, cudaReadModeElementType> t3dn1;
extern uint32_t T0up_cpu[];
extern uint32_t T0dn_cpu[];
extern uint32_t T1up_cpu[];
extern uint32_t T1dn_cpu[];
extern uint32_t T2up_cpu[];
extern uint32_t T2dn_cpu[];
extern uint32_t T3up_cpu[];
extern uint32_t T3dn_cpu[];
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
__device__ __forceinline__ void groestlcoin_perm_P(uint32_t *a, char *mixtabs)
// 64 Register Variante für Compute 3.0
#include "groestl_functions_quad.cu"
#include "bitslice_transformations_quad.cu"
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
__global__ void __launch_bounds__(256, 4)
groestlcoin_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *resNounce)
{
uint32_t t[32];
//#pragma unroll 14
for(int r=0;r<14;r++)
{
switch(r)
{
case 0:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 0); break;
case 1:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break;
case 2:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break;
case 3:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break;
case 4:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break;
case 5:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break;
case 6:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break;
case 7:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break;
case 8:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break;
case 9:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break;
case 10:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break;
case 11:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break;
case 12:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break;
case 13:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break;
}
// RBTT
#pragma unroll 16
for(int k=0;k<32;k+=2)
{
uint32_t t0_0 = B32_0(a[(k ) & 0x1f]), t9_0 = B32_0(a[(k + 9) & 0x1f]);
uint32_t t2_1 = B32_1(a[(k + 2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]);
uint32_t t4_2 = B32_2(a[(k + 4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]);
uint32_t t6_3 = B32_3(a[(k + 6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]);
t[k + 0] = T0up( t0_0 ) ^ T1up( t2_1 ) ^ T2up( t4_2 ) ^ T3up( t6_3 ) ^
T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 );
// durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
int thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4;
if (thread < threads)
{
// GROESTL
uint32_t paddedInput[8];
#pragma unroll 8
for(int k=0;k<8;k++) paddedInput[k] = groestlcoin_gpu_msg[4*k+threadIdx.x%4];
t[k + 1] = T0dn( t0_0 ) ^ T1dn( t2_1 ) ^ T2dn( t4_2 ) ^ T3dn( t6_3 ) ^
T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 );
}
#pragma unroll 32
for(int k=0;k<32;k++)
a[k] = t[k];
}
}
uint32_t nounce = startNounce + thread;
if ((threadIdx.x % 4) == 3)
paddedInput[4] = SWAB32(nounce); // 4*4+3 = 19
__device__ __forceinline__ void groestlcoin_perm_Q(uint32_t *a, char *mixtabs)
{
//#pragma unroll 14
for(int r=0;r<14;r++)
{
uint32_t t[32];
switch(r)
{
case 0:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 0); a[(k*2)+1] ^= QC32dn(k * 0x10, 0);} break;
case 1:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 1); a[(k*2)+1] ^= QC32dn(k * 0x10, 1);} break;
case 2:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 2); a[(k*2)+1] ^= QC32dn(k * 0x10, 2);} break;
case 3:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 3); a[(k*2)+1] ^= QC32dn(k * 0x10, 3);} break;
case 4:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 4); a[(k*2)+1] ^= QC32dn(k * 0x10, 4);} break;
case 5:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 5); a[(k*2)+1] ^= QC32dn(k * 0x10, 5);} break;
case 6:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 6); a[(k*2)+1] ^= QC32dn(k * 0x10, 6);} break;
case 7:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 7); a[(k*2)+1] ^= QC32dn(k * 0x10, 7);} break;
case 8:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 8); a[(k*2)+1] ^= QC32dn(k * 0x10, 8);} break;
case 9:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 9); a[(k*2)+1] ^= QC32dn(k * 0x10, 9);} break;
case 10:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 10); a[(k*2)+1] ^= QC32dn(k * 0x10, 10);} break;
case 11:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 11); a[(k*2)+1] ^= QC32dn(k * 0x10, 11);} break;
case 12:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 12); a[(k*2)+1] ^= QC32dn(k * 0x10, 12);} break;
case 13:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13);} break;
}
// RBTT
#pragma unroll 16
for(int k=0;k<32;k+=2)
uint32_t msgBitsliced[8];
to_bitslice_quad(paddedInput, msgBitsliced);
uint32_t state[8];
for (int round=0; round<2; round++)
{
uint32_t t2_0 = B32_0(a[(k + 2) & 0x1f]), t1_0 = B32_0(a[(k + 1) & 0x1f]);
uint32_t t6_1 = B32_1(a[(k + 6) & 0x1f]), t5_1 = B32_1(a[(k + 5) & 0x1f]);
uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2 = B32_2(a[(k + 9) & 0x1f]);
uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]);
t[k + 0] = T0up( t2_0 ) ^ T1up( t6_1 ) ^ T2up( t10_2 ) ^ T3up( t22_3 ) ^
T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn( t9_2 ) ^ T3dn( t13_3 );
groestl512_progressMessage_quad(state, msgBitsliced);
if (round < 1)
{
// Verkettung zweier Runden inclusive Padding.
msgBitsliced[ 0] = __byte_perm(state[ 0], 0x00800100, 0x4341 + ((threadIdx.x%4)==3)*0x2000);
msgBitsliced[ 1] = __byte_perm(state[ 1], 0x00800100, 0x4341);
msgBitsliced[ 2] = __byte_perm(state[ 2], 0x00800100, 0x4341);
msgBitsliced[ 3] = __byte_perm(state[ 3], 0x00800100, 0x4341);
msgBitsliced[ 4] = __byte_perm(state[ 4], 0x00800100, 0x4341);
msgBitsliced[ 5] = __byte_perm(state[ 5], 0x00800100, 0x4341);
msgBitsliced[ 6] = __byte_perm(state[ 6], 0x00800100, 0x4341);
msgBitsliced[ 7] = __byte_perm(state[ 7], 0x00800100, 0x4341 + ((threadIdx.x%4)==0)*0x0010);
}
}
t[k + 1] = T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^
T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up( t9_2 ) ^ T3up( t13_3 );
// Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash
uint32_t out_state[16];
from_bitslice_quad(state, out_state);
if (threadIdx.x % 4 == 0)
{
int i, position = -1;
bool rc = true;
#pragma unroll 8
for (i = 7; i >= 0; i--) {
if (out_state[i] > pTarget[i]) {
if(position < i) {
position = i;
rc = false;
}
}
if (out_state[i] < pTarget[i]) {
if(position < i) {
position = i;
rc = true;
}
}
}
if(rc == true)
if(resNounce[0] > nounce)
resNounce[0] = nounce;
}
#pragma unroll 32
for(int k=0;k<32;k++)
a[k] = t[k];
}
}
#if USE_SHARED
__global__ void /* __launch_bounds__(256) */
#else
__global__ void
#endif
groestlcoin_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce)
{
#if USE_SHARED
extern __shared__ char mixtabs[];
if (threadIdx.x < 256)
{
*((uint32_t*)mixtabs + ( threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x);
*((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x);
*((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x);
*((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x);
*((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x);
*((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x);
*((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x);
*((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x);
}
__syncthreads();
#endif
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
// GROESTL
uint32_t message[32];
uint32_t state[32];
#pragma unroll 32
for(int k=0;k<32;k++) message[k] = groestlcoin_gpu_msg[k];
uint32_t nounce = startNounce + thread;
message[19] = SWAB32(nounce);
#pragma unroll 32
for(int u=0;u<32;u++) state[u] = message[u];
state[31] ^= 0x20000;
// Perm
#if USE_SHARED
groestlcoin_perm_P(state, mixtabs);
state[31] ^= 0x20000;
groestlcoin_perm_Q(message, mixtabs);
#else
groestlcoin_perm_P(state, NULL);
state[31] ^= 0x20000;
groestlcoin_perm_Q(message, NULL);
#endif
#pragma unroll 32
for(int u=0;u<32;u++) state[u] ^= message[u];
#pragma unroll 32
for(int u=0;u<32;u++) message[u] = state[u];
#if USE_SHARED
groestlcoin_perm_P(message, mixtabs);
#else
groestlcoin_perm_P(message, NULL);
#endif
#pragma unroll 32
for(int u=0;u<32;u++) state[u] ^= message[u];
////
//// 2. Runde groestl
////
#pragma unroll 16
for(int k=0;k<16;k++) message[k] = state[k + 16];
#pragma unroll 14
for(int k=1;k<15;k++)
message[k+16] = 0;
message[16] = 0x80;
message[31] = 0x01000000;
#pragma unroll 32
for(int u=0;u<32;u++)
state[u] = message[u];
state[31] ^= 0x20000;
// Perm
#if USE_SHARED
groestlcoin_perm_P(state, mixtabs);
state[31] ^= 0x20000;
groestlcoin_perm_Q(message, mixtabs);
#else
groestlcoin_perm_P(state, NULL);
state[31] ^= 0x20000;
groestlcoin_perm_Q(message, NULL);
#endif
#pragma unroll 32
for(int u=0;u<32;u++) state[u] ^= message[u];
#pragma unroll 32
for(int u=0;u<32;u++) message[u] = state[u];
#if USE_SHARED
groestlcoin_perm_P(message, mixtabs);
#else
groestlcoin_perm_P(message, NULL);
#endif
#pragma unroll 32
for(int u=0;u<32;u++) state[u] ^= message[u];
// kopiere Ergebnis
int i, position = -1;
bool rc = true;
#pragma unroll 8
for (i = 7; i >= 0; i--) {
if (state[i+16] > pTarget[i]) {
if(position < i) {
position = i;
rc = false;
}
}
if (state[i+16] < pTarget[i]) {
if(position < i) {
position = i;
rc = true;
}
}
}
if(rc == true)
if(resNounce[0] > nounce)
resNounce[0] = nounce;
}
}
#define texDef(texname, texmem, texsource, texsize) \
unsigned int *texmem; \
cudaMalloc(&texmem, texsize); \
cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
texname.normalized = 0; \
texname.filterMode = cudaFilterModePoint; \
texname.addressMode[0] = cudaAddressModeClamp; \
{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
// Setup-Funktionen
__host__ void groestlcoin_cpu_init(int thr_id, int threads)
{
cudaSetDevice(device_map[thr_id]);
cudaGetDeviceProperties(&props, device_map[thr_id]);
// Texturen mit obigem Makro initialisieren
texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
// Speicher für Gewinner-Nonce belegen
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
cudaSetDevice(device_map[thr_id]);
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
// Speicher für Gewinner-Nonce belegen
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
}
__host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
{
// Nachricht expandieren und setzen
uint32_t msgBlock[32];
// Nachricht expandieren und setzen
uint32_t msgBlock[32];
memset(msgBlock, 0, sizeof(uint32_t) * 32);
memcpy(&msgBlock[0], data, 80);
memset(msgBlock, 0, sizeof(uint32_t) * 32);
memcpy(&msgBlock[0], data, 80);
// Erweitere die Nachricht auf den Nachrichtenblock (padding)
// Unsere Nachricht hat 80 Byte
msgBlock[20] = 0x80;
msgBlock[31] = 0x01000000;
// Erweitere die Nachricht auf den Nachrichtenblock (padding)
// Unsere Nachricht hat 80 Byte
msgBlock[20] = 0x80;
msgBlock[31] = 0x01000000;
// groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird
// auf der GPU ausgeführt)
// groestl512 braucht hierfür keinen CPU-Code (die einzige Runde wird
// auf der GPU ausgeführt)
// Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch)
cudaMemcpyToSymbol( groestlcoin_gpu_msg,
msgBlock,
128);
// Blockheader setzen (korrekte Nonce und Hefty Hash fehlen da drin noch)
cudaMemcpyToSymbol( groestlcoin_gpu_msg,
msgBlock,
128);
cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
cudaMemcpyToSymbol( pTarget,
pTargetIn,
sizeof(uint32_t) * 8 );
cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
cudaMemcpyToSymbol( pTarget,
pTargetIn,
sizeof(uint32_t) * 8 );
}
__host__ void groestlcoin_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
{
// Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern,
// alle anderen mit 512 Threads.
int threadsperblock = (props.major >= 3) ? 768 : 512;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs
#if USE_SHARED
size_t shared_size = 8 * 256 * sizeof(uint32_t);
#else
size_t shared_size = 0;
#endif
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
//fprintf(stderr, "ThrID: %d\n", thr_id);
cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
groestlcoin_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
// Strategisches Sleep Kommando zur Senkung der CPU Last
MyStreamSynchronize(NULL, 0, thr_id);
cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
int threadsperblock = 256;
// Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
// mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
int factor = 4;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
groestlcoin_gpu_hash_quad<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
// Strategisches Sleep Kommando zur Senkung der CPU Last
MyStreamSynchronize(NULL, 0, thr_id);
cudaMemcpy(nounce, d_resultNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
}

665
cuda_myriadgroestl.cu

@ -1,4 +1,4 @@ @@ -1,4 +1,4 @@
// Auf Myriadcoin spezialisierte Version von Groestl
// Auf Myriadcoin spezialisierte Version von Groestl inkl. Bitslice
#include <cuda.h>
#include "cuda_runtime.h"
@ -7,9 +7,6 @@ @@ -7,9 +7,6 @@
#include <stdio.h>
#include <memory.h>
// it's unfortunate that this is a compile time constant.
#define MAXWELL_OR_FERMI 1
// aus cpu-miner.c
extern int device_map[8];
@ -22,30 +19,49 @@ typedef unsigned short uint16_t; @@ -22,30 +19,49 @@ typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props;
static cudaDeviceProp props[8];
// globaler Speicher für alle HeftyHashes aller Threads
__constant__ uint32_t pTarget[8]; // Single GPU
uint32_t *d_outputHashes[8];
extern uint32_t *d_resultNonce[8];
__constant__ uint32_t myriadgroestl_gpu_msg[32];
// muss expandiert werden
__constant__ uint32_t myr_sha256_gpu_constantTable[64];
__constant__ uint32_t myr_sha256_gpu_constantTable2[64];
__constant__ uint32_t myr_sha256_gpu_hashTable[8];
uint32_t myr_sha256_cpu_hashTable[] = {
0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 };
uint32_t myr_sha256_cpu_constantTable[] = {
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
};
uint32_t myr_sha256_cpu_w2Table[] = {
0x80000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000,
0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000200,
0x80000000, 0x01400000, 0x00205000, 0x00005088, 0x22000800, 0x22550014, 0x05089742, 0xa0000020,
0x5a880000, 0x005c9400, 0x0016d49d, 0xfa801f00, 0xd33225d0, 0x11675959, 0xf6e6bfda, 0xb30c1549,
0x08b2b050, 0x9d7c4c27, 0x0ce2a393, 0x88e6e1ea, 0xa52b4335, 0x67a16f49, 0xd732016f, 0x4eeb2e91,
0x5dbf55e5, 0x8eee2335, 0xe2bc5ec2, 0xa83f4394, 0x45ad78f7, 0x36f3d0cd, 0xd99c05e8, 0xb0511dc7,
0x69bc7ac4, 0xbd11375b, 0xe3ba71e5, 0x3b209ff2, 0x18feee17, 0xe25ad9e7, 0x13375046, 0x0515089d,
0x4f0d0f04, 0x2627484e, 0x310128d2, 0xc668b434, 0x420841cc, 0x62d311b8, 0xe59ba771, 0x85a7a484 };
// 64 Register Variante für Compute 3.0
#include "groestl_functions_quad.cu"
#include "bitslice_transformations_quad.cu"
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
#if __CUDA_ARCH__ < 350
// Kepler (Compute 3.0)
#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
@ -53,452 +69,219 @@ uint32_t myr_sha256_cpu_constantTable[] = { @@ -53,452 +69,219 @@ uint32_t myr_sha256_cpu_constantTable[] = {
// Kepler (Compute 3.5)
#define ROTR32(x, n) __funnelshift_r( (x), (x), (n) )
#endif
#define R(x, n) ((x) >> (n))
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22))
#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25))
#define s0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ R(x, 3))
#define s1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ R(x, 10))
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
#define R(x, n) ((x) >> (n))
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22))
#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25))
#define s0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ R(x, 3))
#define s1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ R(x, 10))
__device__ void myriadgroestl_gpu_sha256(uint32_t *message)
{
uint32_t W1[16];
uint32_t W2[16];
uint32_t W1[16];
uint32_t W2[16];
// Initialisiere die register a bis h mit der Hash-Tabelle
uint32_t regs[8];
uint32_t hash[8];
// Initialisiere die register a bis h mit der Hash-Tabelle
uint32_t regs[8];
uint32_t hash[8];
// pre
// pre
#pragma unroll 8
for (int k=0; k < 8; k++)
{
regs[k] = myr_sha256_gpu_hashTable[k];
hash[k] = regs[k];
}
for (int k=0; k < 8; k++)
{
regs[k] = myr_sha256_gpu_hashTable[k];
hash[k] = regs[k];
}
#pragma unroll 16
for(int k=0;k<16;k++)
W1[k] = SWAB32(message[k]);
for(int k=0;k<16;k++)
W1[k] = SWAB32(message[k]);
// Progress W1
#pragma unroll 16
for(int j=0;j<16;j++)
{
uint32_t T1, T2;
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j] + W1[j];
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
#pragma unroll 7
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
regs[0] = T1 + T2;
regs[4] += T1;
}
for(int j=0;j<16;j++)
{
uint32_t T1, T2;
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j] + W1[j];
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
#pragma unroll 7
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
regs[0] = T1 + T2;
regs[4] += T1;
}
// Progress W2...W3
#pragma unroll 3
for(int k=0;k<3;k++)
{
////// PART 1
#pragma unroll 2
for(int j=0;j<2;j++)
W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
for(int j=0;j<2;j++)
W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
#pragma unroll 5
for(int j=2;j<7;j++)
W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];
for(int j=2;j<7;j++)
W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];
#pragma unroll 8
for(int j=7;j<15;j++)
W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];
for(int j=7;j<15;j++)
W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];
W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
// Rundenfunktion
// Rundenfunktion
#pragma unroll 16
for(int j=0;j<16;j++)
{
uint32_t T1, T2;
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j];
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
#pragma unroll 7
for (int l=6; l >= 0; l--) regs[l+1] = regs[l];
regs[0] = T1 + T2;
regs[4] += T1;
}
#pragma unroll 16
for(int j=0;j<16;j++)
W1[j] = W2[j];
}
for(int j=0;j<16;j++)
{
uint32_t T1, T2;
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16] + W2[j];
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
#pragma unroll 7
for (int l=6; l >= 0; l--) regs[l+1] = regs[l];
regs[0] = T1 + T2;
regs[4] += T1;
}
#pragma unroll 8
for(int k=0;k<8;k++)
hash[k] += regs[k];
////// PART 2
#pragma unroll 2
for(int j=0;j<2;j++)
W1[j] = s1(W2[14+j]) + W2[9+j] + s0(W2[1+j]) + W2[j];
#pragma unroll 5
for(int j=2;j<7;j++)
W1[j] = s1(W1[j-2]) + W2[9+j] + s0(W2[1+j]) + W2[j];
/////
///// Zweite Runde (wegen Msg-Padding)
/////
#pragma unroll 8
for(int k=0;k<8;k++)
regs[k] = hash[k];
for(int j=7;j<15;j++)
W1[j] = s1(W1[j-2]) + W1[j-7] + s0(W2[1+j]) + W2[j];
W1[0] = SWAB32(0x80);
#pragma unroll 14
for(int k=1;k<15;k++)
W1[k] = 0;
W1[15] = 512;
W1[15] = s1(W1[13]) + W1[8] + s0(W1[0]) + W2[15];
// Progress W1
// Rundenfunktion
#pragma unroll 16
for(int j=0;j<16;j++)
{
uint32_t T1, T2;
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j] + W1[j];
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
#pragma unroll 7
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
regs[0] = T1 + T2;
regs[4] += T1;
}
for(int j=0;j<16;j++)
{
uint32_t T1, T2;
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 32] + W1[j];
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
#pragma unroll 7
for (int l=6; l >= 0; l--) regs[l+1] = regs[l];
regs[0] = T1 + T2;
regs[4] += T1;
}
// Progress W2...W3
#pragma unroll 3
for(int k=0;k<3;k++)
{
////// PART 3
#pragma unroll 2
for(int j=0;j<2;j++)
W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
for(int j=0;j<2;j++)
W2[j] = s1(W1[14+j]) + W1[9+j] + s0(W1[1+j]) + W1[j];
#pragma unroll 5
for(int j=2;j<7;j++)
W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];
for(int j=2;j<7;j++)
W2[j] = s1(W2[j-2]) + W1[9+j] + s0(W1[1+j]) + W1[j];
#pragma unroll 8
for(int j=7;j<15;j++)
W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];
for(int j=7;j<15;j++)
W2[j] = s1(W2[j-2]) + W2[j-7] + s0(W1[1+j]) + W1[j];
W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15];
// Rundenfunktion
// Rundenfunktion
#pragma unroll 16
for(int j=0;j<16;j++)
{
uint32_t T1, T2;
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 16 * (k+1)] + W2[j];
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
#pragma unroll 7
for (int l=6; l >= 0; l--) regs[l+1] = regs[l];
regs[0] = T1 + T2;
regs[4] += T1;
}
for(int j=0;j<16;j++)
{
uint32_t T1, T2;
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable[j + 48] + W2[j];
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
#pragma unroll 7
for (int l=6; l >= 0; l--) regs[l+1] = regs[l];
regs[0] = T1 + T2;
regs[4] += T1;
}
#pragma unroll 16
for(int j=0;j<16;j++)
W1[j] = W2[j];
}
#pragma unroll 8
for(int k=0;k<8;k++)
hash[k] += regs[k];
/////
///// Zweite Runde (wegen Msg-Padding)
/////
#pragma unroll 8
for(int k=0;k<8;k++)
hash[k] += regs[k];
for(int k=0;k<8;k++)
regs[k] = hash[k];
//// FERTIG
// Progress W1
#pragma unroll 64
for(int j=0;j<64;j++)
{
uint32_t T1, T2;
T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + myr_sha256_gpu_constantTable2[j];
T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]);
#pragma unroll 7
for (int k=6; k >= 0; k--) regs[k+1] = regs[k];
regs[0] = T1 + T2;
regs[4] += T1;
}
#pragma unroll 8
for(int k=0;k<8;k++)
message[k] = SWAB32(hash[k]);
}
for(int k=0;k<8;k++)
hash[k] += regs[k];
#define SPH_C32(x) ((uint32_t)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
#define PC32up(j, r) ((uint32_t)((j) + (r)))
#define PC32dn(j, r) 0
#define QC32up(j, r) 0xFFFFFFFF
#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))
#define B32_0(x) __byte_perm(x, 0, 0x4440)
//((x) & 0xFF)
#define B32_1(x) __byte_perm(x, 0, 0x4441)
//(((x) >> 8) & 0xFF)
#define B32_2(x) __byte_perm(x, 0, 0x4442)
//(((x) >> 16) & 0xFF)
#define B32_3(x) __byte_perm(x, 0, 0x4443)
//((x) >> 24)
#if MAXWELL_OR_FERMI
#define USE_SHARED 1
// Maxwell and Fermi cards get the best speed with SHARED access it seems.
#if USE_SHARED
#define T0up(x) (*((uint32_t*)mixtabs + ( (x))))
#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x))))
#define T1up(x) (*((uint32_t*)mixtabs + (512+(x))))
#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
#define T2up(x) (*((uint32_t*)mixtabs + (1024+(x))))
#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
#define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x))))
#else
#define T0up(x) tex1Dfetch(t0up1, x)
#define T0dn(x) tex1Dfetch(t0dn1, x)
#define T1up(x) tex1Dfetch(t1up1, x)
#define T1dn(x) tex1Dfetch(t1dn1, x)
#define T2up(x) tex1Dfetch(t2up1, x)
#define T2dn(x) tex1Dfetch(t2dn1, x)
#define T3up(x) tex1Dfetch(t3up1, x)
#define T3dn(x) tex1Dfetch(t3dn1, x)
#endif
#else
#define USE_SHARED 1
// a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5!
#define T0up(x) (*((uint32_t*)mixtabs + ( (x))))
#define T0dn(x) tex1Dfetch(t0dn1, x)
#define T1up(x) tex1Dfetch(t1up1, x)
#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
#define T2up(x) tex1Dfetch(t2up1, x)
#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
#define T3dn(x) tex1Dfetch(t3dn1, x)
#endif
//// FERTIG
texture<unsigned int, 1, cudaReadModeElementType> t0up1;
texture<unsigned int, 1, cudaReadModeElementType> t0dn1;
texture<unsigned int, 1, cudaReadModeElementType> t1up1;
texture<unsigned int, 1, cudaReadModeElementType> t1dn1;
texture<unsigned int, 1, cudaReadModeElementType> t2up1;
texture<unsigned int, 1, cudaReadModeElementType> t2dn1;
texture<unsigned int, 1, cudaReadModeElementType> t3up1;
texture<unsigned int, 1, cudaReadModeElementType> t3dn1;
#pragma unroll 8
for(int k=0;k<8;k++)
message[k] = SWAB32(hash[k]);
}
extern uint32_t T0up_cpu[];
extern uint32_t T0dn_cpu[];
extern uint32_t T1up_cpu[];
extern uint32_t T1dn_cpu[];
extern uint32_t T2up_cpu[];
extern uint32_t T2dn_cpu[];
extern uint32_t T3up_cpu[];
extern uint32_t T3dn_cpu[];
__global__ void __launch_bounds__(256, 4)
myriadgroestl_gpu_hash_quad(int threads, uint32_t startNounce, uint32_t *hashBuffer)
{
// durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
int thread = (blockDim.x * blockIdx.x + threadIdx.x) / 4;
if (thread < threads)
{
// GROESTL
uint32_t paddedInput[8];
#pragma unroll 8
for(int k=0;k<8;k++) paddedInput[k] = myriadgroestl_gpu_msg[4*k+threadIdx.x%4];
#define SWAB32(x) ( ((x & 0x000000FF) << 24) | ((x & 0x0000FF00) << 8) | ((x & 0x00FF0000) >> 8) | ((x & 0xFF000000) >> 24) )
uint32_t nounce = startNounce + thread;
if ((threadIdx.x % 4) == 3)
paddedInput[4] = SWAB32(nounce); // 4*4+3 = 19
uint32_t msgBitsliced[8];
to_bitslice_quad(paddedInput, msgBitsliced);
__device__ __forceinline__ void myriadgroestl_perm_P(uint32_t *a, char *mixtabs)
{
uint32_t t[32];
//#pragma unroll 14
for(int r=0;r<14;r++)
{
switch(r)
{
case 0:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 0); break;
case 1:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 1); break;
case 2:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 2); break;
case 3:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 3); break;
case 4:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 4); break;
case 5:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 5); break;
case 6:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 6); break;
case 7:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 7); break;
case 8:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 8); break;
case 9:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 9); break;
case 10:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 10); break;
case 11:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 11); break;
case 12:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 12); break;
case 13:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k * 0x10, 13); break;
}
uint32_t state[8];
// RBTT
#pragma unroll 16
for(int k=0;k<32;k+=2)
{
uint32_t t0_0 = B32_0(a[(k ) & 0x1f]), t9_0 = B32_0(a[(k + 9) & 0x1f]);
uint32_t t2_1 = B32_1(a[(k + 2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]);
uint32_t t4_2 = B32_2(a[(k + 4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]);
uint32_t t6_3 = B32_3(a[(k + 6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]);
t[k + 0] = T0up( t0_0 ) ^ T1up( t2_1 ) ^ T2up( t4_2 ) ^ T3up( t6_3 ) ^
T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 );
groestl512_progressMessage_quad(state, msgBitsliced);
t[k + 1] = T0dn( t0_0 ) ^ T1dn( t2_1 ) ^ T2dn( t4_2 ) ^ T3dn( t6_3 ) ^
T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 );
}
#pragma unroll 32
for(int k=0;k<32;k++)
a[k] = t[k];
}
}
uint32_t out_state[16];
from_bitslice_quad(state, out_state);
__device__ __forceinline__ void myriadgroestl_perm_Q(uint32_t *a, char *mixtabs)
{
//#pragma unroll 14
for(int r=0;r<14;r++)
{
uint32_t t[32];
switch(r)
{
case 0:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 0); a[(k*2)+1] ^= QC32dn(k * 0x10, 0);} break;
case 1:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 1); a[(k*2)+1] ^= QC32dn(k * 0x10, 1);} break;
case 2:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 2); a[(k*2)+1] ^= QC32dn(k * 0x10, 2);} break;
case 3:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 3); a[(k*2)+1] ^= QC32dn(k * 0x10, 3);} break;
case 4:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 4); a[(k*2)+1] ^= QC32dn(k * 0x10, 4);} break;
case 5:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 5); a[(k*2)+1] ^= QC32dn(k * 0x10, 5);} break;
case 6:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 6); a[(k*2)+1] ^= QC32dn(k * 0x10, 6);} break;
case 7:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 7); a[(k*2)+1] ^= QC32dn(k * 0x10, 7);} break;
case 8:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 8); a[(k*2)+1] ^= QC32dn(k * 0x10, 8);} break;
case 9:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 9); a[(k*2)+1] ^= QC32dn(k * 0x10, 9);} break;
case 10:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 10); a[(k*2)+1] ^= QC32dn(k * 0x10, 10);} break;
case 11:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 11); a[(k*2)+1] ^= QC32dn(k * 0x10, 11);} break;
case 12:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 12); a[(k*2)+1] ^= QC32dn(k * 0x10, 12);} break;
case 13:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k * 0x10, 13); a[(k*2)+1] ^= QC32dn(k * 0x10, 13);} break;
}
// RBTT
#pragma unroll 16
for(int k=0;k<32;k+=2)
if ((threadIdx.x & 0x03) == 0)
{
uint32_t t2_0 = B32_0(a[(k + 2) & 0x1f]), t1_0 = B32_0(a[(k + 1) & 0x1f]);
uint32_t t6_1 = B32_1(a[(k + 6) & 0x1f]), t5_1 = B32_1(a[(k + 5) & 0x1f]);
uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2 = B32_2(a[(k + 9) & 0x1f]);
uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]);
t[k + 0] = T0up( t2_0 ) ^ T1up( t6_1 ) ^ T2up( t10_2 ) ^ T3up( t22_3 ) ^
T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn( t9_2 ) ^ T3dn( t13_3 );
t[k + 1] = T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^
T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up( t9_2 ) ^ T3up( t13_3 );
uint32_t *outpHash = &hashBuffer[16 * thread];
#pragma unroll 16
for(int k=0;k<16;k++) outpHash[k] = out_state[k];
}
#pragma unroll 32
for(int k=0;k<32;k++)
a[k] = t[k];
}
}
__global__ void
myriadgroestl_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce)
__global__ void
myriadgroestl_gpu_hash_quad2(int threads, uint32_t startNounce, uint32_t *resNounce, uint32_t *hashBuffer)
{
#if USE_SHARED
extern __shared__ char mixtabs[];
if (threadIdx.x < 256)
{
*((uint32_t*)mixtabs + ( threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x);
*((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x);
*((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x);
*((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x);
*((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x);
*((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x);
*((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x);
*((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x);
}
__syncthreads();
#endif
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
// GROESTL
uint32_t message[32];
uint32_t state[32];
#pragma unroll 32
for(int k=0;k<32;k++) message[k] = myriadgroestl_gpu_msg[k];
uint32_t nounce = startNounce + thread;
message[19] = SWAB32(nounce);
#pragma unroll 32
for(int u=0;u<32;u++) state[u] = message[u];
state[31] ^= 0x20000;
// Perm
#if USE_SHARED
myriadgroestl_perm_P(state, mixtabs);
state[31] ^= 0x20000;
myriadgroestl_perm_Q(message, mixtabs);
#else
myriadgroestl_perm_P(state, NULL);
state[31] ^= 0x20000;
myriadgroestl_perm_Q(message, NULL);
#endif
#pragma unroll 32
for(int u=0;u<32;u++) state[u] ^= message[u];
#pragma unroll 32
for(int u=0;u<32;u++) message[u] = state[u];
#if USE_SHARED
myriadgroestl_perm_P(message, mixtabs);
#else
myriadgroestl_perm_P(message, NULL);
#endif
#pragma unroll 32
for(int u=0;u<32;u++) state[u] ^= message[u];
uint32_t nounce = startNounce + thread;
uint32_t out_state[16];
uint32_t *inpHash = &hashBuffer[16 * thread];
#pragma unroll 16
for(int u=0;u<16;u++) out_state[u] = state[u+16];
for (int i=0; i < 16; i++)
out_state[i] = inpHash[i];
myriadgroestl_gpu_sha256(out_state);
int i, position = -1;
@ -526,43 +309,35 @@ myriadgroestl_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce) @@ -526,43 +309,35 @@ myriadgroestl_gpu_hash(int threads, uint32_t startNounce, uint32_t *resNounce)
}
}
#define texDef(texname, texmem, texsource, texsize) \
unsigned int *texmem; \
cudaMalloc(&texmem, texsize); \
cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
texname.normalized = 0; \
texname.filterMode = cudaFilterModePoint; \
texname.addressMode[0] = cudaAddressModeClamp; \
{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
// Setup-Funktionen
__host__ void myriadgroestl_cpu_init(int thr_id, int threads)
{
cudaSetDevice(device_map[thr_id]);
cudaMemcpyToSymbol( myr_sha256_gpu_hashTable,
myr_sha256_cpu_hashTable,
sizeof(uint32_t) * 8 );
cudaMemcpyToSymbol( myr_sha256_gpu_constantTable,
myr_sha256_cpu_constantTable,
sizeof(uint32_t) * 64 );
cudaGetDeviceProperties(&props, device_map[thr_id]);
// Texturen mit obigem Makro initialisieren
texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
cudaSetDevice(device_map[thr_id]);
cudaMemcpyToSymbol( myr_sha256_gpu_hashTable,
myr_sha256_cpu_hashTable,
sizeof(uint32_t) * 8 );
cudaMemcpyToSymbol( myr_sha256_gpu_constantTable,
myr_sha256_cpu_constantTable,
sizeof(uint32_t) * 64 );
// zweite CPU-Tabelle bauen und auf die GPU laden
uint32_t temp[64];
for(int i=0;i<64;i++)
temp[i] = myr_sha256_cpu_w2Table[i] + myr_sha256_cpu_constantTable[i];
cudaMemcpyToSymbol( myr_sha256_gpu_constantTable2,
temp,
sizeof(uint32_t) * 64 );
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
// Speicher für Gewinner-Nonce belegen
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
// Speicher für temporäreHashes
cudaMalloc(&d_outputHashes[thr_id], 16*sizeof(uint32_t)*threads);
}
__host__ void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
@ -594,25 +369,23 @@ __host__ void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn @@ -594,25 +369,23 @@ __host__ void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn
__host__ void myriadgroestl_cpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
{
// Compute 3.x und 5.x Geräte am besten mit 768 Threads ansteuern,
// alle anderen mit 512 Threads.
int threadsperblock = (props.major >= 3) ? 768 : 512;
int threadsperblock = 256;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
// Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
// mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
const int factor=4;
// Größe des dynamischen Shared Memory Bereichs
#if USE_SHARED
size_t shared_size = 8 * 256 * sizeof(uint32_t);
#else
size_t shared_size = 0;
#endif
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
//fprintf(stderr, "ThrID: %d\n", thr_id);
cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
myriadgroestl_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id]);
// berechne wie viele Thread Blocks wir brauchen
dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
dim3 block(threadsperblock);
myriadgroestl_gpu_hash_quad<<<grid, block, shared_size>>>(threads, startNounce, d_outputHashes[thr_id]);
dim3 grid2((threads + threadsperblock-1)/threadsperblock);
myriadgroestl_gpu_hash_quad2<<<grid2, block, shared_size>>>(threads, startNounce, d_resultNonce[thr_id], d_outputHashes[thr_id]);
// Strategisches Sleep Kommando zur Senkung der CPU Last
MyStreamSynchronize(NULL, 0, thr_id);

1
cuda_nist5.cu

@ -85,7 +85,6 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata, @@ -85,7 +85,6 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
{
const uint32_t first_nonce = pdata[19];
// TODO: entfernen für eine Release! Ist nur zum Testen!
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;

315
groestl_functions_quad.cu

@ -0,0 +1,315 @@ @@ -0,0 +1,315 @@
__device__ __forceinline__ void G256_Mul2(uint32_t *regs)
{
uint32_t tmp = regs[7];
regs[7] = regs[6];
regs[6] = regs[5];
regs[5] = regs[4];
regs[4] = regs[3] ^ tmp;
regs[3] = regs[2] ^ tmp;
regs[2] = regs[1];
regs[1] = regs[0] ^ tmp;
regs[0] = tmp;
}
__device__ __forceinline__ void G256_AddRoundConstantQ_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, int round)
{
x0 = ~x0;
x1 = ~x1;
x2 = ~x2;
x3 = ~x3;
x4 = ~x4;
x5 = ~x5;
x6 = ~x6;
x7 = ~x7;
if ((threadIdx.x & 0x03) == 3) {
x0 ^= ((- (round & 0x01) ) & 0xFFFF0000);
x1 ^= ((-((round & 0x02)>>1)) & 0xFFFF0000);
x2 ^= ((-((round & 0x04)>>2)) & 0xFFFF0000);
x3 ^= ((-((round & 0x08)>>3)) & 0xFFFF0000);
x4 ^= 0xAAAA0000;
x5 ^= 0xCCCC0000;
x6 ^= 0xF0F00000;
x7 ^= 0xFF000000;
}
}
__device__ __forceinline__ void G256_AddRoundConstantP_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0, int round)
{
if ((threadIdx.x & 0x03) == 0)
{
x4 ^= 0xAAAA;
x5 ^= 0xCCCC;
x6 ^= 0xF0F0;
x7 ^= 0xFF00;
x0 ^= ((- (round & 0x01) ) & 0xFFFF);
x1 ^= ((-((round & 0x02)>>1)) & 0xFFFF);
x2 ^= ((-((round & 0x04)>>2)) & 0xFFFF);
x3 ^= ((-((round & 0x08)>>3)) & 0xFFFF);
}
}
__device__ __forceinline__ void G16mul_quad(uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0,
uint32_t &y3, uint32_t &y2, uint32_t &y1, uint32_t &y0)
{
uint32_t t0,t1,t2;
t0 = ((x2 ^ x0) ^ (x3 ^ x1)) & ((y2 ^ y0) ^ (y3 ^ y1));
t1 = ((x2 ^ x0) & (y2 ^ y0)) ^ t0;
t2 = ((x3 ^ x1) & (y3 ^ y1)) ^ t0 ^ t1;
t0 = (x2^x3) & (y2^y3);
x3 = (x3 & y3) ^ t0 ^ t1;
x2 = (x2 & y2) ^ t0 ^ t2;
t0 = (x0^x1) & (y0^y1);
x1 = (x1 & y1) ^ t0 ^ t1;
x0 = (x0 & y0) ^ t0 ^ t2;
}
__device__ __forceinline__ void G256_inv_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0)
{
uint32_t t0,t1,t2,t3,t4,t5,t6,a,b;
t3 = x7;
t2 = x6;
t1 = x5;
t0 = x4;
G16mul_quad(t3, t2, t1, t0, x3, x2, x1, x0);
a = (x4 ^ x0);
t0 ^= a;
t2 ^= (x7 ^ x3) ^ (x5 ^ x1);
t1 ^= (x5 ^ x1) ^ a;
t3 ^= (x6 ^ x2) ^ a;
b = t0 ^ t1;
t4 = (t2 ^ t3) & b;
a = t4 ^ t3 ^ t1;
t5 = (t3 & t1) ^ a;
t6 = (t2 & t0) ^ a ^ (t2 ^ t0);
t4 = (t5 ^ t6) & b;
t1 = (t6 & t1) ^ t4;
t0 = (t5 & t0) ^ t4;
t4 = (t5 ^ t6) & (t2^t3);
t3 = (t6 & t3) ^ t4;
t2 = (t5 & t2) ^ t4;
G16mul_quad(x3, x2, x1, x0, t1, t0, t3, t2);
G16mul_quad(x7, x6, x5, x4, t1, t0, t3, t2);
}
__device__ __forceinline__ void transAtoX_quad(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &x4, uint32_t &x5, uint32_t &x6, uint32_t &x7)
{
uint32_t t0, t1;
t0 = x0 ^ x1 ^ x2;
t1 = x5 ^ x6;
x2 = t0 ^ t1 ^ x7;
x6 = t0 ^ x3 ^ x6;
x3 = x0 ^ x1 ^ x3 ^ x4 ^ x7;
x4 = x0 ^ x4 ^ t1;
x2 = t0 ^ t1 ^ x7;
x1 = x0 ^ x1 ^ t1;
x7 = x0 ^ t1 ^ x7;
x5 = x0 ^ t1;
}
__device__ __forceinline__ void transXtoA_quad(uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3, uint32_t &x4, uint32_t &x5, uint32_t &x6, uint32_t &x7)
{
uint32_t t0,t2,t3,t5;
x1 ^= x4;
t0 = x1 ^ x6;
x1 ^= x5;
t2 = x0 ^ x2;
x2 = x3 ^ x5;
t2 ^= x2 ^ x6;
x2 ^= x7;
t3 = x4 ^ x2 ^ x6;
t5 = x0 ^ x6;
x4 = x3 ^ x7;
x0 = x3 ^ x5;
x6 = t0;
x3 = t2;
x7 = t3;
x5 = t5;
}
__device__ __forceinline__ void sbox_quad(uint32_t *r)
{
transAtoX_quad(r[0], r[1], r[2], r[3], r[4], r[5], r[6], r[7]);
G256_inv_quad(r[2], r[4], r[1], r[7], r[3], r[0], r[5], r[6]);
transXtoA_quad(r[7], r[1], r[4], r[2], r[6], r[5], r[0], r[3]);
r[0] = ~r[0];
r[1] = ~r[1];
r[5] = ~r[5];
r[6] = ~r[6];
}
__device__ __forceinline__ void G256_ShiftBytesP_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0)
{
uint32_t t0,t1;
int tpos = threadIdx.x & 0x03;
int shift1 = tpos << 1;
int shift2 = shift1+1 + ((tpos == 3)<<2);
t0 = __byte_perm(x0, 0, 0x1010)>>shift1;
t1 = __byte_perm(x0, 0, 0x3232)>>shift2;
x0 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x1, 0, 0x1010)>>shift1;
t1 = __byte_perm(x1, 0, 0x3232)>>shift2;
x1 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x2, 0, 0x1010)>>shift1;
t1 = __byte_perm(x2, 0, 0x3232)>>shift2;
x2 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x3, 0, 0x1010)>>shift1;
t1 = __byte_perm(x3, 0, 0x3232)>>shift2;
x3 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x4, 0, 0x1010)>>shift1;
t1 = __byte_perm(x4, 0, 0x3232)>>shift2;
x4 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x5, 0, 0x1010)>>shift1;
t1 = __byte_perm(x5, 0, 0x3232)>>shift2;
x5 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x6, 0, 0x1010)>>shift1;
t1 = __byte_perm(x6, 0, 0x3232)>>shift2;
x6 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x7, 0, 0x1010)>>shift1;
t1 = __byte_perm(x7, 0, 0x3232)>>shift2;
x7 = __byte_perm(t0, t1, 0x5410);
}
__device__ __forceinline__ void G256_ShiftBytesQ_quad(uint32_t &x7, uint32_t &x6, uint32_t &x5, uint32_t &x4, uint32_t &x3, uint32_t &x2, uint32_t &x1, uint32_t &x0)
{
uint32_t t0,t1;
int tpos = threadIdx.x & 0x03;
int shift1 = (1-(tpos>>1)) + ((tpos & 0x01)<<2);
int shift2 = shift1+2 + ((tpos == 1)<<2);
t0 = __byte_perm(x0, 0, 0x1010)>>shift1;
t1 = __byte_perm(x0, 0, 0x3232)>>shift2;
x0 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x1, 0, 0x1010)>>shift1;
t1 = __byte_perm(x1, 0, 0x3232)>>shift2;
x1 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x2, 0, 0x1010)>>shift1;
t1 = __byte_perm(x2, 0, 0x3232)>>shift2;
x2 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x3, 0, 0x1010)>>shift1;
t1 = __byte_perm(x3, 0, 0x3232)>>shift2;
x3 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x4, 0, 0x1010)>>shift1;
t1 = __byte_perm(x4, 0, 0x3232)>>shift2;
x4 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x5, 0, 0x1010)>>shift1;
t1 = __byte_perm(x5, 0, 0x3232)>>shift2;
x5 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x6, 0, 0x1010)>>shift1;
t1 = __byte_perm(x6, 0, 0x3232)>>shift2;
x6 = __byte_perm(t0, t1, 0x5410);
t0 = __byte_perm(x7, 0, 0x1010)>>shift1;
t1 = __byte_perm(x7, 0, 0x3232)>>shift2;
x7 = __byte_perm(t0, t1, 0x5410);
}
__device__ __forceinline__ void G256_MixFunction_quad(uint32_t *r)
{
#define SHIFT64_16(hi, lo) __byte_perm(lo, hi, 0x5432)
#define A(v, u) __shfl((int)r[v], ((threadIdx.x+u)&0x03), 4)
#define S(idx, l) SHIFT64_16( A(idx, (l+1)), A(idx, l) )
#define DOUBLE_ODD(i, bc) ( S(i, (bc)) ^ A(i, (bc) + 1) )
#define DOUBLE_EVEN(i, bc) ( S(i, (bc)) ^ A(i, (bc) ) )
#define SINGLE_ODD(i, bc) ( S(i, (bc)) )
#define SINGLE_EVEN(i, bc) ( A(i, (bc)) )
uint32_t b[8];
#pragma unroll 8
for(int i=0;i<8;i++)
b[i] = DOUBLE_ODD(i, 1) ^ DOUBLE_EVEN(i, 3);
G256_Mul2(b);
#pragma unroll 8
for(int i=0;i<8;i++)
b[i] = b[i] ^ DOUBLE_ODD(i, 3) ^ DOUBLE_ODD(i, 4) ^ SINGLE_ODD(i, 6);
G256_Mul2(b);
#pragma unroll 8
for(int i=0;i<8;i++)
r[i] = b[i] ^ DOUBLE_EVEN(i, 2) ^ DOUBLE_EVEN(i, 3) ^ SINGLE_EVEN(i, 5);
#undef S
#undef A
#undef SHIFT64_16
#undef t
#undef X
}
__device__ __forceinline__ void groestl512_perm_P_quad(uint32_t *r)
{
for(int round=0;round<14;round++)
{
G256_AddRoundConstantP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], round);
sbox_quad(r);
G256_ShiftBytesP_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]);
G256_MixFunction_quad(r);
}
}
__device__ __forceinline__ void groestl512_perm_Q_quad(uint32_t *r)
{
for(int round=0;round<14;round++)
{
G256_AddRoundConstantQ_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0], round);
sbox_quad(r);
G256_ShiftBytesQ_quad(r[7], r[6], r[5], r[4], r[3], r[2], r[1], r[0]);
G256_MixFunction_quad(r);
}
}
__device__ __forceinline__ void groestl512_progressMessage_quad(uint32_t *state, uint32_t *message)
{
#pragma unroll 8
for(int u=0;u<8;u++) state[u] = message[u];
if ((threadIdx.x & 0x03) == 3) state[ 1] ^= 0x00008000;
groestl512_perm_P_quad(state);
if ((threadIdx.x & 0x03) == 3) state[ 1] ^= 0x00008000;
groestl512_perm_Q_quad(message);
#pragma unroll 8
for(int u=0;u<8;u++) state[u] ^= message[u];
#pragma unroll 8
for(int u=0;u<8;u++) message[u] = state[u];
groestl512_perm_P_quad(message);
#pragma unroll 8
for(int u=0;u<8;u++) state[u] ^= message[u];
}

227
groestlcoin.cpp

@ -15,163 +15,118 @@ @@ -15,163 +15,118 @@
void sha256func(unsigned char *hash, const unsigned char *data, int len)
{
uint32_t S[16], T[16];
int i, r;
sha256_init(S);
for (r = len; r > -9; r -= 64) {
if (r < 64)
memset(T, 0, 64);
memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r));
if (r >= 0 && r < 64)
((unsigned char *)T)[r] = 0x80;
for (i = 0; i < 16; i++)
T[i] = be32dec(T + i);
if (r < 56)
T[15] = 8 * len;
sha256_transform(S, T, 0);
}
/*
memcpy(S + 8, sha256d_hash1 + 8, 32);
sha256_init(T);
sha256_transform(T, S, 0);
*/
for (i = 0; i < 8; i++)
be32enc((uint32_t *)hash + i, T[i]);
uint32_t S[16], T[16];
int i, r;
sha256_init(S);
for (r = len; r > -9; r -= 64) {
if (r < 64)
memset(T, 0, 64);
memcpy(T, data + len - r, r > 64 ? 64 : (r < 0 ? 0 : r));
if (r >= 0 && r < 64)
((unsigned char *)T)[r] = 0x80;
for (i = 0; i < 16; i++)
T[i] = be32dec(T + i);
if (r < 56)
T[15] = 8 * len;
sha256_transform(S, T, 0);
}
/*
memcpy(S + 8, sha256d_hash1 + 8, 32);
sha256_init(T);
sha256_transform(T, S, 0);
*/
for (i = 0; i < 8; i++)
be32enc((uint32_t *)hash + i, T[i]);
}
static void groestlhash(void *state, const void *input)
{
// Tryout GPU-groestl
// Tryout GPU-groestl
sph_groestl512_context ctx_groestl[2];
static unsigned char pblank[1];
int ii;
uint32_t mask = 8;
uint32_t zero = 0;
//these uint512 in the c++ source of the client are backed by an array of uint32
uint32_t hashA[16], hashB[16];
//these uint512 in the c++ source of the client are backed by an array of uint32
uint32_t hashA[16], hashB[16];
sph_groestl512_init(&ctx_groestl[0]);
sph_groestl512 (&ctx_groestl[0], input, 80); //6
sph_groestl512_close(&ctx_groestl[0], hashA); //7
sph_groestl512_close(&ctx_groestl[0], hashA); //7
sph_groestl512_init(&ctx_groestl[1]);
sph_groestl512 (&ctx_groestl[1], hashA, 64); //6
sph_groestl512_init(&ctx_groestl[1]);
sph_groestl512 (&ctx_groestl[1], hashA, 64); //6
sph_groestl512_close(&ctx_groestl[1], hashB); //7
memcpy(state, hashB, 32);
memcpy(state, hashB, 32);
}
extern bool opt_benchmark;
extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t start_nonce = pdata[19]++;
const uint32_t Htarg = ptarget[7];
const uint32_t throughPut = 4096 * 128;
//const uint32_t throughPut = 1;
int i;
uint32_t *outputHash = (uint32_t*)malloc(throughPut * 16 * sizeof(uint32_t));
// init
static bool init[8] = { false, false, false, false, false, false, false, false };
if(!init[thr_id])
{
groestlcoin_cpu_init(thr_id, throughPut);
init[thr_id] = true;
}
// Endian Drehung ist notwendig
//char testdata[] = {"\x70\x00\x00\x00\x5d\x38\x5b\xa1\x14\xd0\x79\x97\x0b\x29\xa9\x41\x8f\xd0\x54\x9e\x7d\x68\xa9\x5c\x7f\x16\x86\x21\xa3\x14\x20\x10\x00\x00\x00\x00\x57\x85\x86\xd1\x49\xfd\x07\xb2\x2f\x3a\x8a\x34\x7c\x51\x6d\xe7\x05\x2f\x03\x4d\x2b\x76\xff\x68\xe0\xd6\xec\xff\x9b\x77\xa4\x54\x89\xe3\xfd\x51\x17\x32\x01\x1d\xf0\x73\x10\x00"};
//pdata = (uint32_t*)testdata;
uint32_t endiandata[32];
for (int kk=0; kk < 32; kk++)
be32enc(&endiandata[kk], pdata[kk]);
// Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt)
groestlcoin_cpu_setBlock(thr_id, endiandata, (void*)ptarget);
do {
// GPU
uint32_t foundNounce = 0xFFFFFFFF;
groestlcoin_cpu_hash(thr_id, throughPut, pdata[19], outputHash, &foundNounce);
/*
{
for(i=0;i<throughPut;i++)
{
uint32_t tmpHash[8];
endiandata[19] = SWAP32(pdata[19]);
groestlhash(tmpHash, endiandata);
int ii;
printf("result GPU: ");
for (ii=0; ii < 32; ii++)
{
printf ("%.2x",((uint8_t*)&outputHash[8*i])[ii]);
};
printf ("\n");
groestlhash(tmpHash, endiandata);
printf("result CPU: ");
for (ii=0; ii < 32; ii++)
{
printf ("%.2x",((uint8_t*)tmpHash)[ii]);
};
}
exit(0);
}
*/
if(foundNounce < 0xffffffff)
{
uint32_t tmpHash[8];
endiandata[19] = SWAP32(foundNounce);
groestlhash(tmpHash, endiandata);
if (tmpHash[7] <= Htarg &&
fulltest(tmpHash, ptarget)) {
pdata[19] = foundNounce;
*hashes_done = foundNounce - start_nonce;
free(outputHash);
return true;
} else {
applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNounce);
}
foundNounce = 0xffffffff;
/*
int ii;
printf("result GPU: ");
for (ii=0; ii < 32; ii++)
{
printf ("%.2x",((uint8_t*)&outputHash[0])[ii]);
};
printf ("\n");
printf("result CPU: ");
for (ii=0; ii < 32; ii++)
{
printf ("%.2x",((uint8_t*)tmpHash)[ii]);
};
printf ("\n");
*/
}
if (pdata[19] + throughPut < pdata[19])
pdata[19] = max_nonce;
else pdata[19] += throughPut;
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
*hashes_done = pdata[19] - start_nonce;
free(outputHash);
return 0;
uint32_t max_nonce, unsigned long *hashes_done)
{
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x000000ff;
uint32_t start_nonce = pdata[19]++;
const uint32_t Htarg = ptarget[7];
const uint32_t throughPut = 4096 * 128;
//const uint32_t throughPut = 1;
uint32_t *outputHash = (uint32_t*)malloc(throughPut * 16 * sizeof(uint32_t));
// init
static bool init[8] = { false, false, false, false, false, false, false, false };
if(!init[thr_id])
{
groestlcoin_cpu_init(thr_id, throughPut);
init[thr_id] = true;
}
// Endian Drehung ist notwendig
uint32_t endiandata[32];
for (int kk=0; kk < 32; kk++)
be32enc(&endiandata[kk], pdata[kk]);
// Context mit dem Endian gedrehten Blockheader vorbereiten (Nonce wird später ersetzt)
groestlcoin_cpu_setBlock(thr_id, endiandata, (void*)ptarget);
do {
// GPU
uint32_t foundNounce = 0xFFFFFFFF;
groestlcoin_cpu_hash(thr_id, throughPut, pdata[19], outputHash, &foundNounce);
if(foundNounce < 0xffffffff)
{
uint32_t tmpHash[8];
endiandata[19] = SWAP32(foundNounce);
groestlhash(tmpHash, endiandata);
if (tmpHash[7] <= Htarg &&
fulltest(tmpHash, ptarget)) {
pdata[19] = foundNounce;
*hashes_done = foundNounce - start_nonce;
free(outputHash);
return true;
} else {
applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNounce);
}
foundNounce = 0xffffffff;
}
if (pdata[19] + throughPut < pdata[19])
pdata[19] = max_nonce;
else pdata[19] += throughPut;
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
*hashes_done = pdata[19] - start_nonce;
free(outputHash);
return 0;
}

2
heavy/cuda_blake512.cu

@ -269,8 +269,6 @@ __host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce) @@ -269,8 +269,6 @@ __host__ void blake512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
if (BLOCKSIZE == 80)
blake512_gpu_hash<80><<<grid, block, shared_size>>>(threads, startNounce, d_hash5output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
else if (BLOCKSIZE == 84)

2
heavy/cuda_combine.cu

@ -141,8 +141,6 @@ void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *h @@ -141,8 +141,6 @@ void combine_cpu_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *h
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
combine_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, d_hashoutput[thr_id], d_hash2output[thr_id], d_hash3output[thr_id], d_hash4output[thr_id], d_hash5output[thr_id], d_nonceVector[thr_id]);
// da die Hash Auswertung noch auf der CPU erfolgt, müssen die Ergebnisse auf jeden Fall zum Host kopiert werden

2
heavy/cuda_groestl512.cu

@ -824,8 +824,6 @@ __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce) @@ -824,8 +824,6 @@ __host__ void groestl512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
if (BLOCKSIZE == 84)
groestl512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash4output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
else if (BLOCKSIZE == 80)

2
heavy/cuda_hefty1.cu

@ -416,8 +416,6 @@ __host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce) @@ -416,8 +416,6 @@ __host__ void hefty_cpu_hash(int thr_id, int threads, int startNounce)
size_t shared_size = 0;
#endif
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
hefty_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (void*)d_heftyHashes[thr_id]);
// Strategisches Sleep Kommando zur Senkung der CPU Last

1
heavy/cuda_keccak512.cu

@ -279,7 +279,6 @@ __host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce) @@ -279,7 +279,6 @@ __host__ void keccak512_cpu_hash(int thr_id, int threads, uint32_t startNounce)
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
if (BLOCKSIZE==84)
keccak512_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash3output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
else if (BLOCKSIZE==80)

1
heavy/cuda_sha256.cu

@ -271,7 +271,6 @@ __host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce) @@ -271,7 +271,6 @@ __host__ void sha256_cpu_hash(int thr_id, int threads, int startNounce)
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
if (BLOCKSIZE == 84)
sha256_gpu_hash<84><<<grid, block, shared_size>>>(threads, startNounce, d_hash2output[thr_id], d_heftyHashes[thr_id], d_nonceVector[thr_id]);
else if (BLOCKSIZE == 80) {

10
myriadgroestl.cpp

@ -35,17 +35,19 @@ static void myriadhash(void *state, const void *input) @@ -35,17 +35,19 @@ static void myriadhash(void *state, const void *input)
memcpy(state, hashB, 32);
}
extern bool opt_benchmark;
extern "C" int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t start_nonce = pdata[19]++;
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x000000ff;
uint32_t start_nonce = pdata[19]++;
const uint32_t throughPut = 128 * 1024;
// const uint32_t throughPut = 1;
uint32_t *outputHash = (uint32_t*)malloc(throughPut * 16 * sizeof(uint32_t));
// TODO: entfernen für eine Release! Ist nur zum Testen!
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;

1
quark/animecoin.cu

@ -175,7 +175,6 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata, @@ -175,7 +175,6 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
{
const uint32_t first_nonce = pdata[19];
// TODO: entfernen für eine Release! Ist nur zum Testen!
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x00000f;

4
quark/cuda_bmw512.cu

@ -447,8 +447,6 @@ __host__ void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNo @@ -447,8 +447,6 @@ __host__ void quark_bmw512_cpu_hash_64(int thr_id, int threads, uint32_t startNo
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
quark_bmw512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id);
}
@ -464,8 +462,6 @@ __host__ void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNo @@ -464,8 +462,6 @@ __host__ void quark_bmw512_cpu_hash_80(int thr_id, int threads, uint32_t startNo
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
quark_bmw512_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash);
MyStreamSynchronize(NULL, order, thr_id);
}

2
quark/cuda_jh512.cu

@ -350,8 +350,6 @@ __host__ void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNou @@ -350,8 +350,6 @@ __host__ void quark_jh512_cpu_hash_64(int thr_id, int threads, uint32_t startNou
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
quark_jh512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id);
}

4
quark/cuda_quark_blake512.cu

@ -406,8 +406,6 @@ __host__ void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t start @@ -406,8 +406,6 @@ __host__ void quark_blake512_cpu_hash_64(int thr_id, int threads, uint32_t start
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
quark_blake512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_nonceVector, (uint64_t*)d_outputHash);
// Strategisches Sleep Kommando zur Senkung der CPU Last
@ -425,8 +423,6 @@ __host__ void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t start @@ -425,8 +423,6 @@ __host__ void quark_blake512_cpu_hash_80(int thr_id, int threads, uint32_t start
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
quark_blake512_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);
// Strategisches Sleep Kommando zur Senkung der CPU Last

2
quark/cuda_quark_checkhash.cu

@ -89,8 +89,6 @@ __host__ uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t star @@ -89,8 +89,6 @@ __host__ uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t star
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
quark_check_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);
// Strategisches Sleep Kommando zur Senkung der CPU Last

414
quark/cuda_quark_groestl512.cu

@ -1,4 +1,4 @@ @@ -1,4 +1,4 @@
// Auf QuarkCoin spezialisierte Version von Groestl
// Auf QuarkCoin spezialisierte Version von Groestl inkl. Bitslice
#include <cuda.h>
#include "cuda_runtime.h"
@ -7,9 +7,6 @@ @@ -7,9 +7,6 @@
#include <stdio.h>
#include <memory.h>
// it's unfortunate that this is a compile time constant.
#define MAXWELL_OR_FERMI 1
// aus cpu-miner.c
extern int device_map[8];
@ -18,353 +15,137 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t @@ -18,353 +15,137 @@ extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int t
// Folgende Definitionen später durch header ersetzen
typedef unsigned char uint8_t;
typedef unsigned short uint16_t;
typedef unsigned int uint32_t;
typedef unsigned long long uint64_t;
// diese Struktur wird in der Init Funktion angefordert
static cudaDeviceProp props[8];
#define SPH_C32(x) ((uint32_t)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
#define PC32up(j, r) ((uint32_t)((j) + (r)))
#define PC32dn(j, r) 0
#define QC32up(j, r) 0xFFFFFFFF
#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))
#define B32_0(x) __byte_perm(x, 0, 0x4440)
//((x) & 0xFF)
#define B32_1(x) __byte_perm(x, 0, 0x4441)
//(((x) >> 8) & 0xFF)
#define B32_2(x) __byte_perm(x, 0, 0x4442)
//(((x) >> 16) & 0xFF)
#define B32_3(x) __byte_perm(x, 0, 0x4443)
//((x) >> 24)
#if MAXWELL_OR_FERMI
#define USE_SHARED 1
// Maxwell and Fermi cards get the best speed with SHARED access it seems.
#if USE_SHARED
#define T0up(x) (*((uint32_t*)mixtabs + ( (x))))
#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x))))
#define T1up(x) (*((uint32_t*)mixtabs + (512+(x))))
#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
#define T2up(x) (*((uint32_t*)mixtabs + (1024+(x))))
#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
#define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x))))
#else
#define T0up(x) tex1Dfetch(t0up1, x)
#define T0dn(x) tex1Dfetch(t0dn1, x)
#define T1up(x) tex1Dfetch(t1up1, x)
#define T1dn(x) tex1Dfetch(t1dn1, x)
#define T2up(x) tex1Dfetch(t2up1, x)
#define T2dn(x) tex1Dfetch(t2dn1, x)
#define T3up(x) tex1Dfetch(t3up1, x)
#define T3dn(x) tex1Dfetch(t3dn1, x)
#endif
#else
#define USE_SHARED 1
// a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5!
#define T0up(x) (*((uint32_t*)mixtabs + ( (x))))
#define T0dn(x) tex1Dfetch(t0dn1, x)
#define T1up(x) tex1Dfetch(t1up1, x)
#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
#define T2up(x) tex1Dfetch(t2up1, x)
#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
#define T3dn(x) tex1Dfetch(t3dn1, x)
#endif
texture<unsigned int, 1, cudaReadModeElementType> t0up1;
texture<unsigned int, 1, cudaReadModeElementType> t0dn1;
texture<unsigned int, 1, cudaReadModeElementType> t1up1;
texture<unsigned int, 1, cudaReadModeElementType> t1dn1;
texture<unsigned int, 1, cudaReadModeElementType> t2up1;
texture<unsigned int, 1, cudaReadModeElementType> t2dn1;
texture<unsigned int, 1, cudaReadModeElementType> t3up1;
texture<unsigned int, 1, cudaReadModeElementType> t3dn1;
extern uint32_t T0up_cpu[];
extern uint32_t T0dn_cpu[];
extern uint32_t T1up_cpu[];
extern uint32_t T1dn_cpu[];
extern uint32_t T2up_cpu[];
extern uint32_t T2dn_cpu[];
extern uint32_t T3up_cpu[];
extern uint32_t T3dn_cpu[];
__device__ __forceinline__ void quark_groestl512_perm_P(uint32_t *a, char *mixtabs)
{
uint32_t t[32];
// 64 Register Variante für Compute 3.0
#include "groestl_functions_quad.cu"
#include "bitslice_transformations_quad.cu"
//#pragma unroll 14
for(int r=0;r<14;r++)
__global__ void __launch_bounds__(256, 4)
quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
{
// durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
int thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 2;
if (thread < threads)
{
switch(r)
{
case 0:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 0); break;
case 1:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 1); break;
case 2:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 2); break;
case 3:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 3); break;
case 4:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 4); break;
case 5:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 5); break;
case 6:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 6); break;
case 7:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 7); break;
case 8:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 8); break;
case 9:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 9); break;
case 10:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 10); break;
case 11:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 11); break;
case 12:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 12); break;
case 13:
#pragma unroll 16
for(int k=0;k<16;k++) a[(k*2)+0] ^= PC32up(k<< 4, 13); break;
}
// GROESTL
uint32_t message[8];
uint32_t state[8];
// RBTT
#pragma unroll 16
for(int k=0;k<32;k+=2)
{
uint32_t t0_0 = B32_0(a[(k ) & 0x1f]), t9_0 = B32_0(a[(k + 9) & 0x1f]);
uint32_t t2_1 = B32_1(a[(k + 2) & 0x1f]), t11_1 = B32_1(a[(k + 11) & 0x1f]);
uint32_t t4_2 = B32_2(a[(k + 4) & 0x1f]), t13_2 = B32_2(a[(k + 13) & 0x1f]);
uint32_t t6_3 = B32_3(a[(k + 6) & 0x1f]), t23_3 = B32_3(a[(k + 23) & 0x1f]);
t[k + 0] = T0up( t0_0 ) ^ T1up( t2_1 ) ^ T2up( t4_2 ) ^ T3up( t6_3 ) ^
T0dn( t9_0 ) ^ T1dn( t11_1 ) ^ T2dn( t13_2 ) ^ T3dn( t23_3 );
t[k + 1] = T0dn( t0_0 ) ^ T1dn( t2_1 ) ^ T2dn( t4_2 ) ^ T3dn( t6_3 ) ^
T0up( t9_0 ) ^ T1up( t11_1 ) ^ T2up( t13_2 ) ^ T3up( t23_3 );
}
#pragma unroll 32
for(int k=0;k<32;k++)
a[k] = t[k];
}
}
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
__device__ __forceinline__ void quark_groestl512_perm_Q(uint32_t *a, char *mixtabs)
{
//#pragma unroll 14
for(int r=0;r<14;r++)
{
uint32_t t[32];
int hashPosition = nounce - startNounce;
uint32_t *inpHash = &g_hash[hashPosition<<4];
switch(r)
{
case 0:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 0); a[(k*2)+1] ^= QC32dn(k<< 4, 0);} break;
case 1:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 1); a[(k*2)+1] ^= QC32dn(k<< 4, 1);} break;
case 2:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 2); a[(k*2)+1] ^= QC32dn(k<< 4, 2);} break;
case 3:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 3); a[(k*2)+1] ^= QC32dn(k<< 4, 3);} break;
case 4:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 4); a[(k*2)+1] ^= QC32dn(k<< 4, 4);} break;
case 5:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 5); a[(k*2)+1] ^= QC32dn(k<< 4, 5);} break;
case 6:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 6); a[(k*2)+1] ^= QC32dn(k<< 4, 6);} break;
case 7:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 7); a[(k*2)+1] ^= QC32dn(k<< 4, 7);} break;
case 8:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 8); a[(k*2)+1] ^= QC32dn(k<< 4, 8);} break;
case 9:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 9); a[(k*2)+1] ^= QC32dn(k<< 4, 9);} break;
case 10:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 10); a[(k*2)+1] ^= QC32dn(k<< 4, 10);} break;
case 11:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 11); a[(k*2)+1] ^= QC32dn(k<< 4, 11);} break;
case 12:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 12); a[(k*2)+1] ^= QC32dn(k<< 4, 12);} break;
case 13:
#pragma unroll 16
for(int k=0;k<16;k++) { a[(k*2)+0] ^= QC32up(k<< 4, 13); a[(k*2)+1] ^= QC32dn(k<< 4, 13);} break;
}
#pragma unroll 4
for(int k=0;k<4;k++) message[k] = inpHash[(k<<2) + (threadIdx.x&0x03)];
#pragma unroll 4
for(int k=4;k<8;k++) message[k] = 0;
// RBTT
#pragma unroll 16
for(int k=0;k<32;k+=2)
if ((threadIdx.x&0x03) == 0) message[4] = 0x80;
if ((threadIdx.x&0x03) == 3) message[7] = 0x01000000;
uint32_t msgBitsliced[8];
to_bitslice_quad(message, msgBitsliced);
groestl512_progressMessage_quad(state, msgBitsliced);
// Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash
uint32_t *outpHash = &g_hash[hashPosition<<4];
uint32_t hash[16];
from_bitslice_quad(state, hash);
if ((threadIdx.x & 0x03) == 0)
{
uint32_t t2_0 = B32_0(a[(k + 2) & 0x1f]), t1_0 = B32_0(a[(k + 1) & 0x1f]);
uint32_t t6_1 = B32_1(a[(k + 6) & 0x1f]), t5_1 = B32_1(a[(k + 5) & 0x1f]);
uint32_t t10_2 = B32_2(a[(k + 10) & 0x1f]), t9_2 = B32_2(a[(k + 9) & 0x1f]);
uint32_t t22_3 = B32_3(a[(k + 22) & 0x1f]), t13_3 = B32_3(a[(k + 13) & 0x1f]);
t[k + 0] = T0up( t2_0 ) ^ T1up( t6_1 ) ^ T2up( t10_2 ) ^ T3up( t22_3 ) ^
T0dn( t1_0 ) ^ T1dn( t5_1 ) ^ T2dn( t9_2 ) ^ T3dn( t13_3 );
t[k + 1] = T0dn( t2_0 ) ^ T1dn( t6_1 ) ^ T2dn( t10_2 ) ^ T3dn( t22_3 ) ^
T0up( t1_0 ) ^ T1up( t5_1 ) ^ T2up( t9_2 ) ^ T3up( t13_3 );
#pragma unroll 16
for(int k=0;k<16;k++) outpHash[k] = hash[k];
}
#pragma unroll 32
for(int k=0;k<32;k++)
a[k] = t[k];
}
}
__global__ void quark_groestl512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
{
#if USE_SHARED
extern __shared__ char mixtabs[];
if (threadIdx.x < 256)
{
*((uint32_t*)mixtabs + ( threadIdx.x)) = tex1Dfetch(t0up1, threadIdx.x);
*((uint32_t*)mixtabs + (256+threadIdx.x)) = tex1Dfetch(t0dn1, threadIdx.x);
*((uint32_t*)mixtabs + (512+threadIdx.x)) = tex1Dfetch(t1up1, threadIdx.x);
*((uint32_t*)mixtabs + (768+threadIdx.x)) = tex1Dfetch(t1dn1, threadIdx.x);
*((uint32_t*)mixtabs + (1024+threadIdx.x)) = tex1Dfetch(t2up1, threadIdx.x);
*((uint32_t*)mixtabs + (1280+threadIdx.x)) = tex1Dfetch(t2dn1, threadIdx.x);
*((uint32_t*)mixtabs + (1536+threadIdx.x)) = tex1Dfetch(t3up1, threadIdx.x);
*((uint32_t*)mixtabs + (1792+threadIdx.x)) = tex1Dfetch(t3dn1, threadIdx.x);
}
__syncthreads();
#endif
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
__global__ void __launch_bounds__(256, 4)
quark_doublegroestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x)>>2;
if (thread < threads)
{
// GROESTL
uint32_t message[32];
uint32_t state[32];
uint32_t message[8];
uint32_t state[8];
uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
int hashPosition = nounce - startNounce;
uint32_t *inpHash = &g_hash[16 * hashPosition];
uint32_t *inpHash = &g_hash[hashPosition<<4];
#pragma unroll 16
for(int k=0;k<16;k++) message[k] = inpHash[k];
#pragma unroll 14
for(int k=1;k<15;k++)
message[k+16] = 0;
message[16] = 0x80;
message[31] = 0x01000000;
#pragma unroll 32
for(int u=0;u<32;u++) state[u] = message[u];
state[31] ^= 0x20000;
// Perm
#if USE_SHARED
quark_groestl512_perm_P(state, mixtabs);
state[31] ^= 0x20000;
quark_groestl512_perm_Q(message, mixtabs);
#else
quark_groestl512_perm_P(state, NULL);
state[31] ^= 0x20000;
quark_groestl512_perm_Q(message, NULL);
#endif
#pragma unroll 32
for(int u=0;u<32;u++) state[u] ^= message[u];
#pragma unroll 32
for(int u=0;u<32;u++) message[u] = state[u];
#if USE_SHARED
quark_groestl512_perm_P(message, mixtabs);
#else
quark_groestl512_perm_P(message, NULL);
#endif
#pragma unroll 32
for(int u=0;u<32;u++) state[u] ^= message[u];
// Erzeugten Hash rausschreiben
uint32_t *outpHash = &g_hash[16 * hashPosition];
#pragma unroll 4
for(int k=0;k<4;k++) message[k] = inpHash[(k<<2)+(threadIdx.x&0x03)];
#pragma unroll 4
for(int k=4;k<8;k++) message[k] = 0;
if ((threadIdx.x&0x03) == 0) message[4] = 0x80;
if ((threadIdx.x&0x03) == 3) message[7] = 0x01000000;
uint32_t msgBitsliced[8];
to_bitslice_quad(message, msgBitsliced);
for (int round=0; round<2; round++)
{
groestl512_progressMessage_quad(state, msgBitsliced);
if (round < 1)
{
// Verkettung zweier Runden inclusive Padding.
msgBitsliced[ 0] = __byte_perm(state[ 0], 0x00800100, 0x4341 + (((threadIdx.x%4)==3)<<13));
msgBitsliced[ 1] = __byte_perm(state[ 1], 0x00800100, 0x4341);
msgBitsliced[ 2] = __byte_perm(state[ 2], 0x00800100, 0x4341);
msgBitsliced[ 3] = __byte_perm(state[ 3], 0x00800100, 0x4341);
msgBitsliced[ 4] = __byte_perm(state[ 4], 0x00800100, 0x4341);
msgBitsliced[ 5] = __byte_perm(state[ 5], 0x00800100, 0x4341);
msgBitsliced[ 6] = __byte_perm(state[ 6], 0x00800100, 0x4341);
msgBitsliced[ 7] = __byte_perm(state[ 7], 0x00800100, 0x4341 + (((threadIdx.x%4)==0)<<4));
}
}
// Nur der erste von jeweils 4 Threads bekommt das Ergebns-Hash
uint32_t *outpHash = &g_hash[hashPosition<<4];
uint32_t hash[16];
from_bitslice_quad(state, hash);
if ((threadIdx.x & 0x03) == 0)
{
#pragma unroll 16
for(int k=0;k<16;k++) outpHash[k] = state[k+16];
for(int k=0;k<16;k++) outpHash[k] = hash[k];
}
}
}
#define texDef(texname, texmem, texsource, texsize) \
unsigned int *texmem; \
cudaMalloc(&texmem, texsize); \
cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
texname.normalized = 0; \
texname.filterMode = cudaFilterModePoint; \
texname.addressMode[0] = cudaAddressModeClamp; \
{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
// Setup-Funktionen
__host__ void quark_groestl512_cpu_init(int thr_id, int threads)
{
cudaGetDeviceProperties(&props[thr_id], device_map[thr_id]);
// Texturen mit obigem Makro initialisieren
texDef(t0up1, d_T0up, T0up_cpu, sizeof(uint32_t)*256);
texDef(t0dn1, d_T0dn, T0dn_cpu, sizeof(uint32_t)*256);
texDef(t1up1, d_T1up, T1up_cpu, sizeof(uint32_t)*256);
texDef(t1dn1, d_T1dn, T1dn_cpu, sizeof(uint32_t)*256);
texDef(t2up1, d_T2up, T2up_cpu, sizeof(uint32_t)*256);
texDef(t2dn1, d_T2dn, T2dn_cpu, sizeof(uint32_t)*256);
texDef(t3up1, d_T3up, T3up_cpu, sizeof(uint32_t)*256);
texDef(t3dn1, d_T3dn, T3dn_cpu, sizeof(uint32_t)*256);
}
__host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{
// Compute 3.5 und 5.x Geräte am besten mit 768 Threads ansteuern,
// alle anderen mit 512 Threads.
int threadsperblock = ((props[thr_id].major == 3 && props[thr_id].minor == 5) || props[thr_id].major > 3) ? 768 : 512;
int threadsperblock = 256;
// Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
// mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
const int factor = 4;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs
#if USE_SHARED
size_t shared_size = 8 * 256 * sizeof(uint32_t);
#else
size_t shared_size = 0;
#endif
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
//fprintf(stderr, "ThrID: %d\n", thr_id);
quark_groestl512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
quark_groestl512_gpu_hash_64_quad<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
// Strategisches Sleep Kommando zur Senkung der CPU Last
MyStreamSynchronize(NULL, order, thr_id);
@ -372,25 +153,20 @@ __host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t sta @@ -372,25 +153,20 @@ __host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t sta
__host__ void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
{
// Compute 3.5 und 5.x Geräte am besten mit 768 Threads ansteuern,
// alle anderen mit 512 Threads.
int threadsperblock = ((props[thr_id].major == 3 && props[thr_id].minor == 5) || props[thr_id].major > 3) ? 768 : 512;
int threadsperblock = 256;
// Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
// mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
const int factor = 4;
// berechne wie viele Thread Blocks wir brauchen
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
dim3 block(threadsperblock);
// Größe des dynamischen Shared Memory Bereichs
#if USE_SHARED
size_t shared_size = 8 * 256 * sizeof(uint32_t);
#else
size_t shared_size = 0;
#endif
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
//fprintf(stderr, "ThrID: %d\n", thr_id);
quark_groestl512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
quark_groestl512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
quark_doublegroestl512_gpu_hash_64_quad<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
// Strategisches Sleep Kommando zur Senkung der CPU Last
MyStreamSynchronize(NULL, order, thr_id);

2
quark/cuda_quark_keccak512.cu

@ -175,8 +175,6 @@ __host__ void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t star @@ -175,8 +175,6 @@ __host__ void quark_keccak512_cpu_hash_64(int thr_id, int threads, uint32_t star
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
quark_keccak512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id);
}

1
quark/cuda_skein512.cu

@ -442,7 +442,6 @@ __host__ void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t start @@ -442,7 +442,6 @@ __host__ void quark_skein512_cpu_hash_64(int thr_id, int threads, uint32_t start
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
quark_skein512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
// Strategisches Sleep Kommando zur Senkung der CPU Last

1
quark/quarkcoin.cu

@ -157,7 +157,6 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata, @@ -157,7 +157,6 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
{
const uint32_t first_nonce = pdata[19];
// TODO: entfernen für eine Release! Ist nur zum Testen!
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;

2
x11/cuda_x11_cubehash512.cu

@ -307,8 +307,6 @@ __host__ void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t star @@ -307,8 +307,6 @@ __host__ void x11_cubehash512_cpu_hash_64(int thr_id, int threads, uint32_t star
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
x11_cubehash512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id);
}

2
x11/cuda_x11_echo.cu

@ -225,8 +225,6 @@ __host__ void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNou @@ -225,8 +225,6 @@ __host__ void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNou
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
x11_echo512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id);
}

2
x11/cuda_x11_luffa512.cu

@ -376,8 +376,6 @@ __host__ void x11_luffa512_cpu_hash_64(int thr_id, int threads, uint32_t startNo @@ -376,8 +376,6 @@ __host__ void x11_luffa512_cpu_hash_64(int thr_id, int threads, uint32_t startNo
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
x11_luffa512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id);
}

2
x11/cuda_x11_shavite512.cu

@ -1372,8 +1372,6 @@ __host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t start @@ -1372,8 +1372,6 @@ __host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t start
// Größe des dynamischen Shared Memory Bereichs
size_t shared_size = 0;
// fprintf(stderr, "threads=%d, %d blocks, %d threads per block, %d bytes shared\n", threads, grid.x, block.x, shared_size);
x11_shavite512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
MyStreamSynchronize(NULL, order, thr_id);
}

5
x11/x11.cu

@ -162,11 +162,8 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, @@ -162,11 +162,8 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
{
const uint32_t first_nonce = pdata[19];
// TODO: entfernen für eine Release! Ist nur zum Testen!
if (opt_benchmark) {
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x0000ff;
pdata[17] = 0;
}
const uint32_t Htarg = ptarget[7];

Loading…
Cancel
Save