From 95ac1d0f194a36695f60fe2da627a38baa21f38d Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 6 Sep 2014 20:54:41 +0200 Subject: [PATCH] x11: adapt some blake 256 opts to 512 one blake512: for the moment 6.2ms vs 7.12 before (+10%) --- cuda_nist5.cu | 9 +- quark/cuda_quark_blake512.cu | 156 ++++++++++++++++------------------- quark/quarkcoin.cu | 31 ++----- x11/x11.cu | 18 ++-- x15/x14.cu | 4 +- x15/x15.cu | 31 +------ x17/x17.cu | 18 +--- 7 files changed, 98 insertions(+), 169 deletions(-) diff --git a/cuda_nist5.cu b/cuda_nist5.cu index 2feb32e..419c1a5 100644 --- a/cuda_nist5.cu +++ b/cuda_nist5.cu @@ -5,9 +5,11 @@ extern "C" #include "sph/sph_skein.h" #include "sph/sph_jh.h" #include "sph/sph_keccak.h" +} + #include "miner.h" + #include "cuda_helper.h" -} // aus cpu-miner.c extern int device_map[8]; @@ -74,9 +76,6 @@ extern "C" void nist5hash(void *state, const void *input) memcpy(state, hash, 32); } - -extern bool opt_benchmark; - extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) @@ -84,7 +83,7 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata, const uint32_t first_nonce = pdata[19]; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; + ((uint32_t*)ptarget)[7] = 0x00FF; const uint32_t Htarg = ptarget[7]; diff --git a/quark/cuda_quark_blake512.cu b/quark/cuda_quark_blake512.cu index e3d299d..787b8a0 100644 --- a/quark/cuda_quark_blake512.cu +++ b/quark/cuda_quark_blake512.cu @@ -50,59 +50,60 @@ const uint64_t c_u512[16] = 0x0801f2e2858efc16ULL, 0x636920d871574e69ULL }; -#define G(a,b,c,d,e) \ - v[a] += (m[sigma[i][e]] ^ u512[sigma[i][e+1]]) + v[b];\ - v[d] = ROTR( v[d] ^ v[a],32); \ - v[c] += v[d]; \ - v[b] = ROTR( v[b] ^ v[c],25); \ - v[a] += (m[sigma[i][e+1]] ^ u512[sigma[i][e]])+v[b]; \ - v[d] = ROTR( v[d] ^ v[a],16); \ - v[c] += v[d]; \ - v[b] = ROTR( v[b] ^ v[c],11); - +#define G(a,b,c,d,x) { \ + uint32_t idx1 = sigma[i][x]; \ + uint32_t idx2 = sigma[i][x+1]; \ + v[a] += (m[idx1] ^ u512[idx2]) + v[b]; \ + v[d] = ROTR( v[d] ^ v[a], 32); \ + v[c] += v[d]; \ + v[b] = ROTR( v[b] ^ v[c], 25); \ + v[a] += (m[idx2] ^ u512[idx1]) + v[b]; \ + v[d] = ROTR( v[d] ^ v[a], 16); \ + v[c] += v[d]; \ + v[b] = ROTR( v[b] ^ v[c], 11); \ +} __device__ static -void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int bits ) +void quark_blake512_compress( uint64_t *h, const uint64_t *block, const uint8_t ((*sigma)[16]), const uint64_t *u512, const int T0) { uint64_t v[16], m[16], i; -#pragma unroll 16 - for( i = 0; i < 16; ++i ) { - m[i] = cuda_swab64(block[i]); - } - -#pragma unroll 8 - for( i = 0; i < 8; ++i ) v[i] = h[i]; - - v[ 8] = u512[0]; - v[ 9] = u512[1]; - v[10] = u512[2]; - v[11] = u512[3]; - v[12] = u512[4]; - v[13] = u512[5]; - v[14] = u512[6]; - v[15] = u512[7]; - - v[12] ^= bits; - v[13] ^= bits; - -//#pragma unroll 16 - for( i = 0; i < 16; ++i ) - { - /* column step */ - G( 0, 4, 8, 12, 0 ); - G( 1, 5, 9, 13, 2 ); - G( 2, 6, 10, 14, 4 ); - G( 3, 7, 11, 15, 6 ); - /* diagonal step */ - G( 0, 5, 10, 15, 8 ); - G( 1, 6, 11, 12, 10 ); - G( 2, 7, 8, 13, 12 ); - G( 3, 4, 9, 14, 14 ); - } - -#pragma unroll 16 - for( i = 0; i < 16; ++i ) h[i % 8] ^= v[i]; + #pragma unroll 16 + for( i = 0; i < 16; i++) { + m[i] = cuda_swab64(block[i]); + } + + #pragma unroll 8 + for (i = 0; i < 8; i++) + v[i] = h[i]; + + v[ 8] = u512[0]; + v[ 9] = u512[1]; + v[10] = u512[2]; + v[11] = u512[3]; + v[12] = u512[4] ^ T0; + v[13] = u512[5] ^ T0; + v[14] = u512[6]; + v[15] = u512[7]; + + //#pragma unroll 16 + for( i = 0; i < 16; ++i ) + { + /* column step */ + G( 0, 4, 8, 12, 0 ); + G( 1, 5, 9, 13, 2 ); + G( 2, 6, 10, 14, 4 ); + G( 3, 7, 11, 15, 6 ); + /* diagonal step */ + G( 0, 5, 10, 15, 8 ); + G( 1, 6, 11, 12, 10 ); + G( 2, 7, 8, 13, 12 ); + G( 3, 4, 9, 14, 14 ); + } + + #pragma unroll 16 + for( i = 0; i < 16; ++i ) + h[i % 8] ^= v[i]; } __device__ __constant__ @@ -114,7 +115,8 @@ static const uint64_t d_constMem[8] = { 0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL, 0x1f83d9abfb41bd6bULL, - 0x5be0cd19137e2179ULL }; + 0x5be0cd19137e2179ULL +}; // Hash-Padding __device__ __constant__ @@ -126,7 +128,8 @@ static const uint64_t d_constHashPadding[8] = { 0, 0x0100000000000000ull, 0, - 0x0002000000000000ull }; + 0x0002000000000000ull +}; __global__ __launch_bounds__(256, 4) void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint64_t *g_hash) @@ -145,48 +148,42 @@ void quark_blake512_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_n if (thread < threads) #endif { - uint8_t i; - // bestimme den aktuellen Zähler uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); int hashPosition = nounce - startNounce; uint64_t *inpHash = &g_hash[hashPosition<<3]; // hashPosition * 8 - // 128 Byte für die Message + // 128 Bytes uint64_t buf[16]; - // State vorbereiten + // State uint64_t h[8]; #pragma unroll 8 - for (i=0;i<8;i++) + for (int i=0;i<8;i++) h[i] = d_constMem[i]; - // Message für die erste Runde in Register holen + // Message for first round #pragma unroll 8 - for (i=0; i < 8; ++i) + for (int i=0; i < 8; ++i) buf[i] = inpHash[i]; #pragma unroll 8 - for (i=0; i < 8; i++) + for (int i=0; i < 8; i++) buf[i+8] = d_constHashPadding[i]; - // die einzige Hashing-Runde + // Ending round quark_blake512_compress( h, buf, c_sigma, c_u512, 512 ); -#if __CUDA_ARCH__ >= 130 - // ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind +#if __CUDA_ARCH__ <= 350 uint32_t *outHash = (uint32_t*)&g_hash[8 * hashPosition]; #pragma unroll 8 - for (i=0; i < 8; ++i) { + for (int i=0; i < 8; i++) { outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) ); outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) ); } #else - // in dieser Version passieren auch ein paar 64 Bit Shifts uint64_t *outHash = &g_hash[8 * hashPosition]; - #pragma unroll 8 - for (i=0; i < 8; ++i) - { + for (int i=0; i < 8; i++) { outHash[i] = cuda_swab64(h[i]); } #endif @@ -198,45 +195,38 @@ __global__ void quark_blake512_gpu_hash_80(int threads, uint32_t startNounce, vo int thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - // State vorbereiten uint64_t h[8]; - // 128 Byte für die Message uint64_t buf[16]; - uint8_t i; - // bestimme den aktuellen Zähler uint32_t nounce = startNounce + thread; #pragma unroll 8 - for(i=0;i<8;i++) + for(int i=0; i<8; i++) h[i] = d_constMem[i]; // Message für die erste Runde in Register holen #pragma unroll 16 - for (i=0; i < 16; ++i) buf[i] = c_PaddedMessage80[i]; + for (int i=0; i < 16; ++i) + buf[i] = c_PaddedMessage80[i]; - // die Nounce durch die thread-spezifische ersetzen - buf[9] = REPLACE_HIWORD(buf[9], cuda_swab32(nounce)); + // The test Nonce + ((uint32_t*)buf)[19] = cuda_swab32(nounce); - // die einzige Hashing-Runde quark_blake512_compress( h, buf, c_sigma, c_u512, 640 ); - // Hash rauslassen -#if __CUDA_ARCH__ >= 130 - // ausschliesslich 32 bit Operationen sofern die SM1.3 double intrinsics verfügbar sind +#if __CUDA_ARCH__ <= 350 uint32_t *outHash = (uint32_t *)outputHash + 16 * thread; #pragma unroll 8 - for (i=0; i < 8; ++i) { - outHash[2*i+0] = cuda_swab32( _HIWORD(h[i]) ); + for (uint32_t i=0; i < 8; i++) { + outHash[2*i] = cuda_swab32( _HIWORD(h[i]) ); outHash[2*i+1] = cuda_swab32( _LOWORD(h[i]) ); } #else - // in dieser Version passieren auch ein paar 64 Bit Shifts uint64_t *outHash = (uint64_t *)outputHash + 8 * thread; - #pragma unroll 8 - for (i=0; i < 8; ++i) { + for (uint32_t i=0; i < 8; i++) { outHash[i] = cuda_swab64( h[i] ); } #endif + } } diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu index be6eda8..a905ec4 100644 --- a/quark/quarkcoin.cu +++ b/quark/quarkcoin.cu @@ -6,12 +6,12 @@ extern "C" #include "sph/sph_skein.h" #include "sph/sph_jh.h" #include "sph/sph_keccak.h" +} + #include "miner.h" #include "cuda_helper.h" -} -// aus cpu-miner.c extern int device_map[8]; // Speicher für Input/Output der verketteten Hashfunktionen @@ -70,76 +70,64 @@ extern "C" void quarkhash(void *state, const void *input) unsigned char hash[64]; sph_blake512_init(&ctx_blake); - // ZBLAKE; sph_blake512 (&ctx_blake, input, 80); sph_blake512_close(&ctx_blake, (void*) hash); sph_bmw512_init(&ctx_bmw); - // ZBMW; sph_bmw512 (&ctx_bmw, (const void*) hash, 64); sph_bmw512_close(&ctx_bmw, (void*) hash); if (hash[0] & 0x8) { sph_groestl512_init(&ctx_groestl); - // ZGROESTL; sph_groestl512 (&ctx_groestl, (const void*) hash, 64); sph_groestl512_close(&ctx_groestl, (void*) hash); } else { sph_skein512_init(&ctx_skein); - // ZSKEIN; sph_skein512 (&ctx_skein, (const void*) hash, 64); sph_skein512_close(&ctx_skein, (void*) hash); } sph_groestl512_init(&ctx_groestl); - // ZGROESTL; sph_groestl512 (&ctx_groestl, (const void*) hash, 64); sph_groestl512_close(&ctx_groestl, (void*) hash); sph_jh512_init(&ctx_jh); - // ZJH; sph_jh512 (&ctx_jh, (const void*) hash, 64); sph_jh512_close(&ctx_jh, (void*) hash); if (hash[0] & 0x8) { sph_blake512_init(&ctx_blake); - // ZBLAKE; sph_blake512 (&ctx_blake, (const void*) hash, 64); sph_blake512_close(&ctx_blake, (void*) hash); } else { sph_bmw512_init(&ctx_bmw); - // ZBMW; sph_bmw512 (&ctx_bmw, (const void*) hash, 64); sph_bmw512_close(&ctx_bmw, (void*) hash); } sph_keccak512_init(&ctx_keccak); - // ZKECCAK; sph_keccak512 (&ctx_keccak, (const void*) hash, 64); sph_keccak512_close(&ctx_keccak, (void*) hash); sph_skein512_init(&ctx_skein); - // SKEIN; sph_skein512 (&ctx_skein, (const void*) hash, 64); sph_skein512_close(&ctx_skein, (void*) hash); if (hash[0] & 0x8) { sph_keccak512_init(&ctx_keccak); - // ZKECCAK; sph_keccak512 (&ctx_keccak, (const void*) hash, 64); sph_keccak512_close(&ctx_keccak, (void*) hash); } else { sph_jh512_init(&ctx_jh); - // ZJH; sph_jh512 (&ctx_jh, (const void*) hash, 64); sph_jh512_close(&ctx_jh, (void*) hash); } @@ -147,23 +135,17 @@ extern "C" void quarkhash(void *state, const void *input) memcpy(state, hash, 32); } - -extern bool opt_benchmark; - extern "C" int scanhash_quark(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) { const uint32_t first_nonce = pdata[19]; + const int throughput = 256*4096; // 100; + static bool init[8] = {0,0,0,0,0,0,0,0}; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; - - const uint32_t Htarg = ptarget[7]; + ((uint32_t*)ptarget)[7] = 0x00FF; - const int throughput = 256*4096; // 100; - - static bool init[8] = {0,0,0,0,0,0,0,0}; if (!init[thr_id]) { cudaSetDevice(device_map[thr_id]); @@ -252,11 +234,12 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata, uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++); if (foundNonce != 0xffffffff) { + const uint32_t Htarg = ptarget[7]; uint32_t vhash64[8]; be32enc(&endiandata[19], foundNonce); quarkhash(vhash64, endiandata); - if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) { + if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { pdata[19] = foundNonce; *hashes_done = (foundNonce - first_nonce + 1)/2; diff --git a/x11/x11.cu b/x11/x11.cu index 3c18030..dc2f97f 100644 --- a/x11/x11.cu +++ b/x11/x11.cu @@ -21,10 +21,9 @@ extern "C" #include } -// aus cpu-miner.c +// in cpu-miner.c extern int device_map[8]; -// Speicher für Input/Output der verketteten Hashfunktionen static uint32_t *d_hash[8]; extern void quark_blake512_cpu_init(int thr_id, int threads); @@ -140,22 +139,17 @@ extern "C" void x11hash(void *output, const void *input) } -extern bool opt_benchmark; - extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) { const uint32_t first_nonce = pdata[19]; + const int throughput = 256*256*8; + static bool init[8] = {0,0,0,0,0,0,0,0}; if (opt_benchmark) ((uint32_t*)ptarget)[7] = 0x0000ff; - const uint32_t Htarg = ptarget[7]; - - const int throughput = 256*256*8; - - static bool init[8] = {0,0,0,0,0,0,0,0}; if (!init[thr_id]) { CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id])); @@ -186,8 +180,10 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, cuda_check_cpu_setTarget(ptarget); do { - uint32_t foundNonce; + const uint32_t Htarg = ptarget[7]; + int order = 0; + uint32_t foundNonce; // Hash with CUDA quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); @@ -204,7 +200,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, // Scan nach Gewinner Hashes auf der GPU foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - if (foundNonce != 0xffffffff) + if (foundNonce != 0xffffffff) { uint32_t vhash64[8]; be32enc(&endiandata[19], foundNonce); diff --git a/x15/x14.cu b/x15/x14.cu index 0b56584..b3519cd 100644 --- a/x15/x14.cu +++ b/x15/x14.cu @@ -20,11 +20,11 @@ extern "C" { #include "sph/sph_hamsi.h" #include "sph/sph_fugue.h" #include "sph/sph_shabal.h" +} #include "miner.h" #include "cuda_helper.h" -} // from cpu-miner.c extern int device_map[8]; @@ -167,8 +167,6 @@ extern "C" void x14hash(void *output, const void *input) } -extern bool opt_benchmark; - extern "C" int scanhash_x14(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) diff --git a/x15/x15.cu b/x15/x15.cu index 50e2080..faea354 100644 --- a/x15/x15.cu +++ b/x15/x15.cu @@ -21,14 +21,11 @@ extern "C" { #include "sph/sph_fugue.h" #include "sph/sph_shabal.h" #include "sph/sph_whirlpool.h" +} #include "miner.h" #include "cuda_helper.h" -} - -// to test gpu hash on a null buffer -#define NULLTEST 0 // from cpu-miner.c extern int device_map[8]; @@ -92,8 +89,6 @@ extern void quark_compactTest_cpu_init(int thr_id, int threads); extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order); -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - // X15 CPU Hash function extern "C" void x15hash(void *output, const void *input) { @@ -181,17 +176,6 @@ extern "C" void x15hash(void *output, const void *input) memcpy(output, hash, 32); } -#if NULLTEST -static void print_hash(unsigned char *hash) -{ - for (int i=0; i < 32; i += 4) { - printf("%02x%02x%02x%02x ", hash[i], hash[i+1], hash[i+2], hash[i+3]); - } -} -#endif - -extern bool opt_benchmark; - extern "C" int scanhash_x15(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done) @@ -203,12 +187,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata, uint32_t Htarg = ptarget[7]; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = Htarg = 0x0000ff; - -#if NULLTEST - for (int k=0; k < 20; k++) - pdata[k] = 0; -#endif + ((uint32_t*)ptarget)[7] = Htarg = 0x00FF; if (!init[thr_id]) { @@ -259,12 +238,6 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata, x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); -#if NULLTEST - uint32_t buf[8]; memset(buf, 0, sizeof buf); - CUDA_SAFE_CALL(cudaMemcpy(buf, d_hash[thr_id], sizeof buf, cudaMemcpyDeviceToHost)); - CUDA_SAFE_CALL(cudaThreadSynchronize()); - print_hash((unsigned char*)buf); printf("\n"); -#endif /* Scan with GPU */ uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); diff --git a/x17/x17.cu b/x17/x17.cu index ffcd57a..65d2259 100644 --- a/x17/x17.cu +++ b/x17/x17.cu @@ -26,17 +26,15 @@ extern "C" #include "sph/sph_sha2.h" #include "sph/sph_haval.h" +} #include "miner.h" -} +#include "cuda_helper.h" static uint32_t *d_hash[8]; - -// cpu-miner.c +// in cpu-miner.c extern int device_map[8]; -extern bool opt_benchmark; - extern void quark_blake512_cpu_init(int thr_id, int threads); extern void quark_blake512_cpu_setBlock_80(void *pdata); @@ -204,20 +202,12 @@ extern "C" int scanhash_x17(int thr_id, uint32_t *pdata, unsigned long *hashes_done) { const uint32_t first_nonce = pdata[19]; - - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; - const int throughput = 256*256*8; - - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; - static bool init[8] = {0,0,0,0,0,0,0,0}; uint32_t Htarg = ptarget[7]; if (opt_benchmark) - ((uint32_t*)ptarget)[7] = Htarg = 0x0000ff; + ((uint32_t*)ptarget)[7] = Htarg = 0x00FF; if (!init[thr_id]) {