From eae4ede11170a530da2c6de8e63d90742c2a4c53 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Thu, 23 Jun 2016 03:53:55 +0200 Subject: [PATCH] decred: return to previous implementation + second nonce seems better on windows and a bit easier to read... --- Algo256/decred.cu | 585 ++++++++++++++++++++-------------------------- 1 file changed, 253 insertions(+), 332 deletions(-) diff --git a/Algo256/decred.cu b/Algo256/decred.cu index 7c072b2..9e5c488 100644 --- a/Algo256/decred.cu +++ b/Algo256/decred.cu @@ -1,5 +1,5 @@ /** - * Blake-256 Decred 180-Bytes input Cuda Kernel (Tested on SM 5/5.2) + * Blake-256 Decred 180-Bytes input Cuda Kernel (Tested on SM 5/5.2/6.1) * * Tanguy Pruvot - Feb 2016 * @@ -17,11 +17,10 @@ extern "C" { } /* threads per block */ -#define TPB 768 -#define NPT 192 -#define maxResults 8 -/* max count of found nonces in one call */ -#define NBN 2 +#define TPB 640 + +/* max count of found nonces in one call (like sgminer) */ +#define maxResults 4 /* hash by cpu with blake 256 */ extern "C" void decred_hash(void *output, const void *input) @@ -39,311 +38,182 @@ extern "C" void decred_hash(void *output, const void *input) #ifdef __INTELLISENSE__ #define __byte_perm(x, y, b) x -#define atomicInc(p, max) (*p) +#define atomicInc(p, max) (*p)++ #endif -__constant__ uint32_t c_m[3]; -__constant__ uint32_t _ALIGN(8) c_h[2]; -__constant__ uint32_t _ALIGN(32) c_v[16]; -__constant__ uint32_t _ALIGN(32) c_x[90]; +__constant__ uint32_t _ALIGN(16) c_h[2]; +__constant__ uint32_t _ALIGN(16) c_data[32]; +__constant__ uint32_t _ALIGN(16) c_xors[215]; /* Buffers of candidate nonce(s) */ static uint32_t *d_resNonce[MAX_GPUS]; static uint32_t *h_resNonce[MAX_GPUS]; -__device__ __forceinline__ -uint32_t ROR8(const uint32_t a) { - return __byte_perm(a, 0, 0x0321); -} +#define ROR8(a) __byte_perm(a, 0, 0x0321) +#define ROL16(a) __byte_perm(a, 0, 0x1032) -__device__ __forceinline__ -uint32_t ROL16(const uint32_t a) { - return __byte_perm(a, 0, 0x1032); -} - -__device__ __forceinline__ -uint32_t xor3x(uint32_t a, uint32_t b, uint32_t c) { - uint32_t result; -#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 - asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result) : "r"(a), "r"(b),"r"(c)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA -#else - result = a^b^c; -#endif - return result; -} - -#define GSn(a,b,c,d,x,y) { \ - v[a]+= x + v[b]; \ +/* macro bodies */ +#define pxorGS(a,b,c,d) { \ + v[a]+= c_xors[i++] + v[b]; \ v[d] = ROL16(v[d] ^ v[a]); \ v[c]+= v[d]; \ v[b] = ROTR32(v[b] ^ v[c], 12); \ - v[a]+= y + v[b]; \ + v[a]+= c_xors[i++] + v[b]; \ v[d] = ROR8(v[d] ^ v[a]); \ v[c]+= v[d]; \ v[b] = ROTR32(v[b] ^ v[c], 7); \ } -#define GSn3(a,b,c,d,x,y, a1,b1,c1,d1,x1,y1, a2,b2,c2,d2,x2,y2) { \ - v[ a]+= x + v[ b]; v[a1]+= x1 + v[b1]; v[a2]+= x2 + v[b2];\ - v[ d] = ROL16(v[ d] ^ v[ a]); v[d1] = ROL16(v[d1] ^ v[a1]); v[d2] = ROL16(v[d2] ^ v[a2]);\ - v[ c]+= v[ d]; v[c1]+= v[d1]; v[c2]+= v[d2];\ - v[ b] = ROTR32(v[ b] ^ v[ c], 12); v[b1] = ROTR32(v[b1] ^ v[c1], 12); v[b2] = ROTR32(v[b2] ^ v[c2], 12);\ - v[ a]+= y + v[ b]; v[a1]+= y1 + v[b1]; v[a2]+= y2 + v[b2];\ - v[ d] = ROR8(v[ d] ^ v[ a]); v[d1] = ROR8(v[d1] ^ v[a1]); v[d2] = ROR8(v[d2] ^ v[a2]);\ - v[ c]+= v[ d]; v[c1]+= v[d1]; v[c2]+= v[d2];\ - v[ b] = ROTR32(v[ b] ^ v[ c], 7); v[b1] = ROTR32(v[b1] ^ v[c1], 7); v[b2] = ROTR32(v[b2] ^ v[c2], 7);\ +#define pxorGS2(a,b,c,d, a1,b1,c1,d1) {\ + v[ a]+= c_xors[i++] + v[ b]; v[a1]+= c_xors[i++] + v[b1]; \ + v[ d] = ROL16(v[ d] ^ v[ a]); v[d1] = ROL16(v[d1] ^ v[a1]); \ + v[ c]+= v[ d]; v[c1]+= v[d1]; \ + v[ b] = ROTR32(v[ b] ^ v[ c], 12); v[b1] = ROTR32(v[b1] ^ v[c1], 12); \ + v[ a]+= c_xors[i++] + v[ b]; v[a1]+= c_xors[i++] + v[b1]; \ + v[ d] = ROR8(v[ d] ^ v[ a]); v[d1] = ROR8(v[d1] ^ v[a1]); \ + v[ c]+= v[ d]; v[c1]+= v[d1]; \ + v[ b] = ROTR32(v[ b] ^ v[ c], 7); v[b1] = ROTR32(v[b1] ^ v[c1], 7); \ } -#define GSn4(a,b,c,d,x,y, a1,b1,c1,d1,x1,y1, a2,b2,c2,d2,x2,y2, a3,b3,c3,d3,x3,y3) { \ - v[ a]+= x + v[ b]; v[a1]+= x1 + v[b1]; v[a2]+= x2 + v[b2]; v[a3]+= x3 + v[b3]; \ - v[ d] = ROL16(v[ d] ^ v[ a]); v[d1] = ROL16(v[d1] ^ v[a1]); v[d2] = ROL16(v[d2] ^ v[a2]); v[d3] = ROL16(v[d3] ^ v[a3]); \ - v[ c]+= v[ d]; v[c1]+= v[d1]; v[c2]+= v[d2]; v[c3]+= v[d3]; \ - v[ b] = ROTR32(v[ b] ^ v[ c], 12); v[b1] = ROTR32(v[b1] ^ v[c1], 12); v[b2] = ROTR32(v[b2] ^ v[c2], 12); v[b3] = ROTR32(v[b3] ^ v[c3], 12); \ - v[ a]+= y + v[ b]; v[a1]+= y1 + v[b1]; v[a2]+= y2 + v[b2]; v[a3]+= y3 + v[b3]; \ - v[ d] = ROR8(v[ d] ^ v[ a]); v[d1] = ROR8(v[d1] ^ v[a1]); v[d2] = ROR8(v[d2] ^ v[a2]); v[d3] = ROR8(v[d3] ^ v[a3]); \ - v[ c]+= v[ d]; v[c1]+= v[d1]; v[c2]+= v[d2]; v[c3]+= v[d3]; \ - v[ b] = ROTR32(v[ b] ^ v[ c], 7); v[b1] = ROTR32(v[b1] ^ v[c1], 7); v[b2] = ROTR32(v[b2] ^ v[c2], 7); v[b3] = ROTR32(v[b3] ^ v[c3], 7); \ +#define pxory1GS2(a,b,c,d, a1,b1,c1,d1) { \ + v[ a]+= c_xors[i++] + v[ b]; v[a1]+= c_xors[i++] + v[b1]; \ + v[ d] = ROL16(v[ d] ^ v[ a]); v[d1] = ROL16(v[d1] ^ v[a1]); \ + v[ c]+= v[ d]; v[c1]+= v[d1]; \ + v[ b] = ROTR32(v[ b] ^ v[ c], 12); v[b1] = ROTR32(v[b1] ^ v[c1], 12); \ + v[ a]+= c_xors[i++] + v[ b]; v[a1]+= (c_xors[i++]^nonce) + v[b1]; \ + v[ d] = ROR8(v[ d] ^ v[ a]); v[d1] = ROR8(v[d1] ^ v[a1]); \ + v[ c]+= v[ d]; v[c1]+= v[d1]; \ + v[ b] = ROTR32(v[ b] ^ v[ c], 7); v[b1] = ROTR32(v[b1] ^ v[c1], 7); \ +} + +#define pxory0GS2(a,b,c,d, a1,b1,c1,d1) { \ + v[ a]+= c_xors[i++] + v[ b]; v[a1]+= c_xors[i++] + v[b1]; \ + v[ d] = ROL16(v[ d] ^ v[ a]); v[d1] = ROL16(v[d1] ^ v[a1]); \ + v[ c]+= v[ d]; v[c1]+= v[d1]; \ + v[ b] = ROTR32(v[ b] ^ v[ c], 12); v[b1] = ROTR32(v[b1] ^ v[c1], 12); \ + v[ a]+= (c_xors[i++]^nonce) + v[ b]; v[a1]+= c_xors[i++] + v[b1]; \ + v[ d] = ROR8(v[ d] ^ v[ a]); v[d1] = ROR8(v[d1] ^ v[a1]); \ + v[ c]+= v[ d]; v[c1]+= v[d1]; \ + v[ b] = ROTR32(v[ b] ^ v[ c], 7); v[b1] = ROTR32(v[b1] ^ v[c1], 7); \ +} + +#define pxorx1GS2(a,b,c,d, a1,b1,c1,d1) { \ + v[ a]+= c_xors[i++] + v[ b]; v[a1]+= (c_xors[i++]^nonce) + v[b1]; \ + v[ d] = ROL16(v[ d] ^ v[ a]); v[d1] = ROL16(v[d1] ^ v[a1]); \ + v[ c]+= v[ d]; v[c1]+= v[d1]; \ + v[ b] = ROTR32(v[ b] ^ v[ c], 12); v[b1] = ROTR32(v[b1] ^ v[c1], 12); \ + v[ a]+= c_xors[i++] + v[ b]; v[a1]+= c_xors[i++] + v[b1]; \ + v[ d] = ROR8(v[ d] ^ v[ a]); v[d1] = ROR8(v[d1] ^ v[a1]); \ + v[ c]+= v[ d]; v[c1]+= v[d1]; \ + v[ b] = ROTR32(v[ b] ^ v[ c], 7); v[b1] = ROTR32(v[b1] ^ v[c1], 7); \ +} + +#define pxorx0GS2(a,b,c,d, a1,b1,c1,d1) { \ + v[ a]+= (c_xors[i++]^nonce) + v[ b]; v[a1]+= c_xors[i++] + v[b1]; \ + v[ d] = ROL16(v[ d] ^ v[ a]); v[d1] = ROL16(v[d1] ^ v[a1]); \ + v[ c]+= v[ d]; v[c1]+= v[d1]; \ + v[ b] = ROTR32(v[ b] ^ v[ c], 12); v[b1] = ROTR32(v[b1] ^ v[c1], 12); \ + v[ a]+= c_xors[i++] + v[ b]; v[a1]+= c_xors[i++] + v[b1]; \ + v[ d] = ROR8(v[ d] ^ v[ a]); v[d1] = ROR8(v[d1] ^ v[a1]); \ + v[ c]+= v[ d]; v[c1]+= v[d1]; \ + v[ b] = ROTR32(v[ b] ^ v[ c], 7); v[b1] = ROTR32(v[b1] ^ v[c1], 7); \ } __global__ __launch_bounds__(TPB,1) -void decred_gpu_hash_nonce(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce) +void decred_gpu_hash_nonce(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint32_t highTarget) { - uint64_t m3 = startNonce + blockDim.x * blockIdx.x + threadIdx.x; - const uint32_t step = gridDim.x * blockDim.x; - const uint64_t maxNonce = startNonce + threads; + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; - const uint32_t z[16] = { - 0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344, - 0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89, - 0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C, - 0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917 - }; - - uint32_t v[16]; - uint32_t m[16]; - - #pragma unroll - for(int i=0;i<3;i++) { - m[i] = c_m[i]; - } - m[13] = 0x80000001; - m[15] = 0x000005a0; - - const uint32_t m130 = z[12] ^ m[13]; - const uint32_t m131 = m[13] ^ z[ 6]; - const uint32_t m132 = z[15] ^ m[13]; - const uint32_t m133 = z[ 3] ^ m[13]; - const uint32_t m134 = z[ 4] ^ m[13]; - const uint32_t m135 = z[14] ^ m[13]; - const uint32_t m136 = m[13] ^ z[11]; - const uint32_t m137 = m[13] ^ z[ 7]; - const uint32_t m138 = m[13] ^ z[ 0]; - - volatile uint32_t m150 = z[14] ^ m[15]; - volatile uint32_t m151 = z[ 9] ^ m[15]; - volatile uint32_t m152 = m[15] ^ z[13]; - volatile uint32_t m153 = m[15] ^ z[ 8]; - const uint32_t m154 = z[10] ^ m[15]; - const uint32_t m155 = z[ 1] ^ m[15]; - const uint32_t m156 = m[15] ^ z[ 4]; - const uint32_t m157 = z[ 6] ^ m[15]; - const uint32_t m158 = m[15] ^ z[11]; - - const uint32_t h7 = c_h[ 0]; - - for( ; m3 500 && !is_windows()) ? 29 : 25; if (device_sm[dev_id] < 350) intensity = 22; @@ -443,7 +364,7 @@ extern "C" int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); - const dim3 grid((throughput + (NPT*TPB)-1)/(NPT*TPB)); + const dim3 grid((throughput + TPB-1)/(TPB)); const dim3 block(TPB); if (opt_benchmark) { @@ -465,7 +386,7 @@ extern "C" int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce } memcpy(endiandata, pdata, 180); - decred_cpu_setBlock_52(thr_id, endiandata, &pdata[32]); + decred_cpu_setBlock_52(endiandata); h_resNonce[thr_id][0] = 1; do { @@ -473,7 +394,7 @@ extern "C" int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce cudaMemset(d_resNonce[thr_id], 0x00, sizeof(uint32_t)); // GPU HASH - decred_gpu_hash_nonce <<>> (throughput, (*pnonce), d_resNonce[thr_id]); + decred_gpu_hash_nonce <<>> (throughput, (*pnonce), d_resNonce[thr_id], targetHigh); cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); if (h_resNonce[thr_id][0]) @@ -482,26 +403,26 @@ extern "C" int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce for(uint32_t i=1; i <= h_resNonce[thr_id][0]; i++) { - uint32_t vhash64[8]; + uint32_t _ALIGN(64) vhash[8]; be32enc(&endiandata[DCR_NONCE_OFT32], h_resNonce[thr_id][i]); - decred_hash(vhash64, endiandata); - if (vhash64[6] <= ptarget[6] && fulltest(vhash64, ptarget)) + decred_hash(vhash, endiandata); + if (vhash[6] <= ptarget[6] && fulltest(vhash, ptarget)) { int rc = 1; - work_set_target_ratio(work, vhash64); + work_set_target_ratio(work, vhash); *hashes_done = (*pnonce) - first_nonce + throughput; work->nonces[0] = swab32(h_resNonce[thr_id][i]); // search for another nonce for(uint32_t j=i+1; j <= h_resNonce[thr_id][0]; j++) { be32enc(&endiandata[DCR_NONCE_OFT32], h_resNonce[thr_id][j]); - decred_hash(vhash64, endiandata); - if (vhash64[6] <= ptarget[6] && fulltest(vhash64, ptarget)){ + decred_hash(vhash, endiandata); + if (vhash[6] <= ptarget[6] && fulltest(vhash, ptarget)){ work->nonces[1] = swab32(h_resNonce[thr_id][j]); if(!opt_quiet) gpulog(LOG_NOTICE, thr_id, "second nonce found %u / %08x - %u / %08x", i, work->nonces[0], j, work->nonces[1]); - if(bn_hash_target_ratio(vhash64, ptarget) > work->shareratio) { - work_set_target_ratio(work, vhash64); + if(bn_hash_target_ratio(vhash, ptarget) > work->shareratio) { + work_set_target_ratio(work, vhash); xchg(work->nonces[1], work->nonces[0]); } rc = 2;