diff --git a/Algo256/decred.cu b/Algo256/decred.cu index c71b2aa..7c072b2 100644 --- a/Algo256/decred.cu +++ b/Algo256/decred.cu @@ -2,7 +2,10 @@ * Blake-256 Decred 180-Bytes input Cuda Kernel (Tested on SM 5/5.2) * * Tanguy Pruvot - Feb 2016 - * Alexis Provos - Mar 2016 + * + * Merged 8-round blake (XVC) tweaks + * Further improved by: ~2.72% + * Alexis Provos - Jun 2016 */ #include @@ -14,7 +17,11 @@ extern "C" { } /* threads per block */ -#define TPB 640 +#define TPB 768 +#define NPT 192 +#define maxResults 8 +/* max count of found nonces in one call */ +#define NBN 2 /* hash by cpu with blake 256 */ extern "C" void decred_hash(void *output, const void *input) @@ -32,179 +39,312 @@ extern "C" void decred_hash(void *output, const void *input) #ifdef __INTELLISENSE__ #define __byte_perm(x, y, b) x +#define atomicInc(p, max) (*p) #endif -__constant__ uint32_t _ALIGN(16) c_data[32]; -__constant__ uint32_t _ALIGN(16) c_h[ 2]; -__constant__ uint32_t _ALIGN(16) c_xors[215]; +__constant__ uint32_t c_m[3]; +__constant__ uint32_t _ALIGN(8) c_h[2]; +__constant__ uint32_t _ALIGN(32) c_v[16]; +__constant__ uint32_t _ALIGN(32) c_x[90]; -/* 16 adapters max */ +/* Buffers of candidate nonce(s) */ static uint32_t *d_resNonce[MAX_GPUS]; static uint32_t *h_resNonce[MAX_GPUS]; -/* macro bodies */ -#define pxorGS(a,b,c,d) { \ - v[a]+= c_xors[i++] + v[b]; \ - v[d] = __byte_perm(v[d] ^ v[a], 0, 0x1032); \ +__device__ __forceinline__ +uint32_t ROR8(const uint32_t a) { + return __byte_perm(a, 0, 0x0321); +} + +__device__ __forceinline__ +uint32_t ROL16(const uint32_t a) { + return __byte_perm(a, 0, 0x1032); +} + +__device__ __forceinline__ +uint32_t xor3x(uint32_t a, uint32_t b, uint32_t c) { + uint32_t result; +#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r"(result) : "r"(a), "r"(b),"r"(c)); //0x96 = 0xF0 ^ 0xCC ^ 0xAA +#else + result = a^b^c; +#endif + return result; +} + +#define GSn(a,b,c,d,x,y) { \ + v[a]+= x + v[b]; \ + v[d] = ROL16(v[d] ^ v[a]); \ v[c]+= v[d]; \ v[b] = ROTR32(v[b] ^ v[c], 12); \ - v[a]+= c_xors[i++] + v[b]; \ - v[d] = __byte_perm(v[d] ^ v[a], 0, 0x0321); \ + v[a]+= y + v[b]; \ + v[d] = ROR8(v[d] ^ v[a]); \ v[c]+= v[d]; \ v[b] = ROTR32(v[b] ^ v[c], 7); \ } -#define pxorGS2(a,b,c,d,a1,b1,c1,d1) {\ - v[ a]+= c_xors[i++] + v[ b]; v[a1]+= c_xors[i++] + v[b1]; \ - v[ d] = __byte_perm(v[ d] ^ v[ a], 0, 0x1032); v[d1] = __byte_perm(v[d1] ^ v[a1], 0, 0x1032); \ - v[ c]+= v[ d]; v[c1]+= v[d1]; \ - v[ b] = ROTR32(v[ b] ^ v[ c], 12); v[b1] = ROTR32(v[b1] ^ v[c1], 12); \ - v[ a]+= c_xors[i++] + v[ b]; v[a1]+= c_xors[i++] + v[b1]; \ - v[ d] = __byte_perm(v[ d] ^ v[ a], 0, 0x0321); v[d1] = __byte_perm(v[d1] ^ v[a1], 0, 0x0321); \ - v[ c]+= v[ d]; v[c1]+= v[d1]; \ - v[ b] = ROTR32(v[ b] ^ v[ c], 7); v[b1] = ROTR32(v[b1] ^ v[c1], 7); \ +#define GSn3(a,b,c,d,x,y, a1,b1,c1,d1,x1,y1, a2,b2,c2,d2,x2,y2) { \ + v[ a]+= x + v[ b]; v[a1]+= x1 + v[b1]; v[a2]+= x2 + v[b2];\ + v[ d] = ROL16(v[ d] ^ v[ a]); v[d1] = ROL16(v[d1] ^ v[a1]); v[d2] = ROL16(v[d2] ^ v[a2]);\ + v[ c]+= v[ d]; v[c1]+= v[d1]; v[c2]+= v[d2];\ + v[ b] = ROTR32(v[ b] ^ v[ c], 12); v[b1] = ROTR32(v[b1] ^ v[c1], 12); v[b2] = ROTR32(v[b2] ^ v[c2], 12);\ + v[ a]+= y + v[ b]; v[a1]+= y1 + v[b1]; v[a2]+= y2 + v[b2];\ + v[ d] = ROR8(v[ d] ^ v[ a]); v[d1] = ROR8(v[d1] ^ v[a1]); v[d2] = ROR8(v[d2] ^ v[a2]);\ + v[ c]+= v[ d]; v[c1]+= v[d1]; v[c2]+= v[d2];\ + v[ b] = ROTR32(v[ b] ^ v[ c], 7); v[b1] = ROTR32(v[b1] ^ v[c1], 7); v[b2] = ROTR32(v[b2] ^ v[c2], 7);\ } -#define pxory1GS2(a,b,c,d,a1,b1,c1,d1) { \ - v[ a]+= c_xors[i++] + v[ b]; v[a1]+= c_xors[i++] + v[b1]; \ - v[ d] = __byte_perm(v[ d] ^ v[ a], 0, 0x1032); v[d1] = __byte_perm(v[d1] ^ v[a1], 0, 0x1032); \ - v[ c]+= v[ d]; v[c1]+= v[d1]; \ - v[ b] = ROTR32(v[ b] ^ v[ c], 12); v[b1] = ROTR32(v[b1] ^ v[c1], 12); \ - v[ a]+= c_xors[i++] + v[ b]; v[a1]+= (c_xors[i++]^nonce) + v[b1]; \ - v[ d] = __byte_perm(v[ d] ^ v[ a], 0, 0x0321); v[d1] = __byte_perm(v[d1] ^ v[a1], 0, 0x0321); \ - v[ c]+= v[ d]; v[c1]+= v[d1]; \ - v[ b] = ROTR32(v[ b] ^ v[ c], 7); v[b1] = ROTR32(v[b1] ^ v[c1], 7); \ -} -#define pxory0GS2(a,b,c,d,a1,b1,c1,d1) { \ - v[ a]+= c_xors[i++] + v[ b]; v[a1]+= c_xors[i++] + v[b1]; \ - v[ d] = __byte_perm(v[ d] ^ v[ a], 0, 0x1032); v[d1] = __byte_perm(v[d1] ^ v[a1], 0, 0x1032); \ - v[ c]+= v[ d]; v[c1]+= v[d1]; \ - v[ b] = ROTR32(v[ b] ^ v[ c], 12); v[b1] = ROTR32(v[b1] ^ v[c1], 12); \ - v[ a]+= (c_xors[i++]^nonce) + v[ b]; v[a1]+= c_xors[i++] + v[b1]; \ - v[ d] = __byte_perm(v[ d] ^ v[ a], 0, 0x0321); v[d1] = __byte_perm(v[d1] ^ v[a1], 0, 0x0321); \ - v[ c]+= v[ d]; v[c1]+= v[d1]; \ - v[ b] = ROTR32(v[ b] ^ v[ c], 7); v[b1] = ROTR32(v[b1] ^ v[c1], 7); \ -} -#define pxorx1GS2(a,b,c,d,a1,b1,c1,d1) { \ - v[ a]+= c_xors[i++] + v[ b]; v[a1]+= (c_xors[i++]^nonce) + v[b1]; \ - v[ d] = __byte_perm(v[ d] ^ v[ a], 0, 0x1032); v[d1] = __byte_perm(v[d1] ^ v[a1], 0, 0x1032); \ - v[ c]+= v[ d]; v[c1]+= v[d1]; \ - v[ b] = ROTR32(v[ b] ^ v[ c], 12); v[b1] = ROTR32(v[b1] ^ v[c1], 12); \ - v[ a]+= c_xors[i++] + v[ b]; v[a1]+= c_xors[i++] + v[b1]; \ - v[ d] = __byte_perm(v[ d] ^ v[ a], 0, 0x0321); v[d1] = __byte_perm(v[d1] ^ v[a1], 0, 0x0321); \ - v[ c]+= v[ d]; v[c1]+= v[d1]; \ - v[ b] = ROTR32(v[ b] ^ v[ c], 7); v[b1] = ROTR32(v[b1] ^ v[c1], 7); \ -} -#define pxorx0GS2(a,b,c,d,a1,b1,c1,d1) { \ - v[ a]+= (c_xors[i++]^nonce) + v[ b]; v[a1]+= c_xors[i++] + v[b1]; \ - v[ d] = __byte_perm(v[ d] ^ v[ a], 0, 0x1032); v[d1] = __byte_perm(v[d1] ^ v[a1], 0, 0x1032); \ - v[ c]+= v[ d]; v[c1]+= v[d1]; \ - v[ b] = ROTR32(v[ b] ^ v[ c], 12); v[b1] = ROTR32(v[b1] ^ v[c1], 12); \ - v[ a]+= c_xors[i++] + v[ b]; v[a1]+= c_xors[i++] + v[b1]; \ - v[ d] = __byte_perm(v[ d] ^ v[ a], 0, 0x0321); v[d1] = __byte_perm(v[d1] ^ v[a1], 0, 0x0321); \ - v[ c]+= v[ d]; v[c1]+= v[d1]; \ - v[ b] = ROTR32(v[ b] ^ v[ c], 7); v[b1] = ROTR32(v[b1] ^ v[c1], 7); \ +#define GSn4(a,b,c,d,x,y, a1,b1,c1,d1,x1,y1, a2,b2,c2,d2,x2,y2, a3,b3,c3,d3,x3,y3) { \ + v[ a]+= x + v[ b]; v[a1]+= x1 + v[b1]; v[a2]+= x2 + v[b2]; v[a3]+= x3 + v[b3]; \ + v[ d] = ROL16(v[ d] ^ v[ a]); v[d1] = ROL16(v[d1] ^ v[a1]); v[d2] = ROL16(v[d2] ^ v[a2]); v[d3] = ROL16(v[d3] ^ v[a3]); \ + v[ c]+= v[ d]; v[c1]+= v[d1]; v[c2]+= v[d2]; v[c3]+= v[d3]; \ + v[ b] = ROTR32(v[ b] ^ v[ c], 12); v[b1] = ROTR32(v[b1] ^ v[c1], 12); v[b2] = ROTR32(v[b2] ^ v[c2], 12); v[b3] = ROTR32(v[b3] ^ v[c3], 12); \ + v[ a]+= y + v[ b]; v[a1]+= y1 + v[b1]; v[a2]+= y2 + v[b2]; v[a3]+= y3 + v[b3]; \ + v[ d] = ROR8(v[ d] ^ v[ a]); v[d1] = ROR8(v[d1] ^ v[a1]); v[d2] = ROR8(v[d2] ^ v[a2]); v[d3] = ROR8(v[d3] ^ v[a3]); \ + v[ c]+= v[ d]; v[c1]+= v[d1]; v[c2]+= v[d2]; v[c3]+= v[d3]; \ + v[ b] = ROTR32(v[ b] ^ v[ c], 7); v[b1] = ROTR32(v[b1] ^ v[c1], 7); v[b2] = ROTR32(v[b2] ^ v[c2], 7); v[b3] = ROTR32(v[b3] ^ v[c3], 7); \ } __global__ __launch_bounds__(TPB,1) -void decred_gpu_hash_nonce(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint32_t highTarget) +void decred_gpu_hash_nonce(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce) { - const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; - - if (thread < threads) - { - uint32_t v[16]; - #pragma unroll - for(int i=0;i<16;i+=4){ - *(uint4*)&v[i] = *(uint4*)&c_data[ i]; + uint64_t m3 = startNonce + blockDim.x * blockIdx.x + threadIdx.x; + const uint32_t step = gridDim.x * blockDim.x; + const uint64_t maxNonce = startNonce + threads; + + const uint32_t z[16] = { + 0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344, + 0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89, + 0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C, + 0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917 + }; + + uint32_t v[16]; + uint32_t m[16]; + + #pragma unroll + for(int i=0;i<3;i++) { + m[i] = c_m[i]; + } + m[13] = 0x80000001; + m[15] = 0x000005a0; + + const uint32_t m130 = z[12] ^ m[13]; + const uint32_t m131 = m[13] ^ z[ 6]; + const uint32_t m132 = z[15] ^ m[13]; + const uint32_t m133 = z[ 3] ^ m[13]; + const uint32_t m134 = z[ 4] ^ m[13]; + const uint32_t m135 = z[14] ^ m[13]; + const uint32_t m136 = m[13] ^ z[11]; + const uint32_t m137 = m[13] ^ z[ 7]; + const uint32_t m138 = m[13] ^ z[ 0]; + + volatile uint32_t m150 = z[14] ^ m[15]; + volatile uint32_t m151 = z[ 9] ^ m[15]; + volatile uint32_t m152 = m[15] ^ z[13]; + volatile uint32_t m153 = m[15] ^ z[ 8]; + const uint32_t m154 = z[10] ^ m[15]; + const uint32_t m155 = z[ 1] ^ m[15]; + const uint32_t m156 = m[15] ^ z[ 4]; + const uint32_t m157 = z[ 6] ^ m[15]; + const uint32_t m158 = m[15] ^ z[11]; + + const uint32_t h7 = c_h[ 0]; + + for( ; m3 500 && !is_windows()) ? 29 : 25; if (device_sm[dev_id] < 350) intensity = 22; @@ -357,15 +443,13 @@ extern "C" int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); - const dim3 grid((throughput + TPB-1)/(TPB)); + const dim3 grid((throughput + (NPT*TPB)-1)/(NPT*TPB)); const dim3 block(TPB); - int rc = 0; - if (opt_benchmark) { ptarget[6] = swab32(0xff); } - if (!init[thr_id]) { + if (!init[thr_id]){ cudaSetDevice(dev_id); if (opt_cudaschedule == -1 && gpu_threads == 1) { cudaDeviceReset(); @@ -375,36 +459,67 @@ extern "C" int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce CUDA_LOG_ERROR(); } - CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], sizeof(uint32_t)), -1); - CUDA_CALL_OR_RET_X(cudaMallocHost(&h_resNonce[thr_id], sizeof(uint32_t)), -1); + CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], maxResults*sizeof(uint32_t)), -1); + CUDA_CALL_OR_RET_X(cudaMallocHost(&h_resNonce[thr_id], maxResults*sizeof(uint32_t)), -1); init[thr_id] = true; } - cudaMemset(d_resNonce[thr_id], 0xff, sizeof(uint32_t)); - memcpy(endiandata, pdata, 180); - decred_cpu_setBlock_52(endiandata); + decred_cpu_setBlock_52(thr_id, endiandata, &pdata[32]); + h_resNonce[thr_id][0] = 1; do { + if (h_resNonce[thr_id][0]) + cudaMemset(d_resNonce[thr_id], 0x00, sizeof(uint32_t)); + // GPU HASH - decred_gpu_hash_nonce <<>> (throughput, (*pnonce), d_resNonce[thr_id], targetHigh); - cudaThreadSynchronize(); + decred_gpu_hash_nonce <<>> (throughput, (*pnonce), d_resNonce[thr_id]); cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); - if (h_resNonce[thr_id][0] != UINT32_MAX) { - rc = 1; -// work_set_target_ratio(work, vhashcpu); - *hashes_done = (*pnonce) - first_nonce + throughput; - work->nonces[0] = swab32(h_resNonce[thr_id][0]); - *pnonce = work->nonces[0]; - return 1; + + if (h_resNonce[thr_id][0]) + { + cudaMemcpy(h_resNonce[thr_id], d_resNonce[thr_id], (h_resNonce[thr_id][0]+1)*sizeof(uint32_t), cudaMemcpyDeviceToHost); + + for(uint32_t i=1; i <= h_resNonce[thr_id][0]; i++) + { + uint32_t vhash64[8]; + be32enc(&endiandata[DCR_NONCE_OFT32], h_resNonce[thr_id][i]); + decred_hash(vhash64, endiandata); + if (vhash64[6] <= ptarget[6] && fulltest(vhash64, ptarget)) + { + int rc = 1; + work_set_target_ratio(work, vhash64); + *hashes_done = (*pnonce) - first_nonce + throughput; + work->nonces[0] = swab32(h_resNonce[thr_id][i]); + // search for another nonce + for(uint32_t j=i+1; j <= h_resNonce[thr_id][0]; j++) + { + be32enc(&endiandata[DCR_NONCE_OFT32], h_resNonce[thr_id][j]); + decred_hash(vhash64, endiandata); + if (vhash64[6] <= ptarget[6] && fulltest(vhash64, ptarget)){ + work->nonces[1] = swab32(h_resNonce[thr_id][j]); + if(!opt_quiet) + gpulog(LOG_NOTICE, thr_id, "second nonce found %u / %08x - %u / %08x", i, work->nonces[0], j, work->nonces[1]); + if(bn_hash_target_ratio(vhash64, ptarget) > work->shareratio) { + work_set_target_ratio(work, vhash64); + xchg(work->nonces[1], work->nonces[0]); + } + rc = 2; + break; + } + } + *pnonce = work->nonces[0]; + return rc; + } + } } *pnonce += throughput; - } while (!work_restart[thr_id].restart && (uint64_t)max_nonce > (uint64_t)throughput + (uint64_t)(*pnonce)); + } while (!work_restart[thr_id].restart && max_nonce > (uint64_t)throughput + (*pnonce)); *hashes_done = (*pnonce) - first_nonce; MyStreamSynchronize(NULL, 0, device_map[thr_id]); - return rc; + return 0; } // cleanup