/**
 * Optimized Blake-256 8-rounds Cuda Kernel (Tested on SM >3.0)
 * Based upon Blake-256 implementation of Tanguy Pruvot - Nov. 2014
 *
 * midstate computation inherited from
 * https://github.com/wfr/clblake
 *
 * Provos Alexis - Jan. 2016
 * Reviewed by tpruvot - Feb 2016
 */

#include <stdint.h>
#include <memory.h>
#include <emmintrin.h>

#include "miner.h"

extern "C" {
#include "sph/sph_blake.h"
}

#include "cuda_helper.h"

#ifdef __INTELLISENSE__
#define __byte_perm(x, y, b) x
#endif

/* threads per block and "magic" */
#define TPB 768
#define NPT 224
#define NBN 2

__constant__ uint32_t d_data[16];

/* 8 adapters max */
static uint32_t *d_resNonce[MAX_GPUS];
static uint32_t *h_resNonce[MAX_GPUS];

/* hash by cpu with blake 256 */
extern "C" void vanillahash(void *output, const void *input, int8_t blakerounds)
{
	uchar hash[64];
	sph_blake256_context ctx;

	sph_blake256_set_rounds(blakerounds);

	sph_blake256_init(&ctx);
	sph_blake256(&ctx, input, 80);
	sph_blake256_close(&ctx, hash);

	memcpy(output, hash, 32);
}

__global__ __launch_bounds__(TPB,1)
void vanilla_gpu_hash_16_8(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint32_t highTarget)
{
	uint32_t v[16];
	uint32_t tmp[13];

	const uint32_t thread   = blockDim.x * blockIdx.x + threadIdx.x;
	const uint32_t step     = gridDim.x * blockDim.x;
	const uint32_t maxNonce = startNonce + threads;

	const uint32_t c_u256[16] = {
		0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344,
		0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89,
		0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C,
		0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917
	};

	const uint32_t h0 = d_data[0];
	const uint32_t h1 = d_data[1];
	const uint32_t h2 = d_data[2];
	const uint32_t h3 = d_data[3];
	const uint32_t h4 = d_data[4];
	//const uint32_t h5 = d_data[5]; no need
	const uint32_t h6 = d_data[5];
	const uint32_t h7 = d_data[6];

	const uint32_t m0 = d_data[7];
	const uint32_t m1 = d_data[8];
	const uint32_t m2 = d_data[9];
	// m3 is the nonce
	const uint32_t m4 = 0x80000000UL;
	const uint32_t m5 = 0;
	const uint32_t m6 = 0;
	const uint32_t m7 = 0;
	const uint32_t m8 = 0;
	const uint32_t m9 = 0;
	const uint32_t m10 = 0;
	const uint32_t m11 = 0;
	const uint32_t m12 = 0;
	const uint32_t m13 = 1;
	const uint32_t m14 = 0;
	const uint32_t m15 = 640;

	//---MORE PRECOMPUTATIONS
	tmp[ 0] = d_data[10];
	tmp[ 1] = d_data[11];
	tmp[ 2] = d_data[12];
	tmp[ 3] = c_u256[1] + tmp[2];
	tmp[ 4] = d_data[13];
	tmp[ 5] = d_data[14];
	tmp[ 6] = c_u256[2] + tmp[5];
	tmp[ 7] = d_data[15];

	tmp[ 5] = __byte_perm(tmp[5] ^ h2, 0, 0x0321);
	tmp[ 6] += tmp[5];
	tmp[ 7] = ROTR32(tmp[7] ^ tmp[6], 7);

	tmp[ 8] = __byte_perm(c_u256[7] ^ h3, 0, 0x1032);
	tmp[ 9] = c_u256[3] + tmp[8];
	tmp[10] = ROTR32(h7 ^ tmp[9], 12);
	tmp[11] = h3 + c_u256[6] + tmp[10];
	tmp[ 8] = __byte_perm(tmp[8] ^ tmp[11], 0, 0x0321);
	tmp[ 9] += tmp[8];
	tmp[10] = ROTR32(tmp[10] ^ tmp[9], 7);
	//---END OF MORE PRECOMPUTATIONS

	for (uint64_t m3 = startNonce + thread; m3 < maxNonce; m3 += step) {
		// Remaining per-nonce work (not shown here): the 8-round BLAKE-256
		// compression over v[16], seeded from h0..h7, c_u256[] and the
		// precomputed tmp[] terms, followed by the highTarget check and the
		// store of qualifying nonces into resNonce[0..NBN-1].
	}
}

/*
 * Host-side midstate setup (body not shown here): hashes the first 64 header
 * bytes with sph_blake256 and uploads the chaining values, the last 16 header
 * bytes and the precomputed terms into d_data[]. The prototype below is
 * reconstructed from the call site in scanhash_vanilla.
 */
__host__ void vanilla_cpu_setBlock_16(uint32_t *endiandata, uint32_t *penddata);

static bool init[MAX_GPUS] = { 0 };
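/*
 * Illustrative sketch (not part of the original miner): how the launch
 * geometry used by scanhash_vanilla below relates to TPB and NPT. Each kernel
 * thread starts at startNonce + thread and advances by gridDim.x * blockDim.x,
 * so sizing the grid as ceil(throughput / (NPT * TPB)) gives every thread
 * roughly NPT nonces to try. The helper name below is hypothetical and exists
 * only for illustration.
 */
static uint32_t vanilla_grid_blocks_example(uint32_t throughput)
{
	// e.g. throughput = 1 << 24: (16777216 + 172031) / 172032 = 98 blocks,
	// 98 * 768 = 75264 threads, i.e. about 223 nonces per thread.
	return (throughput + (NPT * TPB) - 1) / (NPT * TPB);
}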
extern "C" int scanhash_vanilla(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int8_t blakerounds)
{
	uint32_t *pdata = work->data;
	uint32_t *ptarget = work->target;
	const uint32_t first_nonce = pdata[19];
	const uint32_t targetHigh = ptarget[6];
	int dev_id = device_map[thr_id];

	int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 30 : 24;
	if (device_sm[dev_id] < 350) intensity = 22;

	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);

	int rc = 0;

	if (!init[thr_id]) {
		cudaSetDevice(dev_id);
		if (opt_cudaschedule == -1 && gpu_threads == 1) {
			cudaDeviceReset();
			// reduce cpu usage (linux)
			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
			cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
			CUDA_LOG_ERROR();
		}
		CUDA_CALL_OR_RET_X(cudaHostAlloc((void**)&h_resNonce[thr_id], NBN*sizeof(uint32_t), cudaHostAllocMapped), 0);
		CUDA_CALL_OR_RET_X(cudaHostGetDevicePointer((void**)&d_resNonce[thr_id], (void*)h_resNonce[thr_id], 0), 0);
		init[thr_id] = true;
	}

	uint32_t endiandata[20];
	for (int k = 0; k < 16; k++)
		be32enc(&endiandata[k], pdata[k]);

	vanilla_cpu_setBlock_16(endiandata, &pdata[16]);
	cudaMemset(d_resNonce[thr_id], 0xff, sizeof(uint32_t));

	const dim3 grid((throughput + (NPT*TPB) - 1) / (NPT*TPB));
	const dim3 block(TPB);

	do {
		vanilla_gpu_hash_16_8 <<<grid, block>>> (throughput, pdata[19], d_resNonce[thr_id], targetHigh);
		cudaThreadSynchronize();

		if (h_resNonce[thr_id][0] != UINT32_MAX) {
			uint32_t vhashcpu[8];
			uint32_t Htarg = (uint32_t)targetHigh;

			for (int k = 0; k < 19; k++)
				be32enc(&endiandata[k], pdata[k]);

			be32enc(&endiandata[19], h_resNonce[thr_id][0]);
			vanillahash(vhashcpu, endiandata, blakerounds);

			if (vhashcpu[6] <= Htarg && fulltest(vhashcpu, ptarget)) {
				rc = 1;
				work_set_target_ratio(work, vhashcpu);
				*hashes_done = pdata[19] - first_nonce + throughput;
				work->nonces[0] = h_resNonce[thr_id][0];
#if NBN > 1
				if (h_resNonce[thr_id][1] != UINT32_MAX) {
					work->nonces[1] = h_resNonce[thr_id][1];
					be32enc(&endiandata[19], work->nonces[1]);
					vanillahash(vhashcpu, endiandata, blakerounds);
					if (bn_hash_target_ratio(vhashcpu, ptarget) > work->shareratio) {
						work_set_target_ratio(work, vhashcpu);
						xchg(work->nonces[1], work->nonces[0]);
					}
					rc = 2;
				}
#endif
				pdata[19] = work->nonces[0];
				return rc;
			} else {
				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", h_resNonce[thr_id][0]);
			}
		}
		pdata[19] += throughput;
	} while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput)));

	*hashes_done = pdata[19] - first_nonce;
	MyStreamSynchronize(NULL, 0, dev_id);
	return rc;
}

// cleanup
extern "C" void free_vanilla(int thr_id)
{
	if (!init[thr_id])
		return;

	cudaThreadSynchronize();
	cudaFreeHost(h_resNonce[thr_id]);
	cudaFree(d_resNonce[thr_id]);
	init[thr_id] = false;

	cudaDeviceSynchronize();
}
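/*
 * Illustrative sketch (not part of the original miner): CPU-side verification
 * of a candidate nonce with vanillahash(), mirroring the validation performed
 * inside scanhash_vanilla above. The function name and the fixed 8-round count
 * are assumptions made for this example only; the miner passes blakerounds in
 * from its caller.
 */
static bool vanilla_check_nonce_example(const uint32_t *pdata, uint32_t nonce, const uint32_t *ptarget)
{
	uint32_t endiandata[20];
	uint32_t vhash[8];

	// big-endian encode the 19 header words, then append the candidate nonce
	for (int k = 0; k < 19; k++)
		be32enc(&endiandata[k], pdata[k]);
	be32enc(&endiandata[19], nonce);

	vanillahash(vhash, endiandata, 8);

	// same acceptance test as scanhash_vanilla: word 6 against the high
	// target word, then the full 256-bit comparison
	return vhash[6] <= ptarget[6] && fulltest(vhash, ptarget);
}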