/**
 * __shfl_up requires the SM 3.0 arch!
 *
 * SM 2 alternative method by tpruvot@github 2015
 */

#include <stdio.h>
#include <memory.h>

#include "cuda_helper.h"

#ifdef __INTELLISENSE__
/* just for vstudio code colors */
#define __CUDA_ARCH__ 300
#define __shfl_up(var, delta, width) (0)
#endif

static uint32_t *h_numValid[MAX_GPUS];
static uint32_t *d_tempBranch1Nonces[MAX_GPUS];
static uint32_t *d_partSum[2][MAX_GPUS]; // for up to four partial sums

// True/False tester
typedef uint32_t(*cuda_compactTestFunction_t)(uint32_t *inpHash);

__device__ uint32_t QuarkTrueTest(uint32_t *inpHash)
{
	return ((inpHash[0] & 0x08) == 0x08);
}

__device__ uint32_t QuarkFalseTest(uint32_t *inpHash)
{
	return ((inpHash[0] & 0x08) == 0);
}

__device__ cuda_compactTestFunction_t d_QuarkTrueFunction = QuarkTrueTest, d_QuarkFalseFunction = QuarkFalseTest;
cuda_compactTestFunction_t h_QuarkTrueFunction[MAX_GPUS], h_QuarkFalseFunction[MAX_GPUS];

// Setup/Alloc Function
__host__
void quark_compactTest_cpu_init(int thr_id, uint32_t threads)
{
	int dev_id = device_map[thr_id];
	cuda_get_arch(thr_id);

	cudaMemcpyFromSymbol(&h_QuarkTrueFunction[thr_id], d_QuarkTrueFunction, sizeof(cuda_compactTestFunction_t));
	cudaMemcpyFromSymbol(&h_QuarkFalseFunction[thr_id], d_QuarkFalseFunction, sizeof(cuda_compactTestFunction_t));

	if (cuda_arch[dev_id] >= 300) {
		uint32_t s1 = (threads / 256) * 2;

		CUDA_SAFE_CALL(cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads * 2));
		CUDA_SAFE_CALL(cudaMalloc(&d_partSum[0][thr_id], sizeof(uint32_t) * s1)); // BLOCKSIZE (Threads/Block)
		CUDA_SAFE_CALL(cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1)); // BLOCKSIZE (Threads/Block)
	} else {
		CUDA_SAFE_CALL(cudaMalloc(&d_tempBranch1Nonces[thr_id], sizeof(uint32_t) * threads));
	}

	cudaMallocHost(&h_numValid[thr_id], 2*sizeof(uint32_t));
}

// Because all alloc should have a free...
__host__
void quark_compactTest_cpu_free(int thr_id)
{
	int dev_id = device_map[thr_id];

	cudaFreeHost(h_numValid[thr_id]);

	if (cuda_arch[dev_id] >= 300) {
		cudaFree(d_tempBranch1Nonces[thr_id]);
		cudaFree(d_partSum[0][thr_id]);
		cudaFree(d_partSum[1][thr_id]);
	} else {
		cudaFree(d_tempBranch1Nonces[thr_id]);
	}
}

__global__
void quark_compactTest_gpu_SCAN(uint32_t *data, const int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL,
	uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
{
#if __CUDA_ARCH__ >= 300
	__shared__ uint32_t sums[32];
	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
	//int lane_id = id % warpSize;
	int lane_id = id % width;
	// determine a warp_id within a block
	//int warp_id = threadIdx.x / warpSize;
	int warp_id = threadIdx.x / width;

	sums[lane_id] = 0;

	// Below is the basic structure of using a shfl instruction
	// for a scan.
	// Record "value" as a variable - we accumulate it along the way
	uint32_t value;
	if (testFunc != NULL) {
		if (id < threads) {
			uint32_t *inpHash;
			if (d_validNonceTable == NULL) {
				// no nonce list
				inpHash = &inpHashes[id<<4];
			} else {
				// nonce list available
				int nonce = d_validNonceTable[id] - startNounce;
				inpHash = &inpHashes[nonce<<4];
			}
			value = (*testFunc)(inpHash);
		} else {
			value = 0;
		}
	} else {
		value = data[id];
	}

	__syncthreads();

	// Now accumulate in log steps up the chain
	// compute sums, with another thread's value who is
	// distance delta away (i). Note
	// those threads where the thread 'i' away would have
	// been out of bounds of the warp are unaffected. This
	// creates the scan sum.
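	// Worked example (illustrative): with width = 8 and per-lane test results
	//   lane:   0 1 2 3 4 5 6 7
	//   value:  1 0 1 1 0 1 0 1
	// the loop below produces the inclusive prefix sums
	//   value:  1 1 2 3 3 4 4 5
	// i.e. each lane ends up holding the number of passing results in
	// lanes 0..lane_id.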
	#pragma unroll
	for (int i=1; i<=width; i*=2)
	{
		uint32_t n = __shfl_up((int)value, i, width);
		if (lane_id >= i) value += n;
	}

	// value now holds the scan value for the individual thread
	// next sum the largest values for each warp

	// write the sum of the warp to smem
	//if (threadIdx.x % warpSize == warpSize-1)
	if (threadIdx.x % width == width-1) {
		sums[warp_id] = value;
	}

	__syncthreads();

	//
	// scan sum the warp sums
	// the same shfl scan operation, but performed on warp sums
	//
	if (warp_id == 0) {
		uint32_t warp_sum = sums[lane_id];

		for (int i=1; i<=width; i*=2)
		{
			uint32_t n = __shfl_up((int)warp_sum, i, width);
			if (lane_id >= i) warp_sum += n;
		}

		sums[lane_id] = warp_sum;
	}

	__syncthreads();

	// perform a uniform add across warps in the block
	// read neighbouring warp's sum and add it to threads value
	uint32_t blockSum = 0;

	if (warp_id > 0) {
		blockSum = sums[warp_id-1];
	}

	value += blockSum;

	// Now write out our result
	data[id] = value;

	// last thread has sum, write out the block's sum
	if (partial_sums != NULL && threadIdx.x == blockDim.x-1) {
		partial_sums[blockIdx.x] = value;
	}
#endif // SM3+
}

// Uniform add: add partial sums array
__global__
void quark_compactTest_gpu_ADD(uint32_t *data, uint32_t *partial_sums, int len)
{
	__shared__ uint32_t buf;
	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);

	if (id > len) return;

	if (threadIdx.x == 0) {
		buf = partial_sums[blockIdx.x];
	}

	__syncthreads();
	data[id] += buf;
}

__global__
void quark_compactTest_gpu_SCATTER(uint32_t *sum, uint32_t *outp, cuda_compactTestFunction_t testFunc,
	uint32_t threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
{
	int id = ((blockIdx.x * blockDim.x) + threadIdx.x);
	uint32_t actNounce = id;
	uint32_t value;

	if (id < threads) {
		uint32_t *inpHash;
		if (d_validNonceTable == NULL) {
			// no nonce list
			inpHash = &inpHashes[id<<4];
		} else {
			// nonce list available
			int nonce = d_validNonceTable[id] - startNounce;
			actNounce = nonce;
			inpHash = &inpHashes[nonce<<4];
		}

		value = (*testFunc)(inpHash);
	} else {
		value = 0;
	}

	if (value) {
		int idx = sum[id];
		if (idx > 0)
			outp[idx-1] = startNounce + actNounce;
	}
}

__host__
static uint32_t quark_compactTest_roundUpExp(uint32_t val)
{
	if (val == 0)
		return 0;

	uint32_t mask = 0x80000000;
	while ((val & mask) == 0) mask = mask >> 1;

	if ((val & (~mask)) != 0)
		return mask << 1;

	return mask;
}
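// roundUpExp rounds a value up to the next power of two, e.g. 600 -> 1024
// (highest set bit 512, lower bits present, so 512 << 1), 65536 -> 65536.
//
// Compaction pipeline, as implemented below:
//   1. SCAN    - per-block inclusive prefix sums of the test results,
//                block totals written to d_partSum[0]
//   2. SCAN    - prefix sums over d_partSum[0] (and d_partSum[1] for
//                large thread counts)
//   3. ADD     - uniform add of the higher-level sums back onto the blocks
//   4. SCATTER - every passing nonce is written to outp[sum[id]-1]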
__host__
void quark_compactTest_cpu_singleCompaction(int thr_id, uint32_t threads, uint32_t *nrm, uint32_t *d_nonces1,
	cuda_compactTestFunction_t function, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
{
	int orgThreads = threads;
	threads = quark_compactTest_roundUpExp(threads);
	// compute threadsPerBlock
	int blockSize = 256;
	int nSummen = threads / blockSize;

	int thr1 = (threads+blockSize-1) / blockSize;
	int thr2 = threads / (blockSize*blockSize);
	int blockSize2 = (nSummen < blockSize) ? nSummen : blockSize;
	int thr3 = (nSummen + blockSize2-1) / blockSize2;

	bool callThrid = (thr2 > 0) ? true : false;

	// initial scan
	quark_compactTest_gpu_SCAN <<< thr1, blockSize >>> (
		d_tempBranch1Nonces[thr_id], 32, d_partSum[0][thr_id], function, orgThreads, startNounce, inpHashes, d_validNonceTable);

	// further scans
	if (callThrid) {
		quark_compactTest_gpu_SCAN <<< thr2, blockSize >>> (d_partSum[0][thr_id], 32, d_partSum[1][thr_id]);
		quark_compactTest_gpu_SCAN <<< 1, thr2 >>> (d_partSum[1][thr_id], (thr2>32) ? 32 : thr2);
	} else {
		quark_compactTest_gpu_SCAN <<< thr3, blockSize2 >>> (d_partSum[0][thr_id], (blockSize2>32) ? 32 : blockSize2);
	}

	// sync + save the count
	cudaStreamSynchronize(NULL);

	if (callThrid)
		cudaMemcpy(nrm, &(d_partSum[1][thr_id])[thr2-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);
	else
		cudaMemcpy(nrm, &(d_partSum[0][thr_id])[nSummen-1], sizeof(uint32_t), cudaMemcpyDeviceToHost);

	// uniform adds
	if (callThrid) {
		quark_compactTest_gpu_ADD <<< thr2-1, blockSize >>> (d_partSum[0][thr_id]+blockSize, d_partSum[1][thr_id], blockSize*thr2);
	}

	quark_compactTest_gpu_ADD <<< thr1-1, blockSize >>> (d_tempBranch1Nonces[thr_id]+blockSize, d_partSum[0][thr_id], threads);

	// scatter
	quark_compactTest_gpu_SCATTER <<< thr1, blockSize >>> (d_tempBranch1Nonces[thr_id], d_nonces1, function,
		orgThreads, startNounce, inpHashes, d_validNonceTable);

	// sync
	cudaStreamSynchronize(NULL);
}

#if __CUDA_ARCH__ < 300
__global__ __launch_bounds__(128, 8)
void quark_filter_gpu_sm2(const uint32_t threads, const uint32_t* d_hash, uint32_t* d_branch2, uint32_t* d_NonceBranch)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		const uint32_t offset = thread * 16U; // 64U / sizeof(uint32_t);
		uint4 *psrc = (uint4*) (&d_hash[offset]);
		d_NonceBranch[thread] = ((uint8_t*)psrc)[0] & 0x8;
		if (d_NonceBranch[thread]) return;
		// uint4 = 4x uint32_t = 16 bytes
		uint4 *pdst = (uint4*) (&d_branch2[offset]);
		pdst[0] = psrc[0];
		pdst[1] = psrc[1];
		pdst[2] = psrc[2];
		pdst[3] = psrc[3];
	}
}

__global__ __launch_bounds__(128, 8)
void quark_merge_gpu_sm2(const uint32_t threads, uint32_t* d_hash, uint32_t* d_branch2, uint32_t* const d_NonceBranch)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads && !d_NonceBranch[thread])
	{
		const uint32_t offset = thread * 16U;
		uint4 *pdst = (uint4*) (&d_hash[offset]);
		uint4 *psrc = (uint4*) (&d_branch2[offset]);
		pdst[0] = psrc[0];
		pdst[1] = psrc[1];
		pdst[2] = psrc[2];
		pdst[3] = psrc[3];
	}
}
#else
__global__ void quark_filter_gpu_sm2(const uint32_t threads, const uint32_t* d_hash, uint32_t* d_branch2, uint32_t* d_NonceBranch) {}
__global__ void quark_merge_gpu_sm2(const uint32_t threads, uint32_t* d_hash, uint32_t* d_branch2, uint32_t* const d_NonceBranch) {}
#endif

__host__
uint32_t quark_filter_cpu_sm2(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_branch2)
{
	const uint32_t threadsperblock = 128;
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);
	// extract algo permutation hashes to a second branch buffer
	quark_filter_gpu_sm2 <<< grid, block >>> (threads, inpHashes, d_branch2, d_tempBranch1Nonces[thr_id]);
	return threads;
}

__host__
void quark_merge_cpu_sm2(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_branch2)
{
	const uint32_t threadsperblock = 128;
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);
	// put back second branch hashes to the common buffer d_hash
	quark_merge_gpu_sm2 <<< grid, block >>> (threads, outpHashes, d_branch2, d_tempBranch1Nonces[thr_id]);
}

////// ATTENTION: this function currently only works with threads > 65536 (best with 256 * 1024 or 256 * 2048)
__host__
void quark_compactTest_cpu_dualCompaction(int thr_id, uint32_t threads, uint32_t *nrm, uint32_t *d_nonces1, uint32_t *d_nonces2,
	uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable)
{
	quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[0], d_nonces1, h_QuarkTrueFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
	quark_compactTest_cpu_singleCompaction(thr_id, threads, &nrm[1], d_nonces2, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);
}
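// Dual compaction output: d_nonces1 receives the nonces whose hash passed
// QuarkTrueTest (bit 0x08 of the first hash word set), d_nonces2 those that
// failed it; nrm[0] and nrm[1] hold the respective counts.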
__host__
void quark_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
	uint32_t *d_nonces1, uint32_t *nrm1, uint32_t *d_nonces2, uint32_t *nrm2, int order)
{
	// If validNonceTable is used, only the nonces it contains are considered.
	// In that case "threads" has to be set to the length of that array!

	quark_compactTest_cpu_dualCompaction(thr_id, threads, h_numValid[thr_id],
		d_nonces1, d_nonces2, startNounce, inpHashes, d_validNonceTable);

	cudaStreamSynchronize(NULL); // the original costs some CPU load, but is possibly better at this point
	*nrm1 = h_numValid[thr_id][0];
	*nrm2 = h_numValid[thr_id][1];
}

__host__
void quark_compactTest_single_false_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
	uint32_t *d_nonces1, uint32_t *nrm1, int order)
{
	// If validNonceTable is used, only the nonces it contains are considered.
	// In that case "threads" has to be set to the length of that array!

	quark_compactTest_cpu_singleCompaction(thr_id, threads, h_numValid[thr_id],
		d_nonces1, h_QuarkFalseFunction[thr_id], startNounce, inpHashes, d_validNonceTable);

	cudaStreamSynchronize(NULL);
	*nrm1 = h_numValid[thr_id][0];
}
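/* Typical call sequence from a quark-style algorithm (illustrative sketch only;
 * the buffer names d_hash and d_branchNonces and the count variables are
 * assumptions, not part of this file):
 *
 *   uint32_t nrm1 = 0, nrm2 = 0;
 *   quark_compactTest_cpu_hash_64(thr_id, throughput, startNounce, d_hash, NULL,
 *                                 d_branchNonces[0], &nrm1, d_branchNonces[1], &nrm2, order);
 *   // then run one hash kernel over the nrm1 nonces listed in d_branchNonces[0]
 *   // and the alternative kernel over the nrm2 nonces in d_branchNonces[1],
 *   // writing both results back into d_hash.
 */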