From 11dbbcc12d29639abb3c32bb6c3cab428d6d2f3e Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 9 Nov 2014 10:55:35 +0100 Subject: [PATCH] checkhash: some work on a faster variant (wip) This should not be used for all algos... not enabled yet todo: multiple nounces or blake32 style checkup --- cuda_checkhash.cu | 62 +++++++++++++++++++++++++++++++++++++---------- cuda_helper.h | 1 + x11/x11.cu | 1 + 3 files changed, 51 insertions(+), 13 deletions(-) diff --git a/cuda_checkhash.cu b/cuda_checkhash.cu index a7806aa..e129de8 100644 --- a/cuda_checkhash.cu +++ b/cuda_checkhash.cu @@ -18,9 +18,8 @@ void cuda_check_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonce // bestimme den aktuellen Zähler uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; - uint32_t *inpHash = &g_hash[hashPosition<<4]; - + uint32_t hashPosition = (nounce - startNounce) << 4; + uint32_t *inpHash = &g_hash[hashPosition]; uint32_t hash[8]; #pragma unroll 8 @@ -31,12 +30,11 @@ void cuda_check_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonce if (hash[i] > pTarget[i]) { return; } - if (hash[i] < pTarget[i]) { + if (hash[i] <= pTarget[i]) { break; } } - - if(resNounce[0] > nounce) + if (resNounce[0] > nounce) resNounce[0] = nounce; } } @@ -53,8 +51,7 @@ void cuda_check_cpu_init(int thr_id, int threads) __host__ void cuda_check_cpu_setTarget(const void *ptarget) { - // die Message zur Berechnung auf der GPU - cudaMemcpyToSymbol(pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice)); } __host__ @@ -65,14 +62,10 @@ uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, u const int threadsperblock = 256; - // berechne wie viele Thread Blocks wir brauchen dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - // Größe des dynamischen Shared Memory Bereichs - size_t shared_size = 0; - - cuda_check_gpu_hash_64 <<>>(threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]); + cuda_check_gpu_hash_64 <<>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]); // Strategisches Sleep Kommando zur Senkung der CPU Last MyStreamSynchronize(NULL, order, thr_id); @@ -86,3 +79,46 @@ uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, u return result; } + +__global__ +void cuda_check_gpu_hash_fast(int threads, uint32_t startNounce, uint32_t *hashEnd, uint32_t *resNounce) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + /* only test the last 2 dwords, ok for most algos */ + int hashPos = thread << 4; + uint32_t *inpHash = &hashEnd[hashPos]; + + if (inpHash[7] <= pTarget[7] && inpHash[6] <= pTarget[6]) { + uint32_t nounce = (startNounce + thread); + if (resNounce[0] > nounce) + resNounce[0] = nounce; + } + } +} + +__host__ +uint32_t cuda_check_hash_fast(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, int order) +{ + uint32_t result = 0xffffffff; + cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t)); + + const int threadsperblock = 256; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + cuda_check_gpu_hash_fast <<>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]); + + // MyStreamSynchronize(NULL, order, thr_id); + cudaThreadSynchronize(); + + cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + + // cudaMemcpy() was asynchron ? + // cudaThreadSynchronize(); + result = *h_resNounce[thr_id]; + + return result; +} diff --git a/cuda_helper.h b/cuda_helper.h index 39b4354..2d5af22 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -20,6 +20,7 @@ extern int device_sm[8]; extern void cuda_check_cpu_init(int thr_id, int threads); extern void cuda_check_cpu_setTarget(const void *ptarget); extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order); +extern uint32_t cuda_check_hash_fast(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, int order); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern __device__ __device_builtin__ void __syncthreads(void); diff --git a/x11/x11.cu b/x11/x11.cu index 2b3596e..b48dde6 100644 --- a/x11/x11.cu +++ b/x11/x11.cu @@ -191,6 +191,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); // Scan nach Gewinner Hashes auf der GPU + //foundNonce = cuda_check_hash_fast(thr_id, throughput, pdata[19], d_hash[thr_id], order++); foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); if (foundNonce != 0xffffffff) {