diff --git a/cuda_checkhash.cu b/cuda_checkhash.cu index aeafcd8..236bfbf 100644 --- a/cuda_checkhash.cu +++ b/cuda_checkhash.cu @@ -4,18 +4,21 @@ #include #include +#include "miner.h" + #include "cuda_helper.h" -__constant__ uint32_t pTarget[8]; +__constant__ uint32_t pTarget[8]; // 32 bytes -static uint32_t *d_resNounce[8]; -static uint32_t *h_resNounce[8]; +// store 8 device arrays of 8 nonces +static uint32_t* h_resNonces[8]; +static uint32_t* d_resNonces[8]; __host__ void cuda_check_cpu_init(int thr_id, int threads) { - CUDA_CALL_OR_RET(cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t))); - CUDA_CALL_OR_RET(cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t))); + CUDA_CALL_OR_RET(cudaMallocHost(&h_resNonces[thr_id], 8*sizeof(uint32_t))); + CUDA_CALL_OR_RET(cudaMalloc(&d_resNonces[thr_id], 8*sizeof(uint32_t))); } // Target Difficulty @@ -68,17 +71,18 @@ static bool hashbelowtarget(const uint32_t *const __restrict__ hash, const uint3 } __global__ __launch_bounds__(512, 4) -void cuda_checkhash_64(int threads, uint32_t startNounce, uint32_t *hash, uint32_t *resNounce) +void cuda_checkhash_64(int threads, uint32_t startNounce, uint32_t *hash, uint32_t *resNonces) { int thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { // shl 4 = *16 x 4 (uint32) = 64 bytes + // todo: use only 32 bytes * threads if possible uint32_t *inpHash = &hash[thread << 4]; - if (hashbelowtarget(inpHash, pTarget)) { - uint32_t nounce = (startNounce + thread); - resNounce[0] = nounce; + if (resNonces[0] == UINT32_MAX) { + if (hashbelowtarget(inpHash, pTarget)) + resNonces[0] = (startNounce + thread); } } } @@ -86,20 +90,61 @@ void cuda_checkhash_64(int threads, uint32_t startNounce, uint32_t *hash, uint32 __host__ uint32_t cuda_check_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash) { - uint32_t result = 0xffffffff; - cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t)); + cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t)); const int threadsperblock = 512; dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - cuda_checkhash_64 <<>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]); + cuda_checkhash_64 <<>> (threads, startNounce, d_inputHash, d_resNonces[thr_id]); + cudaThreadSynchronize(); + + cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + return h_resNonces[thr_id][0]; +} + +/* --------------------------------------------------------------------------------------------- */ +__global__ __launch_bounds__(512, 4) +void cuda_checkhash_64_suppl(uint32_t startNounce, uint32_t *hash, uint32_t *resNonces) +{ + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + + uint32_t *inpHash = &hash[thread << 4]; + + if (hashbelowtarget(inpHash, pTarget)) { + int resNum = ++resNonces[0]; + __threadfence(); + if (resNum < 8) + resNonces[resNum] = (startNounce + thread); + } +} + +__host__ +uint32_t cuda_check_hash_suppl(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, uint8_t numNonce) +{ + uint32_t rescnt, result = 0; + + const int threadsperblock = 512; + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + // first element stores the count of found nonces + cudaMemset(d_resNonces[thr_id], 0, sizeof(uint32_t)); + + cuda_checkhash_64_suppl <<>> (startNounce, d_inputHash, d_resNonces[thr_id]); cudaThreadSynchronize(); - cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); - result = *h_resNounce[thr_id]; + cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost); + rescnt = h_resNonces[thr_id][0]; + if (rescnt > numNonce) { + if (numNonce <= rescnt) { + result = h_resNonces[thr_id][numNonce+1]; + } + if (opt_debug) + applog(LOG_WARNING, "Found %d nonces: %x + %x", rescnt, h_resNonces[thr_id][1], result); + } return result; } @@ -115,11 +160,6 @@ void cuda_check_hash_branch_64(int threads, uint32_t startNounce, uint32_t *g_no uint32_t nounce = g_nonceVector[thread]; uint32_t hashPosition = (nounce - startNounce) << 4; uint32_t *inpHash = &g_hash[hashPosition]; - //uint32_t hash[8]; - - //#pragma unroll 8 - //for (int i=0; i < 8; i++) - // hash[i] = inpHash[i]; for (int i = 7; i >= 0; i--) { if (inpHash[i] > pTarget[i]) { @@ -138,21 +178,21 @@ __host__ uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order) { uint32_t result = 0xffffffff; - cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t)); + cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t)); const int threadsperblock = 256; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - cuda_check_hash_branch_64 <<>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]); + cuda_check_hash_branch_64 <<>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNonces[thr_id]); MyStreamSynchronize(NULL, order, thr_id); - cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); cudaThreadSynchronize(); - result = *h_resNounce[thr_id]; + result = *h_resNonces[thr_id]; return result; } \ No newline at end of file diff --git a/cuda_helper.h b/cuda_helper.h index 255ce45..4714103 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -20,6 +20,7 @@ extern "C" long device_sm[8]; extern void cuda_check_cpu_init(int thr_id, int threads); extern void cuda_check_cpu_setTarget(const void *ptarget); extern uint32_t cuda_check_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash); +extern uint32_t cuda_check_hash_suppl(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, uint8_t numNonce); extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func); diff --git a/x11/x11.cu b/x11/x11.cu index 6c77c34..6a838e4 100644 --- a/x11/x11.cu +++ b/x11/x11.cu @@ -155,7 +155,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, if (x11_simd512_cpu_init(thr_id, throughput) != 0) { return 0; } - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0); + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 64 * throughput), 0); // why 64 ? cuda_check_cpu_init(thr_id, throughput); @@ -195,9 +195,12 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, be32enc(&endiandata[19], foundNonce); x11hash(vhash64, endiandata); - if ((vhash64[7] <= Htarg) && fulltest(vhash64, ptarget)) { + /* uint32_t secNonce = */ cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + + if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { + // just check if there was some other ones... + *hashes_done = pdata[19] - first_nonce + throughput; pdata[19] = foundNonce; - *hashes_done = foundNonce - first_nonce + 1; return 1; } else if (vhash64[7] > Htarg) {