Prepare multiple nonces support in one loop (if found)

Tested on x11 which find sometimes 3 nonces in one call, actually they are ignored because only the biggest was kept... This commit doesnt fix that, but will allow to enhance shares rate later...
10 years ago · f387898ead
3 changed files with 70 additions and 26 deletions
--- a/cuda_checkhash.cu
+++ b/cuda_checkhash.cu
@ -4,18 +4,21 @@
 #include <stdio.h>
 #include <memory.h>
 #include "miner.h"
 #include "cuda_helper.h"
-__constant__ uint32_t pTarget[8];
+__constant__ uint32_t pTarget[8]; // 32 bytes
-static uint32_t *d_resNounce[8];
+// store 8 device arrays of 8 nonces
-static uint32_t *h_resNounce[8];
+static uint32_t* h_resNonces[8];
 static uint32_t* d_resNonces[8];
 __host__
 void cuda_check_cpu_init(int thr_id, int threads)
 {
-    CUDA_CALL_OR_RET(cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t)));
+    CUDA_CALL_OR_RET(cudaMallocHost(&h_resNonces[thr_id], 8*sizeof(uint32_t)));
-    CUDA_CALL_OR_RET(cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t)));
+    CUDA_CALL_OR_RET(cudaMalloc(&d_resNonces[thr_id], 8*sizeof(uint32_t)));
 }
 // Target Difficulty
@ -68,17 +71,18 @@ static bool hashbelowtarget(const uint32_t *const __restrict__ hash, const uint3
 }
 __global__ __launch_bounds__(512, 4)
-void cuda_checkhash_64(int threads, uint32_t startNounce, uint32_t *hash, uint32_t *resNounce)
+void cuda_checkhash_64(int threads, uint32_t startNounce, uint32_t *hash, uint32_t *resNonces)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		// shl 4 = *16 x 4 (uint32) = 64 bytes
 		// todo: use only 32 bytes * threads if possible
 		uint32_t *inpHash = &hash[thread << 4];
-		if (hashbelowtarget(inpHash, pTarget)) {
+		if (resNonces[0] == UINT32_MAX) {
-			uint32_t nounce = (startNounce + thread);
+			if (hashbelowtarget(inpHash, pTarget))
-			resNounce[0] = nounce;
+				resNonces[0] = (startNounce + thread);
 		}
 	}
 }
@ -86,20 +90,61 @@ void cuda_checkhash_64(int threads, uint32_t startNounce, uint32_t *hash, uint32
 __host__
 uint32_t cuda_check_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash)
 {
-	uint32_t result = 0xffffffff;
+	cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t));
 	cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
 	const int threadsperblock = 512;
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
-	cuda_checkhash_64 <<<grid, block>>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]);
+	cuda_checkhash_64 <<<grid, block>>> (threads, startNounce, d_inputHash, d_resNonces[thr_id]);
 	cudaThreadSynchronize();
 	cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 	return h_resNonces[thr_id][0];
 }
 /* --------------------------------------------------------------------------------------------- */
 __global__ __launch_bounds__(512, 4)
 void cuda_checkhash_64_suppl(uint32_t startNounce, uint32_t *hash, uint32_t *resNonces)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	uint32_t *inpHash = &hash[thread << 4];
 	if (hashbelowtarget(inpHash, pTarget)) {
 		int resNum = ++resNonces[0];
 		__threadfence();
 		if (resNum < 8)
 			resNonces[resNum] = (startNounce + thread);
 	}
 }
 __host__
 uint32_t cuda_check_hash_suppl(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, uint8_t numNonce)
 {
 	uint32_t rescnt, result = 0;
 	const int threadsperblock = 512;
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 	// first element stores the count of found nonces
 	cudaMemset(d_resNonces[thr_id], 0, sizeof(uint32_t));
 	cuda_checkhash_64_suppl <<<grid, block>>> (startNounce, d_inputHash, d_resNonces[thr_id]);
 	cudaThreadSynchronize();
-	cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost);
-	result = *h_resNounce[thr_id];
+	rescnt = h_resNonces[thr_id][0];
 	if (rescnt > numNonce) {
 		if (numNonce <= rescnt) {
 			result = h_resNonces[thr_id][numNonce+1];
 		}
 		if (opt_debug)
 			applog(LOG_WARNING, "Found %d nonces: %x + %x", rescnt, h_resNonces[thr_id][1], result);
 	}
 	return result;
 }
@ -115,11 +160,6 @@ void cuda_check_hash_branch_64(int threads, uint32_t startNounce, uint32_t *g_no
 		uint32_t nounce = g_nonceVector[thread];
 		uint32_t hashPosition = (nounce - startNounce) << 4;
 		uint32_t *inpHash = &g_hash[hashPosition];
 		//uint32_t hash[8];
 		//#pragma unroll 8
 		//for (int i=0; i < 8; i++)
 		//	hash[i] = inpHash[i];
 		for (int i = 7; i >= 0; i--) {
 			if (inpHash[i] > pTarget[i]) {
@ -138,21 +178,21 @@ __host__
 uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
 {
 	uint32_t result = 0xffffffff;
-	cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
+	cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t));
 	const int threadsperblock = 256;
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
-	cuda_check_hash_branch_64 <<<grid, block>>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);
+	cuda_check_hash_branch_64 <<<grid, block>>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNonces[thr_id]);
 	MyStreamSynchronize(NULL, order, thr_id);
-	cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 	cudaThreadSynchronize();
-	result = *h_resNounce[thr_id];
+	result = *h_resNonces[thr_id];
 	return result;
 }
--- a/cuda_helper.h
+++ b/cuda_helper.h
@ -20,6 +20,7 @@ extern "C"  long device_sm[8];
 extern void cuda_check_cpu_init(int thr_id, int threads);
 extern void cuda_check_cpu_setTarget(const void *ptarget);
 extern uint32_t cuda_check_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash);
 extern uint32_t cuda_check_hash_suppl(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, uint8_t numNonce);
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func);
--- a/x11/x11.cu
+++ b/x11/x11.cu
@ -155,7 +155,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 		if (x11_simd512_cpu_init(thr_id, throughput) != 0) {
 			return 0;
 		}
-		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0);
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 64 * throughput), 0); // why 64 ?
 		cuda_check_cpu_init(thr_id, throughput);
@ -195,9 +195,12 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 			be32enc(&endiandata[19], foundNonce);
 			x11hash(vhash64, endiandata);
-			if ((vhash64[7] <= Htarg) && fulltest(vhash64, ptarget)) {
+			/* uint32_t secNonce = */ cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
 			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
 				// just check if there was some other ones...
 				*hashes_done = pdata[19] - first_nonce + throughput;
 				pdata[19] = foundNonce;
 				*hashes_done = foundNonce - first_nonce + 1;
 				return 1;
 			}
 			else if (vhash64[7] > Htarg) {