From f387898ead48530e11ca5c13395c2816a4b85067 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Fri, 5 Dec 2014 07:08:13 +0100
Subject: [PATCH] Prepare multiple nonces support in one loop (if found)

Tested on x11 which find sometimes 3 nonces in one call,
actually they are ignored because only the biggest was kept...

This commit doesnt fix that, but will allow to enhance shares rate later...
---
 cuda_checkhash.cu | 86 ++++++++++++++++++++++++++++++++++-------------
 cuda_helper.h     |  1 +
 x11/x11.cu        |  9 +++--
 3 files changed, 70 insertions(+), 26 deletions(-)

diff --git a/cuda_checkhash.cu b/cuda_checkhash.cu
index aeafcd8..236bfbf 100644
--- a/cuda_checkhash.cu
+++ b/cuda_checkhash.cu
@@ -4,18 +4,21 @@
 #include <stdio.h>
 #include <memory.h>
 
+#include "miner.h"
+
 #include "cuda_helper.h"
 
-__constant__ uint32_t pTarget[8];
+__constant__ uint32_t pTarget[8]; // 32 bytes
 
-static uint32_t *d_resNounce[8];
-static uint32_t *h_resNounce[8];
+// store 8 device arrays of 8 nonces
+static uint32_t* h_resNonces[8];
+static uint32_t* d_resNonces[8];
 
 __host__
 void cuda_check_cpu_init(int thr_id, int threads)
 {
-    CUDA_CALL_OR_RET(cudaMallocHost(&h_resNounce[thr_id], 1*sizeof(uint32_t)));
-    CUDA_CALL_OR_RET(cudaMalloc(&d_resNounce[thr_id], 1*sizeof(uint32_t)));
+    CUDA_CALL_OR_RET(cudaMallocHost(&h_resNonces[thr_id], 8*sizeof(uint32_t)));
+    CUDA_CALL_OR_RET(cudaMalloc(&d_resNonces[thr_id], 8*sizeof(uint32_t)));
 }
 
 // Target Difficulty
@@ -68,17 +71,18 @@ static bool hashbelowtarget(const uint32_t *const __restrict__ hash, const uint3
 }
 
 __global__ __launch_bounds__(512, 4)
-void cuda_checkhash_64(int threads, uint32_t startNounce, uint32_t *hash, uint32_t *resNounce)
+void cuda_checkhash_64(int threads, uint32_t startNounce, uint32_t *hash, uint32_t *resNonces)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		// shl 4 = *16 x 4 (uint32) = 64 bytes
+		// todo: use only 32 bytes * threads if possible
 		uint32_t *inpHash = &hash[thread << 4];
 
-		if (hashbelowtarget(inpHash, pTarget)) {
-			uint32_t nounce = (startNounce + thread);
-			resNounce[0] = nounce;
+		if (resNonces[0] == UINT32_MAX) {
+			if (hashbelowtarget(inpHash, pTarget))
+				resNonces[0] = (startNounce + thread);
 		}
 	}
 }
@@ -86,20 +90,61 @@ void cuda_checkhash_64(int threads, uint32_t startNounce, uint32_t *hash, uint32
 __host__
 uint32_t cuda_check_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash)
 {
-	uint32_t result = 0xffffffff;
-	cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
+	cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t));
 
 	const int threadsperblock = 512;
 
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 
-	cuda_checkhash_64 <<<grid, block>>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]);
+	cuda_checkhash_64 <<<grid, block>>> (threads, startNounce, d_inputHash, d_resNonces[thr_id]);
+	cudaThreadSynchronize();
+
+	cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	return h_resNonces[thr_id][0];
+}
+
+/* --------------------------------------------------------------------------------------------- */
 
+__global__ __launch_bounds__(512, 4)
+void cuda_checkhash_64_suppl(uint32_t startNounce, uint32_t *hash, uint32_t *resNonces)
+{
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+
+	uint32_t *inpHash = &hash[thread << 4];
+
+	if (hashbelowtarget(inpHash, pTarget)) {
+		int resNum = ++resNonces[0];
+		__threadfence();
+		if (resNum < 8)
+			resNonces[resNum] = (startNounce + thread);
+	}
+}
+
+__host__
+uint32_t cuda_check_hash_suppl(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, uint8_t numNonce)
+{
+	uint32_t rescnt, result = 0;
+
+	const int threadsperblock = 512;
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+
+	// first element stores the count of found nonces
+	cudaMemset(d_resNonces[thr_id], 0, sizeof(uint32_t));
+
+	cuda_checkhash_64_suppl <<<grid, block>>> (startNounce, d_inputHash, d_resNonces[thr_id]);
 	cudaThreadSynchronize();
 
-	cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
-	result = *h_resNounce[thr_id];
+	cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], 8*sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	rescnt = h_resNonces[thr_id][0];
+	if (rescnt > numNonce) {
+		if (numNonce <= rescnt) {
+			result = h_resNonces[thr_id][numNonce+1];
+		}
+		if (opt_debug)
+			applog(LOG_WARNING, "Found %d nonces: %x + %x", rescnt, h_resNonces[thr_id][1], result);
+	}
 
 	return result;
 }
@@ -115,11 +160,6 @@ void cuda_check_hash_branch_64(int threads, uint32_t startNounce, uint32_t *g_no
 		uint32_t nounce = g_nonceVector[thread];
 		uint32_t hashPosition = (nounce - startNounce) << 4;
 		uint32_t *inpHash = &g_hash[hashPosition];
-		//uint32_t hash[8];
-
-		//#pragma unroll 8
-		//for (int i=0; i < 8; i++)
-		//	hash[i] = inpHash[i];
 
 		for (int i = 7; i >= 0; i--) {
 			if (inpHash[i] > pTarget[i]) {
@@ -138,21 +178,21 @@ __host__
 uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
 {
 	uint32_t result = 0xffffffff;
-	cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
+	cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t));
 
 	const int threadsperblock = 256;
 
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	cuda_check_hash_branch_64 <<<grid, block>>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);
+	cuda_check_hash_branch_64 <<<grid, block>>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNonces[thr_id]);
 
 	MyStreamSynchronize(NULL, order, thr_id);
 
-	cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	cudaMemcpy(h_resNonces[thr_id], d_resNonces[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 
 	cudaThreadSynchronize();
-	result = *h_resNounce[thr_id];
+	result = *h_resNonces[thr_id];
 
 	return result;
 }
\ No newline at end of file
diff --git a/cuda_helper.h b/cuda_helper.h
index 255ce45..4714103 100644
--- a/cuda_helper.h
+++ b/cuda_helper.h
@@ -20,6 +20,7 @@ extern "C"  long device_sm[8];
 extern void cuda_check_cpu_init(int thr_id, int threads);
 extern void cuda_check_cpu_setTarget(const void *ptarget);
 extern uint32_t cuda_check_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash);
+extern uint32_t cuda_check_hash_suppl(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, uint8_t numNonce);
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func);
 
diff --git a/x11/x11.cu b/x11/x11.cu
index 6c77c34..6a838e4 100644
--- a/x11/x11.cu
+++ b/x11/x11.cu
@@ -155,7 +155,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 		if (x11_simd512_cpu_init(thr_id, throughput) != 0) {
 			return 0;
 		}
-		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0);
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 64 * throughput), 0); // why 64 ?
 
 		cuda_check_cpu_init(thr_id, throughput);
 
@@ -195,9 +195,12 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 			be32enc(&endiandata[19], foundNonce);
 			x11hash(vhash64, endiandata);
 
-			if ((vhash64[7] <= Htarg) && fulltest(vhash64, ptarget)) {
+			/* uint32_t secNonce = */ cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+
+			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
+				// just check if there was some other ones...
+				*hashes_done = pdata[19] - first_nonce + throughput;
 				pdata[19] = foundNonce;
-				*hashes_done = foundNonce - first_nonce + 1;
 				return 1;
 			}
 			else if (vhash64[7] > Htarg) {