From 11dbbcc12d29639abb3c32bb6c3cab428d6d2f3e Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Sun, 9 Nov 2014 10:55:35 +0100
Subject: [PATCH] checkhash: some work on a faster variant (wip)

This should not be used for all algos... not enabled yet

todo: multiple nounces or blake32 style checkup
---
 cuda_checkhash.cu | 62 +++++++++++++++++++++++++++++++++++++----------
 cuda_helper.h     |  1 +
 x11/x11.cu        |  1 +
 3 files changed, 51 insertions(+), 13 deletions(-)

diff --git a/cuda_checkhash.cu b/cuda_checkhash.cu
index a7806aa..e129de8 100644
--- a/cuda_checkhash.cu
+++ b/cuda_checkhash.cu
@@ -18,9 +18,8 @@ void cuda_check_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonce
 		// bestimme den aktuellen Zähler
 		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
 
-		int hashPosition = nounce - startNounce;
-		uint32_t *inpHash = &g_hash[hashPosition<<4];
-
+		uint32_t hashPosition = (nounce - startNounce) << 4;
+		uint32_t *inpHash = &g_hash[hashPosition];
 		uint32_t hash[8];
 
 		#pragma unroll 8
@@ -31,12 +30,11 @@ void cuda_check_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonce
 			if (hash[i] > pTarget[i]) {
 				return;
 			}
-			if (hash[i] < pTarget[i]) {
+			if (hash[i] <= pTarget[i]) {
 				break;
 			}
 		}
-
-		if(resNounce[0] > nounce)
+		if (resNounce[0] > nounce)
 			resNounce[0] = nounce;
 	}
 }
@@ -53,8 +51,7 @@ void cuda_check_cpu_init(int thr_id, int threads)
 __host__
 void cuda_check_cpu_setTarget(const void *ptarget)
 {
-	// die Message zur Berechnung auf der GPU
-	cudaMemcpyToSymbol(pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
 }
 
 __host__
@@ -65,14 +62,10 @@ uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, u
 
 	const int threadsperblock = 256;
 
-	// berechne wie viele Thread Blocks wir brauchen
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	// Größe des dynamischen Shared Memory Bereichs
-	size_t shared_size = 0;
-
-	cuda_check_gpu_hash_64 <<<grid, block, shared_size>>>(threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);
+	cuda_check_gpu_hash_64 <<<grid, block>>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);
 
 	// Strategisches Sleep Kommando zur Senkung der CPU Last
 	MyStreamSynchronize(NULL, order, thr_id);
@@ -86,3 +79,46 @@ uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, u
 
 	return result;
 }
+
+__global__
+void cuda_check_gpu_hash_fast(int threads, uint32_t startNounce, uint32_t *hashEnd, uint32_t *resNounce)
+{
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		/* only test the last 2 dwords, ok for most algos */
+		int hashPos = thread << 4;
+		uint32_t *inpHash = &hashEnd[hashPos];
+
+		if (inpHash[7] <= pTarget[7] && inpHash[6] <= pTarget[6]) {
+			uint32_t nounce = (startNounce + thread);
+			if (resNounce[0] > nounce)
+				resNounce[0] = nounce;
+		}
+	}
+}
+
+__host__
+uint32_t cuda_check_hash_fast(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, int order)
+{
+	uint32_t result = 0xffffffff;
+	cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
+
+	const int threadsperblock = 256;
+
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+
+	cuda_check_gpu_hash_fast <<<grid, block>>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]);
+
+	// MyStreamSynchronize(NULL, order, thr_id);
+	cudaThreadSynchronize();
+
+	cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+
+	// cudaMemcpy() was asynchron ?
+	// cudaThreadSynchronize();
+	result = *h_resNounce[thr_id];
+
+	return result;
+}
diff --git a/cuda_helper.h b/cuda_helper.h
index 39b4354..2d5af22 100644
--- a/cuda_helper.h
+++ b/cuda_helper.h
@@ -20,6 +20,7 @@ extern int device_sm[8];
 extern void cuda_check_cpu_init(int thr_id, int threads);
 extern void cuda_check_cpu_setTarget(const void *ptarget);
 extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern uint32_t cuda_check_hash_fast(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, int order);
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 
 extern __device__ __device_builtin__ void __syncthreads(void);
diff --git a/x11/x11.cu b/x11/x11.cu
index 2b3596e..b48dde6 100644
--- a/x11/x11.cu
+++ b/x11/x11.cu
@@ -191,6 +191,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 
 		// Scan nach Gewinner Hashes auf der GPU
+		//foundNonce = cuda_check_hash_fast(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		if (foundNonce != 0xffffffff)
 		{