checkhash: simplify the common function

use klaus trivial function, the old code has always been a bit weird.. split cuda_check_cpu_hash_64 in two functions, keep old for branched stuff
2025-08-26 05:42:02 +00:00 · 2014-11-30 23:35:43 +01:00 · 2014-11-30 23:35:43 +01:00 · 118a6be361
commit 118a6be361
parent cc9df2d361
15 changed files with 137 additions and 110 deletions
--- a/JHA/jackpotcoin.cu
+++ b/JHA/jackpotcoin.cu
@ -34,6 +34,8 @@ extern void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t st
 											uint32_t *d_nonces2, size_t *nrm2,
 											int order);

+extern uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+
 // Speicher zur Generierung der Noncevektoren für die bedingten Hashes
 static uint32_t *d_jackpotNonces[8];
 static uint32_t *d_branch1Nonces[8];
@ -205,8 +207,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
 			quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
 		}

-		// Scan nach Gewinner Hashes auf der GPU
-		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
 		if  (foundNonce != 0xffffffff)
 		{
 			unsigned int rounds;
--- a/cuda_checkhash.cu
+++ b/cuda_checkhash.cu
@ -1,45 +1,16 @@
+/**
+ * This code compares final hash against target
+ */
 #include <stdio.h>
 #include <memory.h>

 #include "cuda_helper.h"

-// Hash Target gegen das wir testen sollen
 __constant__ uint32_t pTarget[8];

 static uint32_t *d_resNounce[8];
 static uint32_t *h_resNounce[8];

-__global__
-void cuda_check_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce)
-{
-	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	if (thread < threads)
-	{
-		// bestimme den aktuellen Zähler
-		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
-
-		uint32_t hashPosition = (nounce - startNounce) << 4;
-		uint32_t *inpHash = &g_hash[hashPosition];
-		uint32_t hash[8];
-
-		#pragma unroll 8
-		for (int i=0; i < 8; i++)
-			hash[i] = inpHash[i];
-
-		for (int i = 7; i >= 0; i--) {
-			if (hash[i] > pTarget[i]) {
-				return;
-			}
-			if (hash[i] <= pTarget[i]) {
-				break;
-			}
-		}
-		if (resNounce[0] > nounce)
-			resNounce[0] = nounce;
-	}
-}
-
-// Setup-Funktionen
 __host__
 void cuda_check_cpu_init(int thr_id, int threads)
 {
@ -54,8 +25,117 @@ void cuda_check_cpu_setTarget(const void *ptarget)
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
 }

+/* --------------------------------------------------------------------------------------------- */
+
+__device__ __forceinline__
+static bool hashbelowtarget(const uint32_t *const __restrict__ hash, const uint32_t *const __restrict__ target)
+{
+	if (hash[7] > target[7])
+		return false;
+	if (hash[7] < target[7])
+		return true;
+	if (hash[6] > target[6])
+		return false;
+	if (hash[6] < target[6])
+		return true;
+
+	if (hash[5] > target[5])
+		return false;
+	if (hash[5] < target[5])
+		return true;
+	if (hash[4] > target[4])
+		return false;
+	if (hash[4] < target[4])
+		return true;
+
+	if (hash[3] > target[3])
+		return false;
+	if (hash[3] < target[3])
+		return true;
+	if (hash[2] > target[2])
+		return false;
+	if (hash[2] < target[2])
+		return true;
+
+	if (hash[1] > target[1])
+		return false;
+	if (hash[1] < target[1])
+		return true;
+	if (hash[0] > target[0])
+		return false;
+
+	return true;
+}
+
+__global__ __launch_bounds__(512, 4)
+void cuda_checkhash_64(int threads, uint32_t startNounce, uint32_t *hash, uint32_t *resNounce)
+{
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		// shl 4 = *16 x 4 (uint32) = 64 bytes
+		uint32_t *inpHash = &hash[thread << 4];
+
+		if (hashbelowtarget(inpHash, pTarget)) {
+			uint32_t nounce = (startNounce + thread);
+			resNounce[0] = nounce;
+		}
+	}
+}
+
 __host__
-uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
+uint32_t cuda_check_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash)
+{
+	uint32_t result = 0xffffffff;
+	cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
+
+	const int threadsperblock = 512;
+
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+
+	cuda_checkhash_64 <<<grid, block>>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]);
+
+	cudaThreadSynchronize();
+
+	cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	result = *h_resNounce[thr_id];
+
+	return result;
+}
+
+/* --------------------------------------------------------------------------------------------- */
+
+__global__
+void cuda_check_hash_branch_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce)
+{
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint32_t nounce = g_nonceVector[thread];
+		uint32_t hashPosition = (nounce - startNounce) << 4;
+		uint32_t *inpHash = &g_hash[hashPosition];
+		//uint32_t hash[8];
+
+		//#pragma unroll 8
+		//for (int i=0; i < 8; i++)
+		//	hash[i] = inpHash[i];
+
+		for (int i = 7; i >= 0; i--) {
+			if (inpHash[i] > pTarget[i]) {
+				return;
+			}
+			if (inpHash[i] < pTarget[i]) {
+				break;
+			}
+		}
+		if (resNounce[0] > nounce)
+			resNounce[0] = nounce;
+	}
+}
+
+__host__
+uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
 {
 	uint32_t result = 0xffffffff;
 	cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
@ -65,60 +145,14 @@ uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, u
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);

-	cuda_check_gpu_hash_64 <<<grid, block>>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);
+	cuda_check_hash_branch_64 <<<grid, block>>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);

-	// Strategisches Sleep Kommando zur Senkung der CPU Last
 	MyStreamSynchronize(NULL, order, thr_id);

-	// Ergebnis zum Host kopieren (in page locked memory, damits schneller geht)
 	cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);

-	// cudaMemcpy() ist asynchron!
 	cudaThreadSynchronize();
 	result = *h_resNounce[thr_id];

 	return result;
-}
-
-__global__
-void cuda_check_gpu_hash_fast(int threads, uint32_t startNounce, uint32_t *hashEnd, uint32_t *resNounce)
-{
-	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	if (thread < threads)
-	{
-		/* only test the last 2 dwords, ok for most algos */
-		int hashPos = thread << 4;
-		uint32_t *inpHash = &hashEnd[hashPos];
-
-		if (inpHash[7] <= pTarget[7] && inpHash[6] <= pTarget[6]) {
-			uint32_t nounce = (startNounce + thread);
-			if (resNounce[0] > nounce)
-				resNounce[0] = nounce;
-		}
-	}
-}
-
-__host__
-uint32_t cuda_check_hash_fast(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, int order)
-{
-	uint32_t result = 0xffffffff;
-	cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
-
-	const int threadsperblock = 256;
-
-	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
-	dim3 block(threadsperblock);
-
-	cuda_check_gpu_hash_fast <<<grid, block>>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]);
-
-	// MyStreamSynchronize(NULL, order, thr_id);
-	cudaThreadSynchronize();
-
-	cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
-
-	// cudaMemcpy() was asynchron ?
-	// cudaThreadSynchronize();
-	result = *h_resNounce[thr_id];
-
-	return result;
-}
+}
--- a/cuda_helper.h
+++ b/cuda_helper.h
@ -19,8 +19,7 @@ extern "C"  long device_sm[8];
 // common functions
 extern void cuda_check_cpu_init(int thr_id, int threads);
 extern void cuda_check_cpu_setTarget(const void *ptarget);
-extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
-extern uint32_t cuda_check_hash_fast(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, int order);
+extern uint32_t cuda_check_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash);
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func);

--- a/cuda_nist5.cu
+++ b/cuda_nist5.cu
@ -11,7 +11,6 @@ extern "C"

 #include "cuda_helper.h"

-// Speicher für Input/Output der verketteten Hashfunktionen
 static uint32_t *d_hash[8];

 extern void quark_blake512_cpu_init(int thr_id, int threads);
@ -113,8 +112,7 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
 		quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

-		// Scan nach Gewinner Hashes auf der GPU
-		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if  (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
--- a/quark/animecoin.cu
+++ b/quark/animecoin.cu
@ -48,7 +48,9 @@ extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads,
 											uint32_t *d_nonces1, size_t *nrm1,
 											int order);

-// Original Quarkhash Funktion aus einem miner Quelltext
+extern uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+
+/* CPU Hash */
 extern "C" void animehash(void *state, const void *input)
 {
    sph_blake512_context ctx_blake;
@ -255,7 +257,7 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
 		quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);

 		// Scan nach Gewinner Hashes auf der GPU
-		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
 		if  (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
--- a/quark/quarkcoin.cu
+++ b/quark/quarkcoin.cu
@ -50,6 +50,8 @@ extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads,
 											uint32_t *d_nonces1, size_t *nrm1,
 											int order);

+extern uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+
 // Original Quarkhash Funktion aus einem miner Quelltext
 extern "C" void quarkhash(void *state, const void *input)
 {
@ -227,7 +229,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
 		quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);

 		// Scan nach Gewinner Hashes auf der GPU
-		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
 		if  (foundNonce != 0xffffffff)
 		{
 			const uint32_t Htarg = ptarget[7];
--- a/qubit/deep.cu
+++ b/qubit/deep.cu
@ -93,7 +93,8 @@ extern "C" int scanhash_deep(int thr_id, uint32_t *pdata,
 		qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+
+		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
--- a/qubit/qubit.cu
+++ b/qubit/qubit.cu
@ -120,7 +120,7 @@ extern "C" int scanhash_qubit(int thr_id, uint32_t *pdata,
 		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

-		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
--- a/x11/fresh.cu
+++ b/x11/fresh.cu
@ -123,7 +123,7 @@ extern "C" int scanhash_fresh(int thr_id, uint32_t *pdata,
 		print_hash((unsigned char*)buf); printf("\n");
 #endif

-		foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
--- a/x11/s3.cu
+++ b/x11/s3.cu
@ -99,7 +99,7 @@ extern "C" int scanhash_s3(int thr_id, uint32_t *pdata,
 		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

-		foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);

 		if (foundNonce != 0xffffffff)
 		{
--- a/x11/x11.cu
+++ b/x11/x11.cu
@ -58,10 +58,6 @@ extern void quark_compactTest_cpu_init(int thr_id, int threads);
 extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
                                          uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order);

-// to check... new sp method
-//extern void x11_echo512_cpu_setTarget(const void *ptarget);
-//extern uint32_t x11_echo512_cpu_hash_64_final(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
-
 // X11 CPU Hash
 extern "C" void x11hash(void *output, const void *input)
 {
@ -172,7 +168,6 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,

 	quark_blake512_cpu_setBlock_80((void*)endiandata);

-	//x11_echo512_cpu_setTarget(ptarget);
 	cuda_check_cpu_setTarget(ptarget);

 	do {
@ -193,11 +188,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

-		// todo...
-		//foundNonce = x11_echo512_cpu_hash_64_final(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		//foundNonce = cuda_check_hash_fast(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
-
-		foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
--- a/x13/x13.cu
+++ b/x13/x13.cu
@ -208,8 +208,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
 		x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

-		// Scan nach Gewinner Hashes auf der GPU
-		foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if  (foundNonce != 0xffffffff)
 		{
 			const uint32_t Htarg = ptarget[7];
--- a/x15/x14.cu
+++ b/x15/x14.cu
@ -215,7 +215,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
 		x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

-		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != 0xffffffff)
 		{
 			/* check now with the CPU to confirm */
--- a/x15/x15.cu
+++ b/x15/x15.cu
@ -229,7 +229,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 		x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

 		/* Scan with GPU */
-		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);

 		if (foundNonce != 0xffffffff)
 		{
--- a/x17/x17.cu
+++ b/x17/x17.cu
@ -253,7 +253,7 @@ extern "C" int scanhash_x17(int thr_id, uint32_t *pdata,
 		x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

-		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];