checkhash: simplify the common function

use klaus trivial function, the old code has always been a bit weird.. split cuda_check_cpu_hash_64 in two functions, keep old for branched stuff
10 years ago · 118a6be361
15 changed files with 114 additions and 87 deletions
--- a/JHA/jackpotcoin.cu
+++ b/JHA/jackpotcoin.cu
@ -34,6 +34,8 @@ extern void jackpot_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t st
 											uint32_t *d_nonces2, size_t *nrm2,
 											int order);
 extern uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
 // Speicher zur Generierung der Noncevektoren für die bedingten Hashes
 static uint32_t *d_jackpotNonces[8];
 static uint32_t *d_branch1Nonces[8];
@ -205,8 +207,7 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
 			quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
 		}
-		// Scan nach Gewinner Hashes auf der GPU
+		uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
 		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
 		if  (foundNonce != 0xffffffff)
 		{
 			unsigned int rounds;
--- a/cuda_checkhash.cu
+++ b/cuda_checkhash.cu
@ -1,45 +1,16 @@
 /**
 * This code compares final hash against target
 */
 #include <stdio.h>
 #include <memory.h>
 #include "cuda_helper.h"
 // Hash Target gegen das wir testen sollen
 __constant__ uint32_t pTarget[8];
 static uint32_t *d_resNounce[8];
 static uint32_t *h_resNounce[8];
 __global__
 void cuda_check_gpu_hash_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		// bestimme den aktuellen Zähler
 		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
 		uint32_t hashPosition = (nounce - startNounce) << 4;
 		uint32_t *inpHash = &g_hash[hashPosition];
 		uint32_t hash[8];
 		#pragma unroll 8
 		for (int i=0; i < 8; i++)
 			hash[i] = inpHash[i];
 		for (int i = 7; i >= 0; i--) {
 			if (hash[i] > pTarget[i]) {
 				return;
 			}
 			if (hash[i] <= pTarget[i]) {
 				break;
 			}
 		}
 		if (resNounce[0] > nounce)
 			resNounce[0] = nounce;
 	}
 }
 // Setup-Funktionen
 __host__
 void cuda_check_cpu_init(int thr_id, int threads)
 {
@ -54,70 +25,133 @@ void cuda_check_cpu_setTarget(const void *ptarget)
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
 }
 /* --------------------------------------------------------------------------------------------- */
 __device__ __forceinline__
 static bool hashbelowtarget(const uint32_t *const __restrict__ hash, const uint32_t *const __restrict__ target)
 {
 	if (hash[7] > target[7])
 		return false;
 	if (hash[7] < target[7])
 		return true;
 	if (hash[6] > target[6])
 		return false;
 	if (hash[6] < target[6])
 		return true;
 	if (hash[5] > target[5])
 		return false;
 	if (hash[5] < target[5])
 		return true;
 	if (hash[4] > target[4])
 		return false;
 	if (hash[4] < target[4])
 		return true;
 	if (hash[3] > target[3])
 		return false;
 	if (hash[3] < target[3])
 		return true;
 	if (hash[2] > target[2])
 		return false;
 	if (hash[2] < target[2])
 		return true;
 	if (hash[1] > target[1])
 		return false;
 	if (hash[1] < target[1])
 		return true;
 	if (hash[0] > target[0])
 		return false;
 	return true;
 }
 __global__ __launch_bounds__(512, 4)
 void cuda_checkhash_64(int threads, uint32_t startNounce, uint32_t *hash, uint32_t *resNounce)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		// shl 4 = *16 x 4 (uint32) = 64 bytes
 		uint32_t *inpHash = &hash[thread << 4];
 		if (hashbelowtarget(inpHash, pTarget)) {
 			uint32_t nounce = (startNounce + thread);
 			resNounce[0] = nounce;
 		}
 	}
 }
 __host__
-uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
+uint32_t cuda_check_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash)
 {
 	uint32_t result = 0xffffffff;
 	cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
-	const int threadsperblock = 256;
+	const int threadsperblock = 512;
-	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
-	cuda_check_gpu_hash_64 <<<grid, block>>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);
+	cuda_checkhash_64 <<<grid, block>>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]);
-	// Strategisches Sleep Kommando zur Senkung der CPU Last
+	cudaThreadSynchronize();
 	MyStreamSynchronize(NULL, order, thr_id);
 	// Ergebnis zum Host kopieren (in page locked memory, damits schneller geht)
 	cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 	// cudaMemcpy() ist asynchron!
 	cudaThreadSynchronize();
 	result = *h_resNounce[thr_id];
 	return result;
 }
 /* --------------------------------------------------------------------------------------------- */
 __global__
-void cuda_check_gpu_hash_fast(int threads, uint32_t startNounce, uint32_t *hashEnd, uint32_t *resNounce)
+void cuda_check_hash_branch_64(int threads, uint32_t startNounce, uint32_t *g_nonceVector, uint32_t *g_hash, uint32_t *resNounce)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		/* only test the last 2 dwords, ok for most algos */
+		uint32_t nounce = g_nonceVector[thread];
-		int hashPos = thread << 4;
+		uint32_t hashPosition = (nounce - startNounce) << 4;
-		uint32_t *inpHash = &hashEnd[hashPos];
+		uint32_t *inpHash = &g_hash[hashPosition];
 		//uint32_t hash[8];
-		if (inpHash[7] <= pTarget[7] && inpHash[6] <= pTarget[6]) {
+		//#pragma unroll 8
-			uint32_t nounce = (startNounce + thread);
+		//for (int i=0; i < 8; i++)
 		//	hash[i] = inpHash[i];
 		for (int i = 7; i >= 0; i--) {
 			if (inpHash[i] > pTarget[i]) {
 				return;
 			}
 			if (inpHash[i] < pTarget[i]) {
 				break;
 			}
 		}
 		if (resNounce[0] > nounce)
 			resNounce[0] = nounce;
 	}
 	}
 }
 __host__
-uint32_t cuda_check_hash_fast(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, int order)
+uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order)
 {
 	uint32_t result = 0xffffffff;
 	cudaMemset(d_resNounce[thr_id], 0xff, sizeof(uint32_t));
 	const int threadsperblock = 256;
-	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
-	cuda_check_gpu_hash_fast <<<grid, block>>> (threads, startNounce, d_inputHash, d_resNounce[thr_id]);
+	cuda_check_hash_branch_64 <<<grid, block>>> (threads, startNounce, d_nonceVector, d_inputHash, d_resNounce[thr_id]);
-	// MyStreamSynchronize(NULL, order, thr_id);
+	MyStreamSynchronize(NULL, order, thr_id);
 	cudaThreadSynchronize();
 	cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
-	// cudaMemcpy() was asynchron ?
+	cudaThreadSynchronize();
 	// cudaThreadSynchronize();
 	result = *h_resNounce[thr_id];
 	return result;
--- a/cuda_helper.h
+++ b/cuda_helper.h
@ -19,8 +19,7 @@ extern "C"  long device_sm[8];
 // common functions
 extern void cuda_check_cpu_init(int thr_id, int threads);
 extern void cuda_check_cpu_setTarget(const void *ptarget);
-extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+extern uint32_t cuda_check_hash(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash);
 extern uint32_t cuda_check_hash_fast(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, int order);
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func);
--- a/cuda_nist5.cu
+++ b/cuda_nist5.cu
@ -11,7 +11,6 @@ extern "C"
 #include "cuda_helper.h"
 // Speicher für Input/Output der verketteten Hashfunktionen
 static uint32_t *d_hash[8];
 extern void quark_blake512_cpu_init(int thr_id, int threads);
@ -113,8 +112,7 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
 		quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		// Scan nach Gewinner Hashes auf der GPU
+		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		if  (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
--- a/quark/animecoin.cu
+++ b/quark/animecoin.cu
@ -48,7 +48,9 @@ extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads,
 											uint32_t *d_nonces1, size_t *nrm1,
 											int order);
-// Original Quarkhash Funktion aus einem miner Quelltext
+extern uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
 /* CPU Hash */
 extern "C" void animehash(void *state, const void *input)
 {
    sph_blake512_context ctx_blake;
@ -255,7 +257,7 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,
 		quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
 		// Scan nach Gewinner Hashes auf der GPU
-		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
 		if  (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
--- a/quark/quarkcoin.cu
+++ b/quark/quarkcoin.cu
@ -50,6 +50,8 @@ extern void quark_compactTest_single_false_cpu_hash_64(int thr_id, int threads,
 											uint32_t *d_nonces1, size_t *nrm1,
 											int order);
 extern uint32_t cuda_check_hash_branch(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
 // Original Quarkhash Funktion aus einem miner Quelltext
 extern "C" void quarkhash(void *state, const void *input)
 {
@ -227,7 +229,7 @@ extern "C" int scanhash_quark(int thr_id, uint32_t *pdata,
 		quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);
 		// Scan nach Gewinner Hashes auf der GPU
-		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
 		if  (foundNonce != 0xffffffff)
 		{
 			const uint32_t Htarg = ptarget[7];
--- a/qubit/deep.cu
+++ b/qubit/deep.cu
@ -93,7 +93,8 @@ extern "C" int scanhash_deep(int thr_id, uint32_t *pdata,
 		qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+
 		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
--- a/qubit/qubit.cu
+++ b/qubit/qubit.cu
@ -120,7 +120,7 @@ extern "C" int scanhash_qubit(int thr_id, uint32_t *pdata,
 		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
--- a/x11/fresh.cu
+++ b/x11/fresh.cu
@ -123,7 +123,7 @@ extern "C" int scanhash_fresh(int thr_id, uint32_t *pdata,
 		print_hash((unsigned char*)buf); printf("\n");
 #endif
-		foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
--- a/x11/s3.cu
+++ b/x11/s3.cu
@ -99,7 +99,7 @@ extern "C" int scanhash_s3(int thr_id, uint32_t *pdata,
 		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != 0xffffffff)
 		{
--- a/x11/x11.cu
+++ b/x11/x11.cu
@ -58,10 +58,6 @@ extern void quark_compactTest_cpu_init(int thr_id, int threads);
 extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
                                          uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order);
 // to check... new sp method
 //extern void x11_echo512_cpu_setTarget(const void *ptarget);
 //extern uint32_t x11_echo512_cpu_hash_64_final(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 // X11 CPU Hash
 extern "C" void x11hash(void *output, const void *input)
 {
@ -172,7 +168,6 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 	quark_blake512_cpu_setBlock_80((void*)endiandata);
 	//x11_echo512_cpu_setTarget(ptarget);
 	cuda_check_cpu_setTarget(ptarget);
 	do {
@ -193,11 +188,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		// todo...
+		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		//foundNonce = x11_echo512_cpu_hash_64_final(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		//foundNonce = cuda_check_hash_fast(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		if (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];
--- a/x13/x13.cu
+++ b/x13/x13.cu
@ -208,8 +208,7 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
 		x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		// Scan nach Gewinner Hashes auf der GPU
+		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		if  (foundNonce != 0xffffffff)
 		{
 			const uint32_t Htarg = ptarget[7];
--- a/x15/x14.cu
+++ b/x15/x14.cu
@ -215,7 +215,7 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
 		x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != 0xffffffff)
 		{
 			/* check now with the CPU to confirm */
--- a/x15/x15.cu
+++ b/x15/x15.cu
@ -229,7 +229,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 		x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		/* Scan with GPU */
-		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != 0xffffffff)
 		{
--- a/x17/x17.cu
+++ b/x17/x17.cu
@ -253,7 +253,7 @@ extern "C" int scanhash_x17(int thr_id, uint32_t *pdata,
 		x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		uint32_t foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != 0xffffffff)
 		{
 			uint32_t vhash64[8];