Check and submit multiple nonces in one loop

Added to most algos: the checkhash function scans a big range and can find multiple nonces at once if the difficulty is low. Stop ignoring them and submit the second one if found. Also clean up the draft rc=2 code that was implemented for blake and pentablake, and fix the reduced displayed hashrate when a nonce is found. Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
2025-03-13 05:51:19 +00:00 · 2014-12-05 14:59:32 +01:00 · 2014-12-05 14:59:32 +01:00 · c3bdb623e8
commit c3bdb623e8
parent f387898ead
20 changed files with 217 additions and 196 deletions
--- a/JHA/jackpotcoin.cu
+++ b/JHA/jackpotcoin.cu
@ -218,13 +218,18 @@ extern "C" int scanhash_jackpot(int thr_id, uint32_t *pdata,
 			// diese jackpothash Funktion gibt die Zahl der Runden zurück
 			rounds = jackpothash(vhash64, endiandata);

-			if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {
-
+			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
+				int res = 1;
+				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				*hashes_done = pdata[19] - first_nonce + throughput;
+				if (secNonce != 0) {
+					pdata[21] = secNonce;
+					res++;
+				}
 				pdata[19] = foundNonce;
-				*hashes_done = foundNonce - first_nonce + 1;
-				//applog(LOG_INFO, "GPU #%d: result for nonce $%08X does validate on CPU (%d rounds)!", thr_id, foundNonce, rounds);
-				return 1;
-			} else {
+				return res;
+			}
+			else {
 				applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU (%d rounds)!", thr_id, foundNonce, rounds);
 			}
 		}
--- a/blake32.cu
+++ b/blake32.cu
@ -398,19 +398,6 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt

 	int rc = 0;

-#if NBN > 1
-	if (extra_results[0] != UINT32_MAX) {
-		// possible extra result found in previous call
-		if (first_nonce <= extra_results[0] && max_nonce >= extra_results[0]) {
-			pdata[19] = extra_results[0];
-			*hashes_done = pdata[19] - first_nonce + 1;
-			extra_results[0] = UINT32_MAX;
-			rc = 1;
-			goto exit_scan;
-		}
-	}
-#endif
-
 	if (opt_benchmark) {
 		targetHigh = 0x1ULL << 32;
 		((uint32_t*)ptarget)[6] = swab32(0xff);
@ -464,25 +451,23 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 			//applog(LOG_BLUE, "%08x %16llx", vhashcpu[6], targetHigh);
 			if (vhashcpu[6] <= Htarg && fulltest(vhashcpu, ptarget))
 			{
-				pdata[19] = foundNonce;
 				rc = 1;
-
+				*hashes_done = pdata[19] - first_nonce + throughput;
+				pdata[19] = foundNonce;
+#if NBN > 1
 				if (extra_results[0] != UINT32_MAX) {
-					// Rare but possible if the throughput is big
 					be32enc(&endiandata[19], extra_results[0]);
-
 					blake256hash(vhashcpu, endiandata, blakerounds);
 					if (vhashcpu[6] <= Htarg /* && fulltest(vhashcpu, ptarget) */) {
-						applog(LOG_NOTICE, "GPU found more than one result " CL_GRN "yippee!");
+						pdata[21] = extra_results[0];
 						rc = 2;
-					} else {
-						extra_results[0] = UINT32_MAX;
 					}
+					extra_results[0] = UINT32_MAX;
 				}
-
+#endif
 				//applog_hash((uint8_t*)ptarget);
 				//applog_compare_hash((uint8_t*)vhashcpu,(uint8_t*)ptarget);
-				goto exit_scan;
+				return rc;
 			}
 			else if (opt_debug) {
 				applog_hash((uchar*)ptarget);
@ -500,7 +485,6 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt

 	} while (!work_restart[thr_id].restart);

-exit_scan:
 	*hashes_done = pdata[19] - first_nonce + 1; // (+1 to prevent locks)
 	return rc;
 }
--- a/ccminer.cpp
+++ b/ccminer.cpp
@ -570,7 +570,6 @@ static bool submit_upstream_work(CURL *curl, struct work *work)

 		le32enc(&ntime, work->data[17]);
 		le32enc(&nonce, work->data[19]);
-		be16enc(&nvote, *((uint16_t*)&work->data[20]));

 		noncestr = bin2hex((const uchar*)(&nonce), 4);

@ -593,6 +592,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
 		xnonce2str = bin2hex(work->xnonce2, work->xnonce2_len);

 		if (opt_algo == ALGO_HEAVY) {
+			be16enc(&nvote, *((uint16_t*)&work->data[20]));
 			nvotestr = bin2hex((const uchar*)(&nvote), 2);
 			sprintf(s,
 				"{\"method\": \"mining.submit\", \"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\":4}",
@ -1027,7 +1027,7 @@ static void *miner_thread(void *userdata)
 			nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
 			pthread_mutex_lock(&g_work_lock);
 			extrajob |= work_done;
-			if ((*nonceptr) >= end_nonce || extrajob) {
+			if (nonceptr[0] >= end_nonce || extrajob) {
 				work_done = false;
 				extrajob = false;
 				stratum_gen_work(&stratum, &g_work);
@ -1036,8 +1036,7 @@ static void *miner_thread(void *userdata)
 			int min_scantime = scan_time;
 			/* obtain new work from internal workio thread */
 			pthread_mutex_lock(&g_work_lock);
-			if (time(NULL) - g_work_time >= min_scantime ||
-			     (*nonceptr) >= end_nonce) {
+			if (time(NULL) - g_work_time >= min_scantime || nonceptr[0] >= end_nonce) {
 				if (unlikely(!get_work(mythr, &g_work))) {
 					applog(LOG_ERR, "work retrieval failed, exiting "
 						"mining thread %d", mythr->id);
@ -1047,21 +1046,6 @@ static void *miner_thread(void *userdata)
 				g_work_time = time(NULL);
 			}
 		}
-#if 0
-		if (!opt_benchmark && g_work.job_id[0] == '\0') {
-			applog(LOG_ERR, "work data not read yet");
-			extrajob = true;
-			work_done = true;
-			sleep(1);
-			//continue;
-		}
-#endif
-		if (rc > 1) {
-			/* if we found more than one on last loop */
-			/* todo: handle an array to get them directly */
-			pthread_mutex_unlock(&g_work_lock);
-			goto continue_scan;
-		}

 		if (!opt_benchmark && memcmp(work.target, g_work.target, sizeof(work.target))) {
 			calc_diff(&g_work, 0);
@ -1071,7 +1055,7 @@ static void *miner_thread(void *userdata)
 			}
 			memcpy(work.target, g_work.target, sizeof(work.target));
 			work.difficulty = g_work.difficulty;
-			(*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr
+			nonceptr[0] = (UINT32_MAX / opt_n_threads) * thr_id; // 0 if single thr
 			/* on new target, ignoring nonce, clear sent data (hashlog) */
 			if (memcmp(work.target, g_work.target, sizeof(work.target))) {
 				hashlog_purge_job(work.job_id);
@ -1090,9 +1074,9 @@ static void *miner_thread(void *userdata)
 			}
 			#endif
 			memcpy(&work, &g_work, sizeof(struct work));
-			(*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr
+			nonceptr[0] = (UINT32_MAX / opt_n_threads) * thr_id; // 0 if single thr
 		} else
-			(*nonceptr)++; //??
+			nonceptr[0]++; //??

 		work_restart[thr_id].restart = 0;
 		pthread_mutex_unlock(&g_work_lock);
@ -1131,7 +1115,7 @@ static void *miner_thread(void *userdata)
 		// we can't scan more than uint capacity
 		max64 = min(UINT32_MAX, max64);

-		start_nonce = *nonceptr;
+		start_nonce = nonceptr[0];
 #if 0
 		/* do not recompute something already scanned (hashharder workaround) */
 		if (opt_algo == ALGO_BLAKE && opt_n_threads == 1) {
@ -1163,7 +1147,7 @@ static void *miner_thread(void *userdata)
 					stats_purge_old();
 					// wait a bit for a new job...
 					usleep(500*1000);
-					(*nonceptr) = end_nonce + 1;
+					nonceptr[0] = end_nonce + 1;
 					work_done = true;
 					continue;
 				}
@ -1182,14 +1166,13 @@ static void *miner_thread(void *userdata)
 		// todo: keep it rounded for gpu threads ?

 		work.scanned_from = start_nonce;
-		(*nonceptr) = start_nonce;
+		nonceptr[0] = start_nonce;

 		if (opt_debug)
 			applog(LOG_DEBUG, "GPU #%d: start=%08x end=%08x range=%08x",
 				device_map[thr_id], start_nonce, max_nonce, (max_nonce-start_nonce));

 		hashes_done = 0;
-continue_scan:
 		gettimeofday(&tv_start, NULL);

 		/* scan nonces for a proof-of-work hash */
@ -1326,7 +1309,9 @@ continue_scan:
 		gettimeofday(&tv_end, NULL);

 		if (rc && opt_debug)
-			applog(LOG_NOTICE, CL_CYN "found => %08x" CL_GRN " %08x", *nonceptr, swab32(*nonceptr));
+			applog(LOG_NOTICE, CL_CYN "found => %08x" CL_GRN " %08x", nonceptr[0], swab32(nonceptr[0])); // data[19]
+		if (rc > 1 && opt_debug)
+			applog(LOG_NOTICE, CL_CYN "found => %08x" CL_GRN " %08x", nonceptr[2], swab32(nonceptr[2])); // data[21]

 		timeval_subtract(&diff, &tv_end, &tv_start);

@ -1346,22 +1331,22 @@ continue_scan:
 			pthread_mutex_lock(&stats_lock);
 			if (diff.tv_sec + 1e-6 * diff.tv_usec > 0.0) {
 				thr_hashrates[thr_id] = hashes_done / (diff.tv_sec + 1e-6 * diff.tv_usec);
-				if (rc > 1)
-					thr_hashrates[thr_id] = (rc * hashes_done) / (diff.tv_sec + 1e-6 * diff.tv_usec);
 				thr_hashrates[thr_id] *= rate_factor;
 				stats_remember_speed(thr_id, hashes_done, thr_hashrates[thr_id], (uint8_t) rc, work.height);
 			}
 			pthread_mutex_unlock(&stats_lock);
 		}

-		if (rc)
-			work.scanned_to = *nonceptr;
+		if (rc > 1)
+			work.scanned_to = nonceptr[2];
+		else if (rc)
+			work.scanned_to = nonceptr[0];
 		else {
 			work.scanned_to = max_nonce;
 			if (opt_debug && opt_benchmark) {
 				// to debug nonce ranges
 				applog(LOG_DEBUG, "GPU #%d:  ends=%08x range=%llx", device_map[thr_id],
-					*nonceptr, ((*nonceptr) - start_nonce));
+					nonceptr[0], (nonceptr[0] - start_nonce));
 			}
 		}

@ -1393,6 +1378,13 @@ continue_scan:
 		if (rc && !opt_benchmark) {
 			if (!submit_work(mythr, &work))
 				break;
+			// second nonce found, submit too
+			if (rc > 1 && work.data[21]) {
+				work.data[19] = work.data[21];
+				work.data[21] = 0;
+				if (!submit_work(mythr, &work))
+					break;
+			}
 		}

 		loopcnt++;
--- a/cuda_helper.h
+++ b/cuda_helper.h
@ -23,8 +23,8 @@ extern uint32_t cuda_check_hash(int thr_id, int threads, uint32_t startNounce, u
 extern uint32_t cuda_check_hash_suppl(int thr_id, int threads, uint32_t startNounce, uint32_t *d_inputHash, uint8_t numNonce);
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
 extern void cudaReportHardwareFailure(int thr_id, cudaError_t error, const char* func);
-
 extern __device__ __device_builtin__ void __syncthreads(void);
+extern __device__ __device_builtin__ void __threadfence(void);

 #ifndef __CUDA_ARCH__
 // define blockDim and threadIdx for host
--- a/cuda_nist5.cu
+++ b/cuda_nist5.cu
@ -113,19 +113,25 @@ extern "C" int scanhash_nist5(int thr_id, uint32_t *pdata,
 		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

 		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
-		if  (foundNonce != 0xffffffff)
+		if (foundNonce != UINT32_MAX)
 		{
+			const uint32_t Htarg = ptarget[7];
 			uint32_t vhash64[8];
-			uint32_t Htarg = ptarget[7];
 			be32enc(&endiandata[19], foundNonce);
 			nist5hash(vhash64, endiandata);

-			if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {
-
+			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
+				int res = 1;
+				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				*hashes_done = pdata[19] - first_nonce + throughput;
+				if (secNonce != 0) {
+					pdata[21] = secNonce;
+					res++;
+				}
 				pdata[19] = foundNonce;
-				*hashes_done = foundNonce - first_nonce + 1;
-				return 1;
-			} else {
+				return res;
+			}
+			else {
 				applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce);
 			}
 		}
--- a/keccak/keccak256.cu
+++ b/keccak/keccak256.cu
@ -67,20 +67,25 @@ extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata,
 		int order = 0;

 		uint32_t foundNonce = keccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
-		if  (foundNonce != 0xffffffff)
+		if (foundNonce != UINT32_MAX)
 		{
-			uint32_t vhash64[8];
 			uint32_t Htarg = ptarget[7];
+			uint32_t vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
 			keccak256_hash(vhash64, endiandata);

 			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
-
+				int res = 1;
+				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				*hashes_done = pdata[19] - first_nonce + throughput;
+				if (secNonce != 0) {
+					pdata[21] = secNonce;
+					res++;
+				}
 				pdata[19] = foundNonce;
-				*hashes_done = foundNonce - first_nonce + 1;
-				return 1;
-
-			} else {
+				return res;
+			}
+			else {
 				applog(LOG_DEBUG, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce);
 			}
 		}
--- a/pentablake.cu
+++ b/pentablake.cu
@ -43,8 +43,6 @@ extern "C" void pentablakehash(void *output, const void *input)

 #include "cuda_helper.h"

-#define MAXU 0xffffffffU
-
 __constant__
 static uint32_t __align__(32) c_Target[8];

@ -54,7 +52,7 @@ static uint64_t __align__(32) c_data[32];
 static uint32_t *d_hash[8];
 static uint32_t *d_resNounce[8];
 static uint32_t *h_resNounce[8];
-static uint32_t extra_results[2] = { MAXU, MAXU };
+static uint32_t extra_results[2] = { UINT32_MAX, UINT32_MAX };

 /* prefer uint32_t to prevent size conversions = speed +5/10 % */
 __constant__
@ -387,7 +385,7 @@ __host__
 uint32_t pentablake_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce)
 {
 	const int threadsperblock = TPB;
-	uint32_t result = MAXU;
+	uint32_t result = UINT32_MAX;

 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
@ -446,7 +444,7 @@ __host__ static
 uint32_t pentablake_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash, int order)
 {
 	const int threadsperblock = TPB;
-	uint32_t result = MAXU;
+	uint32_t result = UINT32_MAX;

 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
@ -497,17 +495,6 @@ extern "C" int scanhash_pentablake(int thr_id, uint32_t *pdata, const uint32_t *
 	int throughput = opt_work_size ? opt_work_size : (128 * 2560); // 18.5
 	throughput = min(throughput, (int)(max_nonce - first_nonce));

-	if (extra_results[0] != MAXU) {
-		// possible extra result found in previous call
-		if (first_nonce <= extra_results[0] && max_nonce >= extra_results[0]) {
-			pdata[19] = extra_results[0];
-			*hashes_done = pdata[19] - first_nonce + 1;
-			extra_results[0] = MAXU;
-			rc = 1;
-			goto exit_scan;
-		}
-	}
-
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x000F;

@ -539,39 +526,30 @@ extern "C" int scanhash_pentablake(int thr_id, uint32_t *pdata, const uint32_t *
 		pentablake_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++);

 		uint32_t foundNonce = pentablake_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
-
-		if (foundNonce != MAXU)
+		if (foundNonce != UINT32_MAX)
 		{
+			const uint32_t Htarg = ptarget[7];
 			uint32_t vhashcpu[8];
-			uint32_t Htarg = ptarget[7];

 			be32enc(&endiandata[19], foundNonce);
-
 			pentablakehash(vhashcpu, endiandata);

-			if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget))
-			{
-				pdata[19] = foundNonce;
+			if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) {
 				rc = 1;
-
-				// Rare but possible if the throughput is big
-				be32enc(&endiandata[19], extra_results[0]);
-				pentablakehash(vhashcpu, endiandata);
-				if (vhashcpu[7] <= Htarg && fulltest(vhashcpu, ptarget)) {
+				*hashes_done = pdata[19] - first_nonce + throughput;
+				if (extra_results[0] != UINT32_MAX) {
+					// Rare but possible if the throughput is big
 					applog(LOG_NOTICE, "GPU found more than one result yippee!");
-					rc = 2;
-				} else {
-					extra_results[0] = MAXU;
+					pdata[21] = extra_results[0];
+					extra_results[0] = UINT32_MAX;
+					rc++;
 				}
-
-				goto exit_scan;
+				pdata[19] = foundNonce;
+				return rc;
 			}
 			else if (vhashcpu[7] > Htarg) {
 				applog(LOG_WARNING, "GPU #%d: result for nounce %08x is not in range: %x > %x", thr_id, foundNonce, vhashcpu[7], Htarg);
 			}
-			else if (vhashcpu[6] > ptarget[6]) {
-				applog(LOG_WARNING, "GPU #%d: hash[6] for nounce %08x is not in range: %x > %x", thr_id, foundNonce, vhashcpu[6], ptarget[6]);
-			}
 			else {
 				applog(LOG_WARNING, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce);
 			}
@ -581,17 +559,7 @@ extern "C" int scanhash_pentablake(int thr_id, uint32_t *pdata, const uint32_t *

 	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);

-exit_scan:
 	*hashes_done = pdata[19] - first_nonce + 1;
-#if 0
-	/* reset the device to allow multiple instances
-	 * could be made in cpu-miner... check later if required */
-	if (opt_n_threads == 1) {
-		CUDA_SAFE_CALL(cudaDeviceReset());
-		init[thr_id] = false;
-	}
-#endif
-
 	cudaDeviceSynchronize();
 	return rc;
 }
--- a/quark/animecoin.cu
+++ b/quark/animecoin.cu
@ -258,18 +258,23 @@ extern "C" int scanhash_anime(int thr_id, uint32_t *pdata,

 		// Scan nach Gewinner Hashes auf der GPU
 		uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
-		if  (foundNonce != 0xffffffff)
+		if (foundNonce != UINT32_MAX)
 		{
-			uint32_t vhash64[8];
 			const uint32_t Htarg = ptarget[7];
+			uint32_t vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
 			animehash(vhash64, endiandata);

-			if ((vhash64[7]<=Htarg) && fulltest(vhash64, ptarget)) {
-
+			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
+				int res = 1;
+				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				*hashes_done = pdata[19] - first_nonce + throughput;
+				if (secNonce != 0) {
+					pdata[21] = secNonce;
+					res++;
+				}
 				pdata[19] = foundNonce;
-				*hashes_done = foundNonce - first_nonce + 1;
-				return 1;
+				return res;
 			} else {
 				applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce);
 			}
--- a/qubit/deep.cu
+++ b/qubit/deep.cu
@ -87,7 +87,6 @@ extern "C" int scanhash_deep(int thr_id, uint32_t *pdata,
 	cuda_check_cpu_setTarget(ptarget);

 	do {
-		const uint32_t Htarg = ptarget[7];
 		int order = 0;

 		qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
@ -95,18 +94,25 @@ extern "C" int scanhash_deep(int thr_id, uint32_t *pdata,
 		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

 		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
-		if (foundNonce != 0xffffffff)
+		if (foundNonce != UINT32_MAX)
 		{
+			const uint32_t Htarg = ptarget[7];
 			uint32_t vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
 			deephash(vhash64, endiandata);

-			if (vhash64[7]<=Htarg && fulltest(vhash64, ptarget) )
-			{
+			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
+				int res = 1;
+				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				*hashes_done = pdata[19] - first_nonce + throughput;
+				if (secNonce != 0) {
+					pdata[21] = secNonce;
+					res++;
+				}
 				pdata[19] = foundNonce;
-				*hashes_done = foundNonce - first_nonce + 1;
-				return 1;
-			} else {
+				return res;
+			}
+			else {
 				applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce);
 			}
 		}
--- a/qubit/doom.cu
+++ b/qubit/doom.cu
@ -63,22 +63,23 @@ extern "C" int scanhash_doom(int thr_id, uint32_t *pdata,
 	qubit_luffa512_cpufinal_setBlock_80((void*)endiandata,ptarget);

 	do {
-		const uint32_t Htarg = ptarget[7];
 		int order = 0;

 		uint32_t foundNonce = qubit_luffa512_cpu_finalhash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
-		if (foundNonce != 0xffffffff)
+		if (foundNonce != UINT32_MAX)
 		{
+			const uint32_t Htarg = ptarget[7];
 			uint32_t vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
 			doomhash(vhash64, endiandata);

-			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget) )
-			{
+			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
+				int res = 1;
+				*hashes_done = pdata[19] - first_nonce + throughput;
 				pdata[19] = foundNonce;
-				*hashes_done = foundNonce - first_nonce + 1;
-				return 1;
-			} else {
+				return res;
+			}
+			else {
 				applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce);
 			}
 		}
--- a/qubit/qubit.cu
+++ b/qubit/qubit.cu
@ -110,7 +110,6 @@ extern "C" int scanhash_qubit(int thr_id, uint32_t *pdata,
 	cuda_check_cpu_setTarget(ptarget);

 	do {
-		const uint32_t Htarg = ptarget[7];
 		int order = 0;

 		// Hash with CUDA
@ -121,18 +120,25 @@ extern "C" int scanhash_qubit(int thr_id, uint32_t *pdata,
 		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

 		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
-		if (foundNonce != 0xffffffff)
+		if (foundNonce != UINT32_MAX)
 		{
+			const uint32_t Htarg = ptarget[7];
 			uint32_t vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
 			qubithash(vhash64, endiandata);

-			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget) )
-			{
+			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
+				int res = 1;
+				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				*hashes_done = pdata[19] - first_nonce + throughput;
+				if (secNonce != 0) {
+					pdata[21] = secNonce;
+					res++;
+				}
 				pdata[19] = foundNonce;
-				*hashes_done = foundNonce - first_nonce + 1;
-				return 1;
-			} else {
+				return res;
+			}
+			else {
 				applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce);
 			}
 		}
--- a/qubit/qubit_luffa512.cu
+++ b/qubit/qubit_luffa512.cu
@ -23,7 +23,9 @@

 #include "cuda_helper.h"

-#define MAXU 0xffffffffU
+#ifndef UINT32_MAX
+#define UINT32_MAX UINT_MAX
+#endif

 typedef unsigned char BitSequence;

@ -35,7 +37,7 @@ static uint32_t *d_resNounce[8];

 #define NBN 1 /* max results, could be 2, see blake32.cu */
 #if NBN > 1
-static uint32_t extra_results[2] = { MAXU, MAXU };
+static uint32_t extra_results[2] = { UINT32_MAX, UINT32_MAX };
 #endif

 typedef struct {
@ -454,7 +456,7 @@ void qubit_luffa512_cpu_init(int thr_id, int threads)
 __host__
 uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash,int order)
 {
-	uint32_t result = MAXU;
+	uint32_t result = UINT32_MAX;
 	cudaMemset(d_resNounce[thr_id], 0xff, NBN * sizeof(uint32_t));
 	const int threadsperblock = 256;

@ -517,4 +519,4 @@ void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget)

 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_Target, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
-}
+}
--- a/x11/fresh.cu
+++ b/x11/fresh.cu
@ -124,16 +124,22 @@ extern "C" int scanhash_fresh(int thr_id, uint32_t *pdata,
 #endif

 		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
-		if (foundNonce != 0xffffffff)
+		if (foundNonce != UINT32_MAX)
 		{
 			uint32_t vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
 			fresh_hash(vhash64, endiandata);

 			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
+				int res = 1;
+				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				*hashes_done = pdata[19] - first_nonce + throughput;
+				if (secNonce != 0) {
+					pdata[21] = secNonce;
+					res++;
+				}
 				pdata[19] = foundNonce;
-				*hashes_done = foundNonce - first_nonce + 1;
-				return 1;
+				return res;
 			}
 			else if (vhash64[7] > Htarg) {
 				applog(LOG_INFO, "GPU #%d: result for %08x is not in range: %x > %x", thr_id, foundNonce, vhash64[7], Htarg);
--- a/x11/s3.cu
+++ b/x11/s3.cu
@ -101,17 +101,22 @@ extern "C" int scanhash_s3(int thr_id, uint32_t *pdata,

 		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);

-		if (foundNonce != 0xffffffff)
+		if (foundNonce != UINT32_MAX)
 		{
 			uint32_t vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
 			s3hash(vhash64, endiandata);

 			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
-
+				int res = 1;
+				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				*hashes_done = pdata[19] - first_nonce + throughput;
+				if (secNonce != 0) {
+					pdata[21] = secNonce;
+					res++;
+				}
 				pdata[19] = foundNonce;
-				*hashes_done = foundNonce - first_nonce + 1;
-				return 1;
+				return res;

 			} else {
 				applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce);
--- a/x11/x11.cu
+++ b/x11/x11.cu
@ -171,8 +171,6 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 	cuda_check_cpu_setTarget(ptarget);

 	do {
-		const uint32_t Htarg = ptarget[7];
-
 		int order = 0;
 		uint32_t foundNonce;

@ -189,19 +187,24 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
 		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

 		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
-		if (foundNonce != 0xffffffff)
+		if (foundNonce != UINT32_MAX)
 		{
+			const uint32_t Htarg = ptarget[7];
 			uint32_t vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
 			x11hash(vhash64, endiandata);

-			/* uint32_t secNonce = */ cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
-
 			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
-				// just check if there was some other ones...
+				int res = 1;
+				// check if there was some other ones...
+				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
 				*hashes_done = pdata[19] - first_nonce + throughput;
+				if (secNonce != 0) {
+					pdata[21] = secNonce;
+					res++;
+				}
 				pdata[19] = foundNonce;
-				return 1;
+				return res;
 			}
 			else if (vhash64[7] > Htarg) {
 				applog(LOG_INFO, "GPU #%d: result for %08x is not in range: %x > %x", thr_id, foundNonce, vhash64[7], Htarg);
--- a/x13/x13.cu
+++ b/x13/x13.cu
@ -209,17 +209,23 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
 		x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

 		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
-		if  (foundNonce != 0xffffffff)
+		if (foundNonce != UINT32_MAX)
 		{
 			const uint32_t Htarg = ptarget[7];
 			uint32_t vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
 			x13hash(vhash64, endiandata);

-			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget) ) {
+			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
+				int res = 1;
+				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				*hashes_done = pdata[19] - first_nonce + throughput;
+				if (secNonce != 0) {
+					pdata[21] = secNonce;
+					res++;
+				}
 				pdata[19] = foundNonce;
-				*hashes_done = foundNonce - first_nonce + 1;
-				return 1;
+				return res;
 			}
 			else if (vhash64[7] > Htarg) {
 				applog(LOG_INFO, "GPU #%d: result for %08x is not in range: %x > %x", thr_id, foundNonce, vhash64[7], Htarg);
--- a/x15/whirlpool.cu
+++ b/x15/whirlpool.cu
@ -87,18 +87,23 @@ extern "C" int scanhash_whc(int thr_id, uint32_t *pdata,
 		x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

 		foundNonce = whirlpool512_cpu_finalhash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		if (foundNonce != 0xffffffff)
+		if (foundNonce != UINT32_MAX)
 		{
+			const uint32_t Htarg = ptarget[7];
 			uint32_t vhash64[8];
-			uint32_t Htarg = ptarget[7];
 			be32enc(&endiandata[19], foundNonce);
 			wcoinhash(vhash64, endiandata);

-			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget))
-			{
+			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
+				int res = 1;
+				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				*hashes_done = pdata[19] - first_nonce + throughput;
+				if (secNonce != 0) {
+					pdata[21] = secNonce;
+					res++;
+				}
 				pdata[19] = foundNonce;
-				*hashes_done = foundNonce - first_nonce + 1;
-				return 1;
+				return res;
 			}
 			else if (vhash64[7] > Htarg) {
 				applog(LOG_INFO, "GPU #%d: result for %08x is not in range: %x > %x", thr_id, foundNonce, vhash64[7], Htarg);
--- a/x15/x14.cu
+++ b/x15/x14.cu
@ -216,17 +216,24 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
 		x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

 		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
-		if (foundNonce != 0xffffffff)
+		if (foundNonce != UINT32_MAX)
 		{
-			/* check now with the CPU to confirm */
+			const uint32_t Htarg = ptarget[7];
 			uint32_t vhash64[8];
+			/* check now with the CPU to confirm */
 			be32enc(&endiandata[19], foundNonce);
 			x14hash(vhash64, endiandata);
-			uint32_t Htarg = ptarget[7];
+
 			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
+				int res = 1;
+				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				*hashes_done = pdata[19] - first_nonce + throughput;
+				if (secNonce != 0) {
+					pdata[21] = secNonce;
+					res++;
+				}
 				pdata[19] = foundNonce;
-				*hashes_done = foundNonce - first_nonce + 1;
-				return 1;
+				return res;
 			}
 			else if (vhash64[7] > Htarg) {
 				applog(LOG_INFO, "GPU #%d: result for %08x is not in range: %x > %x", thr_id, foundNonce, vhash64[7], Htarg);
--- a/x15/x15.cu
+++ b/x15/x15.cu
@ -228,21 +228,25 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 		x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

-		/* Scan with GPU */
 		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
-
-		if (foundNonce != 0xffffffff)
+		if (foundNonce != UINT32_MAX)
 		{
-			/* check now with the CPU to confirm */
+			const uint32_t Htarg = ptarget[7];
 			uint32_t vhash64[8];
-			uint32_t Htarg = ptarget[7];
+			/* check now with the CPU to confirm */
 			be32enc(&endiandata[19], foundNonce);
 			x15hash(vhash64, endiandata);
+
 			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
+				int res = 1;
+				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				*hashes_done = pdata[19] - first_nonce + throughput;
+				if (secNonce != 0) {
+					pdata[21] = secNonce;
+					res++;
+				}
 				pdata[19] = foundNonce;
-				*hashes_done = foundNonce - first_nonce + 1;
-				x15_whirlpool_cpu_free(thr_id);
-				return 1;
+				return res;
 			}
 			else if (vhash64[7] > Htarg) {
 				applog(LOG_INFO, "GPU #%d: result for %08x is not in range: %x > %x", thr_id, foundNonce, vhash64[7], Htarg);
--- a/x17/x17.cu
+++ b/x17/x17.cu
@ -254,18 +254,23 @@ extern "C" int scanhash_x17(int thr_id, uint32_t *pdata,
 		x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

 		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
-		if (foundNonce != 0xffffffff)
+		if (foundNonce != UINT32_MAX)
 		{
+			const uint32_t Htarg = ptarget[7];
 			uint32_t vhash64[8];
-			uint32_t Htarg = ptarget[7];
 			be32enc(&endiandata[19], foundNonce);
 			x17hash(vhash64, endiandata);

-			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget))
-			{
+			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
+				int res = 1;
+				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				*hashes_done = pdata[19] - first_nonce + throughput;
+				if (secNonce != 0) {
+					pdata[21] = secNonce;
+					res++;
+				}
 				pdata[19] = foundNonce;
-				*hashes_done = foundNonce - first_nonce + 1;
-				return 1;
+				return res;
 			}
 			else if (vhash64[7] > Htarg) {
 				applog(LOG_INFO, "GPU #%d: result for %08x is not in range: %x > %x", thr_id, foundNonce, vhash64[7], Htarg);