migrate 2nd nonce storage of most algos

This allows pdata[19] to be kept as a cursor between scans and, later, the found nonces to be sorted. Algos remaining to migrate: heavy, scrypt, sia...
2025-01-09 14:28:15 +00:00 · 2017-01-29 00:52:13 +01:00 · 2017-01-29 00:52:13 +01:00 · 0ff75791e5
commit 0ff75791e5
parent 5a77d36635
32 changed files with 673 additions and 604 deletions
--- a/Algo256/blake256.cu
+++ b/Algo256/blake256.cu
@ -45,7 +45,7 @@ static uint32_t *h_resNonce[MAX_GPUS];

 /* max count of found nonces in one call */
 #define NBN 2
-static uint32_t extra_results[NBN] = { UINT32_MAX };
+static __thread uint32_t extra_results[NBN] = { UINT32_MAX };

 #define GSPREC(a,b,c,d,x,y) { \
 	v[a] += (m[x] ^ c_u256[y]) + v[b]; \
@ -519,46 +519,51 @@ extern "C" int scanhash_blake256(int thr_id, struct work* work, uint32_t max_non

 	do {
 		// GPU HASH (second block only, first is midstate)
-		uint32_t foundNonce = blake256_cpu_hash_16(thr_id, throughput, pdata[19], targetHigh, blakerounds);
+		work->nonces[0] = blake256_cpu_hash_16(thr_id, throughput, pdata[19], targetHigh, blakerounds);

-		if (foundNonce != UINT32_MAX)
+		*hashes_done = pdata[19] - first_nonce + throughput;
+
+		if (work->nonces[0] != UINT32_MAX)
 		{
-			uint32_t vhashcpu[8];
-			uint32_t Htarg = ptarget[6];
+			uint32_t _ALIGN(64) vhashcpu[8];
+			const uint32_t Htarg = ptarget[6];

 			for (int k=16; k < 19; k++)
 				be32enc(&endiandata[k], pdata[k]);

-			be32enc(&endiandata[19], foundNonce);
+			be32enc(&endiandata[19], work->nonces[0]);
 			blake256hash(vhashcpu, endiandata, blakerounds);

 			if (vhashcpu[6] <= Htarg && fulltest(vhashcpu, ptarget))
 			{
-				rc = 1;
+				work->valid_nonces = 1;
 				work_set_target_ratio(work, vhashcpu);
-				*hashes_done = pdata[19] - first_nonce + throughput;
-				pdata[19] = foundNonce;
 #if NBN > 1
 				if (extra_results[0] != UINT32_MAX) {
-					be32enc(&endiandata[19], extra_results[0]);
+					work->nonces[1] = extra_results[0];
+					be32enc(&endiandata[19], work->nonces[1]);
 					blake256hash(vhashcpu, endiandata, blakerounds);
 					if (vhashcpu[6] <= Htarg && fulltest(vhashcpu, ptarget)) {
-						pdata[21] = extra_results[0];
 						if (bn_hash_target_ratio(vhashcpu, ptarget) > work->shareratio[0]) {
 							work_set_target_ratio(work, vhashcpu);
-							xchg(pdata[21], pdata[19]);
+							xchg(work->nonces[0], work->nonces[1]);
+						} else {
+							bn_set_target_ratio(work, vhashcpu, 1);
 						}
-						rc = 2;
+						work->valid_nonces = 2;
 					}
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
 					extra_results[0] = UINT32_MAX;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
 				}
 #endif
-				return rc;
+				return work->valid_nonces;
 			}
-			else if (opt_debug) {
-				applog_hash((uchar*)ptarget);
-				applog_compare_hash((uchar*)vhashcpu, (uchar*)ptarget);
-				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
+			else if (vhashcpu[6] > Htarg) {
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				pdata[19] = work->nonces[0] + 1;
+				continue;
 			}
 		}

--- a/Algo256/bmw.cu
+++ b/Algo256/bmw.cu
@ -43,7 +43,6 @@ extern "C" int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, u
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
-	bool swapnonce = true;
 	uint32_t throughput = cuda_default_throughput(thr_id, 1U << 21);
 	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);

@ -77,24 +76,37 @@ extern "C" int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, u
 	cuda_check_cpu_setTarget(ptarget);

 	do {
-		bmw256_cpu_hash_80(thr_id, (int) throughput, pdata[19], d_hash[thr_id], (int) swapnonce);
-		uint32_t foundNonce = cuda_check_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		bmw256_cpu_hash_80(thr_id, (int) throughput, pdata[19], d_hash[thr_id], 1);

 		*hashes_done = pdata[19] - first_nonce + throughput;

-		if (foundNonce != UINT32_MAX)
+		work->nonces[0] = cuda_check_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		if (work->nonces[0] != UINT32_MAX)
 		{
-			uint32_t _ALIGN(64) vhash64[8];
-			endiandata[19] = swab32_if(foundNonce, swapnonce);
-			bmw_hash(vhash64, endiandata);
+			const uint32_t Htarg = ptarget[7];
+			uint32_t _ALIGN(64) vhash[8];
+			be32enc(&endiandata[19], work->nonces[0]);
+			bmw_hash(vhash, endiandata);

-			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
-				pdata[19] = swab32_if(foundNonce,!swapnonce);
-				work_set_target_ratio(work, vhash64);
-				return 1;
+			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work_set_target_ratio(work, vhash);
+				work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				if (work->nonces[1] != 0) {
+					be32enc(&endiandata[19], work->nonces[1]);
+					bmw_hash(vhash, endiandata);
+					bn_set_target_ratio(work, vhash, 1);
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
+				}
+				return work->valid_nonces;
 			}
-			else {
-				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
+			else if (vhash[7] > Htarg) {
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				pdata[19] = work->nonces[0] + 1;
+				continue;
 			}
 		}

--- a/Algo256/decred.cu
+++ b/Algo256/decred.cu
@ -391,6 +391,8 @@ extern "C" int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce
 		// GPU HASH
 		decred_gpu_hash_nonce <<<grid, block>>> (throughput, (*pnonce), d_resNonce[thr_id], targetHigh);

+		*hashes_done = (*pnonce) - first_nonce + throughput;
+
 		// first cell contains the valid nonces count
 		cudaMemcpy(resNonces, d_resNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);

@ -404,9 +406,8 @@ extern "C" int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce
 			decred_hash(vhash, endiandata);
 			if (vhash[6] <= ptarget[6] && fulltest(vhash, ptarget))
 			{
-				int rc = work->valid_nonces = 1;
+				work->valid_nonces = 1;
 				work_set_target_ratio(work, vhash);
-				*hashes_done = (*pnonce) - first_nonce + throughput;
 				work->nonces[0] = swab32(resNonces[1]);
 				*pnonce = work->nonces[0];

@ -417,7 +418,6 @@ extern "C" int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce
 					decred_hash(vhash, endiandata);
 					if (vhash[6] <= ptarget[6] && fulltest(vhash, ptarget)) {
 						work->nonces[1] = swab32(resNonces[n]);
-
 						if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0]) {
 							// we really want the best first ? depends...
 							work->shareratio[1] = work->shareratio[0];
@ -429,7 +429,7 @@ extern "C" int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce
 							bn_set_target_ratio(work, vhash, 1);
 							work->valid_nonces++;
 						}
-						rc = 2; // MAX_NONCES submit limited to 2
+						work->valid_nonces = 2; // MAX_NONCES submit limited to 2

 						gpulog(LOG_DEBUG, thr_id, "multiple nonces 1:%08x (%g) %u:%08x (%g)",
 							work->nonces[0], work->sharediff[0], n, work->nonces[1], work->sharediff[1]);
@ -438,7 +438,7 @@ extern "C" int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce
 						gpulog(LOG_WARNING, thr_id, "result %u for %08x does not validate on CPU!", n, resNonces[n]);
 					}
 				}
-				return rc;
+				return work->valid_nonces;

 			} else if (vhash[6] > ptarget[6]) {
 				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", resNonces[1]);
--- a/Algo256/keccak256.cu
+++ b/Algo256/keccak256.cu
@ -65,7 +65,7 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no
 		init[thr_id] = true;
 	}

-	for (int k=0; k < 20; k++) {
+	for (int k=0; k < 19; k++) {
 		be32enc(&endiandata[k], pdata[k]);
 	}

@ -75,20 +75,25 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no

 		*hashes_done = pdata[19] - first_nonce + throughput;

-		uint32_t foundNonce = keccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
-		if (foundNonce != UINT32_MAX && bench_algo < 0)
+		work->nonces[0] = keccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		if (work->nonces[0] != UINT32_MAX && bench_algo < 0)
 		{
-			uint32_t _ALIGN(64) vhash64[8];
-			be32enc(&endiandata[19], foundNonce);
-			keccak256_hash(vhash64, endiandata);
+			const uint32_t Htarg = ptarget[7];
+			uint32_t _ALIGN(64) vhash[8];

-			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
-				work_set_target_ratio(work, vhash64);
-				pdata[19] = foundNonce;
-				return 1;
+			be32enc(&endiandata[19], work->nonces[0]);
+			keccak256_hash(vhash, endiandata);
+
+			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work_set_target_ratio(work, vhash);
+				pdata[19] = work->nonces[0] + 1;
+				return work->valid_nonces;
 			}
-			else {
-				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
+			else if (vhash[7] > Htarg) {
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				pdata[19] = work->nonces[0] + 1;
+				continue;
 			}
 		}

--- a/Algo256/vanilla.cu
+++ b/Algo256/vanilla.cu
@ -416,6 +416,7 @@ extern "C" int scanhash_vanilla(int thr_id, struct work* work, uint32_t max_nonc
 	do {
 		vanilla_gpu_hash_16_8<<<grid,block, 0, streams[thr_id]>>>(throughput, pdata[19], d_resNonce[thr_id], targetHigh);
 		cudaMemcpyAsync(h_resNonce[thr_id], d_resNonce[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost,streams[thr_id]);
+		*hashes_done = pdata[19] - first_nonce + throughput;
 		cudaStreamSynchronize(streams[thr_id]);

 		if (h_resNonce[thr_id][0] != UINT32_MAX){
@ -429,31 +430,41 @@ extern "C" int scanhash_vanilla(int thr_id, struct work* work, uint32_t max_nonc
 			vanillahash(vhashcpu, endiandata, blakerounds);

 			if (vhashcpu[6] <= Htarg && fulltest(vhashcpu, ptarget)) {
-				rc = 1;
+				work->valid_nonces = 1;
+				work->nonces[0] = h_resNonce[thr_id][0];
 				work_set_target_ratio(work, vhashcpu);
-				*hashes_done = pdata[19] - first_nonce + throughput;
-				pdata[19] = h_resNonce[thr_id][0];
 #if NBN > 1
 				if (h_resNonce[thr_id][1] != UINT32_MAX) {
+					work->nonces[1] = h_resNonce[thr_id][1];
 					be32enc(&endiandata[19], h_resNonce[thr_id][1]);
 					vanillahash(vhashcpu, endiandata, blakerounds);
-					pdata[21] = h_resNonce[thr_id][1];
 					if (bn_hash_target_ratio(vhashcpu, ptarget) > work->shareratio[0]) {
 						work_set_target_ratio(work, vhashcpu);
-						xchg(pdata[19], pdata[21]);
+						xchg(work->nonces[0], work->nonces[1]);
 					}
-					rc = 2;
+					work->valid_nonces = 2;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
 				}
 #endif
-				return rc;
+				return work->valid_nonces;
 			}
-			else {
+			else if (vhashcpu[6] > Htarg) {
 				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", h_resNonce[thr_id][0]);
+				pdata[19] = work->nonces[0] + 1;
+				continue;
 			}
 		}

+		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
+
 		pdata[19] += throughput;
-	} while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput)));
+
+	} while (!work_restart[thr_id].restart);

 	*hashes_done = pdata[19] - first_nonce;
 	MyStreamSynchronize(NULL, 0, dev_id);
--- a/JHA/jackpotcoin.cu
+++ b/JHA/jackpotcoin.cu
@ -214,34 +214,40 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc

 		CUDA_LOG_ERROR();

-		uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+		work->nonces[0] = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);

-		if (foundNonce != UINT32_MAX)
+		if (work->nonces[0] != UINT32_MAX)
 		{
-			uint32_t vhash64[8];
-			be32enc(&endiandata[19], foundNonce);
+			const uint32_t Htarg = ptarget[7];
+			uint32_t _ALIGN(64) vhash[8];
+			be32enc(&endiandata[19], work->nonces[0]);

 			// jackpothash function gibt die Zahl der Runden zurück
-			jackpothash(vhash64, endiandata);
+			jackpothash(vhash, endiandata);

-			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
-				int res = 1;
-				work_set_target_ratio(work, vhash64);
+			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work_set_target_ratio(work, vhash);
 #if 0
-				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
-				if (secNonce != 0) {
-					be32enc(&endiandata[19], secNonce);
-					nist5hash(vhash64, endiandata);
-					if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio[0])
-						work_set_target_ratio(work, vhash64);
-					pdata[21] = secNonce;
-					res++;
+				work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				if (work->nonces[1] != 0) {
+					be32enc(&endiandata[19], work->nonces[1]);
+					jackpothash(vhash, endiandata);
+					bn_set_target_ratio(work, vhash, 1);
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
 				}
+#else
+				pdata[19] = work->nonces[0] + 1; // cursor
 #endif
-				pdata[19] = foundNonce;
-				return res;
-			} else {
-				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > Htarg) {
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				pdata[19] = work->nonces[0] + 1;
+				continue;
 			}
 		}

--- a/ccminer.cpp
+++ b/ccminer.cpp
@ -2343,24 +2343,13 @@ static void *miner_thread(void *userdata)
 		/* record scanhash elapsed time */
 		gettimeofday(&tv_end, NULL);

-		// todo: update all algos to use work->nonces and pdata[19] as counter
 		switch (opt_algo) {
-			case ALGO_BLAKE2S:
-			case ALGO_CRYPTOLIGHT:
-			case ALGO_CRYPTONIGHT:
-			case ALGO_DECRED:
-			case ALGO_LBRY:
+			// algos to migrate to replace pdata[21] by work.nonces[]
+			case ALGO_HEAVY:
+			case ALGO_SCRYPT:
+			case ALGO_SCRYPT_JANE:
 			case ALGO_SIA:
-			case ALGO_VELTOR:
-			case ALGO_WILDKECCAK:
-				// migrated algos
-				break;
-			case ALGO_ZR5:
-				// algos with only work.nonces[1] set
-				work.nonces[0] = nonceptr[0];
-				break;
-			default:
-				// algos with 2 results in pdata and work.nonces unset
+			//case ALGO_WHIRLPOOLX:
 				work.nonces[0] = nonceptr[0];
 				work.nonces[1] = nonceptr[2];
 		}
@ -2483,7 +2472,6 @@ static void *miner_thread(void *userdata)
 				work.submit_nonce_id = 1;
 				nonceptr[0] = work.nonces[1];
 				if (opt_algo == ALGO_ZR5) {
-					// todo: use + 4..6 index for pok to allow multiple nonces
 					work.data[0] = work.data[22]; // pok
 					work.data[22] = 0;
 				}
--- a/lyra2/lyra2RE.cu
+++ b/lyra2/lyra2RE.cu
@ -136,7 +136,6 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,

 	do {
 		int order = 0;
-		uint32_t foundNonce;

 		blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
@ -146,35 +145,34 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,

 		*hashes_done = pdata[19] - first_nonce + throughput;

-		foundNonce = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
-		if (foundNonce != UINT32_MAX)
+		work->nonces[0] = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		if (work->nonces[0] != UINT32_MAX)
 		{
-			uint32_t _ALIGN(64) vhash64[8];
+			const uint32_t Htarg = ptarget[7];
+			uint32_t _ALIGN(64) vhash[8];

-			be32enc(&endiandata[19], foundNonce);
-			lyra2re_hash(vhash64, endiandata);
+			be32enc(&endiandata[19], work->nonces[0]);
+			lyra2re_hash(vhash, endiandata);

-			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
-				int res = 1;
-				uint32_t secNonce = groestl256_getSecNonce(thr_id, 1);
-				work_set_target_ratio(work, vhash64);
-				if (secNonce != UINT32_MAX)
-				{
-					be32enc(&endiandata[19], secNonce);
-					lyra2re_hash(vhash64, endiandata);
-					if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
-						if (opt_debug)
-							gpulog(LOG_BLUE, thr_id, "found second nonce %08x", secNonce);
-						if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio[0])
-							work_set_target_ratio(work, vhash64);
-						pdata[21] = secNonce;
-						res++;
-					}
+			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work_set_target_ratio(work, vhash);
+				work->nonces[1] = groestl256_getSecNonce(thr_id, 1);
+				if (work->nonces[1] != UINT32_MAX) {
+					be32enc(&endiandata[19], work->nonces[1]);
+					lyra2re_hash(vhash, endiandata);
+					bn_set_target_ratio(work, vhash, 1);
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
 				}
-				pdata[19] = foundNonce;
-				return res;
-			} else {
-				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > Htarg) {
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				pdata[19] = work->nonces[0] + 1;
+				continue;
 			}
 		}

--- a/lyra2/lyra2REv2.cu
+++ b/lyra2/lyra2REv2.cu
@ -142,7 +142,6 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc

 	do {
 		int order = 0;
-		uint32_t foundNonces[2] = { 0, 0 };

 		blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		TRACE("blake  :");
@ -157,37 +156,36 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
 		cubehash256_cpu_hash_32(thr_id, throughput,pdata[19], d_hash[thr_id], order++);
 		TRACE("cube   :");

-		bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonces);
+		memset(work->nonces, 0, sizeof(work->nonces));
+		bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], work->nonces);

 		*hashes_done = pdata[19] - first_nonce + throughput;

-		if (foundNonces[0] != 0)
+		if (work->nonces[0] != 0)
 		{
-			uint32_t vhash64[8];
-			be32enc(&endiandata[19], foundNonces[0]);
-			lyra2v2_hash(vhash64, endiandata);
-			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget))
-			{
-				int res = 1;
-				work_set_target_ratio(work, vhash64);
-				pdata[19] = foundNonces[0];
-				// check if there was another one...
-				if (foundNonces[1] != 0)
-				{
-					be32enc(&endiandata[19], foundNonces[1]);
-					lyra2v2_hash(vhash64, endiandata);
-					pdata[21] = foundNonces[1];
-					if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio[0]) {
-						work_set_target_ratio(work, vhash64);
-						xchg(pdata[19], pdata[21]);
-					}
-					res++;
+			const uint32_t Htarg = ptarget[7];
+			uint32_t _ALIGN(64) vhash[8];
+			be32enc(&endiandata[19], work->nonces[0]);
+			lyra2v2_hash(vhash, endiandata);
+
+			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work_set_target_ratio(work, vhash);
+				if (work->nonces[1] != 0) {
+					be32enc(&endiandata[19], work->nonces[1]);
+					lyra2v2_hash(vhash, endiandata);
+					bn_set_target_ratio(work, vhash, 1);
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
 				}
-				return res;
+				return work->valid_nonces;
 			}
-			else
-			{
-				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonces[0]);
+			else if (vhash[7] > Htarg) {
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				pdata[19] = work->nonces[0] + 1;
+				continue;
 			}
 		}

--- a/myriadgroestl.cpp
+++ b/myriadgroestl.cpp
@ -67,34 +67,36 @@ int scanhash_myriad(int thr_id, struct work *work, uint32_t max_nonce, unsigned
 	myriadgroestl_cpu_setBlock(thr_id, endiandata, ptarget);

 	do {
-		// GPU
-		uint32_t foundNonces[2] = { UINT32_MAX, UINT32_MAX };
+		memset(work->nonces, 0xff, sizeof(work->nonces));

-		myriadgroestl_cpu_hash(thr_id, throughput, pdata[19], foundNonces);
+		// GPU
+		myriadgroestl_cpu_hash(thr_id, throughput, pdata[19], work->nonces);

 		*hashes_done = pdata[19] - start_nonce + throughput;

-		if (foundNonces[0] < UINT32_MAX && bench_algo < 0)
+		if (work->nonces[0] < UINT32_MAX && bench_algo < 0)
 		{
 			uint32_t _ALIGN(64) vhash[8];
-			endiandata[19] = swab32(foundNonces[0]);
+			endiandata[19] = swab32(work->nonces[0]);
 			myriadhash(vhash, endiandata);
 			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
 				work_set_target_ratio(work, vhash);
-				work->nonces[0] = foundNonces[0];
-				pdata[19] = foundNonces[0];
-				// search for another nonce
-				if (foundNonces[1] != UINT32_MAX) {
-					endiandata[19] = swab32(foundNonces[1]);
+				if (work->nonces[1] != UINT32_MAX) {
+					endiandata[19] = swab32(work->nonces[1]);
 					myriadhash(vhash, endiandata);
-					pdata[21] = foundNonces[1]; // to drop
-					work->nonces[1] = foundNonces[1];
 					bn_set_target_ratio(work, vhash, 1);
-					return 2;
+					work->valid_nonces = 2;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
 				}
-				return 1;
-			} else if (vhash[7] > ptarget[7]) {
-				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonces[0]);
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > ptarget[7]) {
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				pdata[19] = work->nonces[0] + 1;
+				continue;
 			}
 		}

--- a/neoscrypt/neoscrypt.cpp
+++ b/neoscrypt/neoscrypt.cpp
@ -62,28 +62,31 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign
 	neoscrypt_setBlockTarget(endiandata,ptarget);

 	do {
-		uint32_t foundNonces[2] = { UINT32_MAX, UINT32_MAX };
-		neoscrypt_hash_k4(thr_id, throughput, pdata[19], foundNonces, have_stratum);
+		memset(work->nonces, 0xff, sizeof(work->nonces));
+		neoscrypt_hash_k4(thr_id, throughput, pdata[19], work->nonces, have_stratum);

 		*hashes_done = pdata[19] - first_nonce + throughput;

-		if (foundNonces[0] != UINT32_MAX)
+		if (work->nonces[0] != UINT32_MAX)
 		{
+			const uint32_t Htarg = ptarget[7];
 			uint32_t _ALIGN(64) vhash[8];

 			if (have_stratum) {
-				be32enc(&endiandata[19], foundNonces[0]);
+				be32enc(&endiandata[19], work->nonces[0]);
 			} else {
-				endiandata[19] = foundNonces[0];
+				endiandata[19] = work->nonces[0];
 			}
 			neoscrypt((uchar*)vhash, (uchar*) endiandata, 0x80000620U);

-			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
+			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
 				work_set_target_ratio(work, vhash);
-				pdata[19] = foundNonces[0];
-				return 1;
-			} else {
-				gpulog(LOG_WARNING, thr_id, "nonce %08x does not validate on CPU!", foundNonces[0]);
+				pdata[19] = work->nonces[0] + 1; // cursor
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > Htarg) {
+				gpulog(LOG_WARNING, thr_id, "nonce %08x does not validate on CPU!", work->nonces[0]);
 			}
 		}

--- a/pentablake.cu
+++ b/pentablake.cu
@ -98,21 +98,34 @@ extern "C" int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_n

 		*hashes_done = pdata[19] - first_nonce + throughput;

-		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
-		if (foundNonce != UINT32_MAX)
+		work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		if (work->nonces[0] != UINT32_MAX)
 		{
-			uint32_t vhash[8];
+			const uint32_t Htarg = ptarget[7];
+			uint32_t _ALIGN(64) vhash[8];

-			be32enc(&endiandata[19], foundNonce);
+			be32enc(&endiandata[19], work->nonces[0]);
 			pentablakehash(vhash, endiandata);

-			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
-				rc = 1;
+			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
 				work_set_target_ratio(work, vhash);
-				pdata[19] = foundNonce;
-				return rc;
-			} else {
-				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
+				work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				if (work->nonces[1] != 0) {
+					be32enc(&endiandata[19], work->nonces[1]);
+					pentablakehash(vhash, endiandata);
+					bn_set_target_ratio(work, vhash, 1);
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
+				}
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > Htarg) {
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				pdata[19] = work->nonces[0] + 1;
+				continue;
 			}
 		}

--- a/quark/nist5.cu
+++ b/quark/nist5.cu
@ -56,7 +56,6 @@ extern "C" int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce,
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
-	int res = 0;

 	uint32_t throughput =  cuda_default_throughput(thr_id, 1 << 20); // 256*256*16
 	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
@ -101,6 +100,8 @@ extern "C" int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce,
 	quark_blake512_cpu_setBlock_80(thr_id, endiandata);
 	cuda_check_cpu_setTarget(ptarget);

+	work->valid_nonces = 0;
+
 	do {
 		int order = 0;

@ -113,31 +114,33 @@ extern "C" int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce,

 		*hashes_done = pdata[19] - first_nonce + throughput;

-		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
-		if (foundNonce != UINT32_MAX)
+		work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		if (work->nonces[0] != UINT32_MAX)
 		{
 			const uint32_t Htarg = ptarget[7];
-			uint32_t vhash64[8];
-			be32enc(&endiandata[19], foundNonce);
-			nist5hash(vhash64, endiandata);
+			uint32_t _ALIGN(64) vhash[8];
+			be32enc(&endiandata[19], work->nonces[0]);
+			nist5hash(vhash, endiandata);

-			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
-				res = 1;
-				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
-				work_set_target_ratio(work, vhash64);
-				if (secNonce != 0) {
-					be32enc(&endiandata[19], secNonce);
-					nist5hash(vhash64, endiandata);
-					if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio[0])
-						work_set_target_ratio(work, vhash64);
-					pdata[21] = secNonce;
-					res++;
+			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work_set_target_ratio(work, vhash);
+				work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				if (work->nonces[1] != 0) {
+					be32enc(&endiandata[19], work->nonces[1]);
+					nist5hash(vhash, endiandata);
+					bn_set_target_ratio(work, vhash, 1);
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
 				}
-				pdata[19] = foundNonce;
 				goto out;
 			}
-			else {
-				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
+			else if (vhash[7] > Htarg) {
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				pdata[19] = work->nonces[0] + 1;
+				continue;
 			}
 		}

@ -157,7 +160,7 @@ out:
 		cudaStreamDestroy(stream[i]);
 #endif

-	return res;
+	return work->valid_nonces;
 }

 // ressources cleanup
--- a/quark/quarkcoin.cu
+++ b/quark/quarkcoin.cu
@ -180,7 +180,6 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,

 	do {
 		int order = 0;
-		uint32_t foundNonce;
 		uint32_t nrm1=0, nrm2=0, nrm3=0;

 		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
@ -229,8 +228,8 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
 			quark_keccak512_cpu_hash_64(thr_id, nrm1, pdata[19], d_branch1Nonces[thr_id], d_hash[thr_id], order++);
 			quark_jh512_cpu_hash_64(thr_id, nrm2, pdata[19], d_branch2Nonces[thr_id], d_hash[thr_id], order++);

-			foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
-
+			work->nonces[0] = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+			work->nonces[1] = 0;
 		} else {
 			/* algo permutations are made with 2 different buffers */

@ -263,31 +262,47 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
 			TRACE("perm3  :");

 			CUDA_LOG_ERROR();
-			foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+			work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+			work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
 		}

 		*hashes_done = pdata[19] - first_nonce + throughput;

-		if (foundNonce != UINT32_MAX)
+		if (work->nonces[0] != UINT32_MAX)
 		{
-			uint32_t vhash[8];
-			be32enc(&endiandata[19], foundNonce);
+			uint32_t _ALIGN(64) vhash[8];
+			be32enc(&endiandata[19], work->nonces[0]);
 			quarkhash(vhash, endiandata);

 			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
 				work_set_target_ratio(work, vhash);
-				pdata[19] = foundNonce;
-				return 1;
-			} else {
-				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
-				applog_hash((uchar*) vhash);
-				applog_hash((uchar*) ptarget);
+				if (work->nonces[1] != 0) {
+					be32enc(&endiandata[19], work->nonces[1]);
+					quarkhash(vhash, endiandata);
+					bn_set_target_ratio(work, vhash, 1);
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
+				}
+				return work->valid_nonces;
 			}
+			else if (vhash[7] > ptarget[7]) {
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				pdata[19] = work->nonces[0] + 1;
+				continue;
+			}
+		}
+
+		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
 		}

 		pdata[19] += throughput;

-	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+	} while (!work_restart[thr_id].restart);

 	return 0;
 }
--- a/qubit/deep.cu
+++ b/qubit/deep.cu
@ -96,30 +96,33 @@ extern "C" int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce,

 		*hashes_done = pdata[19] - first_nonce + throughput;

-		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
-		if (foundNonce != UINT32_MAX)
+		work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		if (work->nonces[0] != UINT32_MAX)
 		{
-			uint32_t _ALIGN(64) vhash64[8];
-			be32enc(&endiandata[19], foundNonce);
-			deephash(vhash64, endiandata);
+			const uint32_t Htarg = ptarget[7];
+			uint32_t _ALIGN(64) vhash[8];
+			be32enc(&endiandata[19], work->nonces[0]);
+			deephash(vhash, endiandata);

-			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
-				int res = 1;
-				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
-				work_set_target_ratio(work, vhash64);
-				if (secNonce != 0) {
-					be32enc(&endiandata[19], secNonce);
-					deephash(vhash64, endiandata);
-					if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio[0])
-						work_set_target_ratio(work, vhash64);
-					pdata[21] = secNonce;
-					res++;
+			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work_set_target_ratio(work, vhash);
+				work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				if (work->nonces[1] != 0) {
+					be32enc(&endiandata[19], work->nonces[1]);
+					deephash(vhash, endiandata);
+					bn_set_target_ratio(work, vhash, 1);
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
 				}
-				pdata[19] = foundNonce;
-				return res;
+				return work->valid_nonces;
 			}
-			else {
-				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
+			else if (vhash[7] > Htarg) {
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				pdata[19] = work->nonces[0] + 1;
+				continue;
 			}
 		}

@ -127,7 +130,6 @@ extern "C" int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce,
 			pdata[19] = max_nonce;
 			break;
 		}
-
 		pdata[19] += throughput;

 	} while (!work_restart[thr_id].restart);
--- a/qubit/luffa.cu
+++ b/qubit/luffa.cu
@ -72,27 +72,40 @@ extern "C" int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce,

 		*hashes_done = pdata[19] - first_nonce + throughput;

-		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
-		if (foundNonce != UINT32_MAX)
+		work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		if (work->nonces[0] != UINT32_MAX)
 		{
-			uint32_t _ALIGN(64) vhash64[8];
-			be32enc(&endiandata[19], foundNonce);
-			luffa_hash(vhash64, endiandata);
+			const uint32_t Htarg = ptarget[7];
+			uint32_t _ALIGN(64) vhash[8];
+			be32enc(&endiandata[19], work->nonces[0]);
+			luffa_hash(vhash, endiandata);

-			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
-				work_set_target_ratio(work, vhash64);
-				pdata[19] = foundNonce;
-				return 1;
-			} else {
-				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
+			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work_set_target_ratio(work, vhash);
+				work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				if (work->nonces[1] != 0) {
+					be32enc(&endiandata[19], work->nonces[1]);
+					luffa_hash(vhash, endiandata);
+					bn_set_target_ratio(work, vhash, 1);
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
+				}
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > Htarg) {
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				pdata[19] = work->nonces[0] + 1;
+				continue;
 			}
 		}

-		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
+		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
 			pdata[19] = max_nonce;
 			break;
 		}
-
 		pdata[19] += throughput;

 	} while (!work_restart[thr_id].restart);
--- a/qubit/qubit.cu
+++ b/qubit/qubit.cu
@ -112,31 +112,33 @@ extern "C" int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce,

 		*hashes_done = pdata[19] - first_nonce + throughput;

-		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
-		if (foundNonce != UINT32_MAX)
+		work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		if (work->nonces[0] != UINT32_MAX)
 		{
 			const uint32_t Htarg = ptarget[7];
-			uint32_t vhash64[8];
-			be32enc(&endiandata[19], foundNonce);
-			qubithash(vhash64, endiandata);
+			uint32_t _ALIGN(64) vhash[8];
+			be32enc(&endiandata[19], work->nonces[0]);
+			qubithash(vhash, endiandata);

-			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
-				int res = 1;
-				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
-				work_set_target_ratio(work, vhash64);
-				if (secNonce != 0) {
-					be32enc(&endiandata[19], secNonce);
-					qubithash(vhash64, endiandata);
-					if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio[0])
-						work_set_target_ratio(work, vhash64);
-					pdata[21] = secNonce;
-					res++;
+			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work_set_target_ratio(work, vhash);
+				work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				if (work->nonces[1] != 0) {
+					be32enc(&endiandata[19], work->nonces[1]);
+					qubithash(vhash, endiandata);
+					bn_set_target_ratio(work, vhash, 1);
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
 				}
-				pdata[19] = foundNonce;
-				return res;
+				return work->valid_nonces;
 			}
-			else {
-				applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", device_map[thr_id], foundNonce);
+			else if (vhash[7] > Htarg) {
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				pdata[19] = work->nonces[0] + 1;
+				continue;
 			}
 		}

@ -144,7 +146,6 @@ extern "C" int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce,
 			pdata[19] = max_nonce;
 			break;
 		}
-
 		pdata[19] += throughput;

 	} while (!work_restart[thr_id].restart);
--- a/skein.cu
+++ b/skein.cu
@ -342,10 +342,6 @@ extern "C" void skeincoinhash(void *output, const void *input)
 	memcpy(output, hash, 32);
 }

-static __inline uint32_t swab32_if(uint32_t val, bool iftrue) {
-	return iftrue ? swab32(val) : val;
-}
-
 static bool init[MAX_GPUS] = { 0 };

 extern "C" int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
@ -355,7 +351,6 @@ extern "C" int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_no
 	uint32_t *ptarget = work->target;

 	const uint32_t first_nonce = pdata[19];
-	const int swap = 1;

 	sm5 = (device_sm[device_map[thr_id]] >= 500);
 	bool checkSecnonce = (have_stratum || have_longpoll) && !sm5;
@ -363,7 +358,6 @@ extern "C" int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_no
 	uint32_t throughput = cuda_default_throughput(thr_id, 1U << 20);
 	if (init[thr_id]) throughput = min(throughput, (max_nonce - first_nonce));

-	uint32_t foundNonce, secNonce = 0;
 	uint64_t target64 = 0;

 	if (opt_benchmark)
@ -409,54 +403,45 @@ extern "C" int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_no

 		if (sm5) {
 			/* cuda_skeincoin.cu */
-			foundNonce = skeincoin_hash_sm5(thr_id, throughput, pdata[19], swap, target64, &secNonce);
+			work->nonces[0] = skeincoin_hash_sm5(thr_id, throughput, pdata[19], 1, target64, &work->nonces[1]);
 		} else {
 			/* quark/cuda_skein512.cu */
-			skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], swap);
+			skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
 			sha2_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]);
-			foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+			work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		}

-		if (foundNonce != UINT32_MAX)
+		if (work->nonces[0] != UINT32_MAX)
 		{
 			uint32_t _ALIGN(64) vhash[8];

-			endiandata[19] = swab32_if(foundNonce, swap);
+			endiandata[19] = swab32(work->nonces[0]);
 			skeincoinhash(vhash, endiandata);
-
 			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
-				int res = 1;
-				uint8_t num = res;
+				work->valid_nonces = 1;
 				work_set_target_ratio(work, vhash);
 				if (checkSecnonce) {
-					secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], num);
-				}
-				while (secNonce != 0 && res < 2) /* todo: up to 6 */
-				{
-					endiandata[19] = swab32_if(secNonce, swap);
-					skeincoinhash(vhash, endiandata);
-					if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
-						// todo: use 19 20 21... zr5 pok to adapt...
-						endiandata[19] = swab32_if(secNonce, swap);
+					work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], work->valid_nonces);
+					if (work->nonces[1] != 0) {
+						endiandata[19] = swab32(work->nonces[1]);
 						skeincoinhash(vhash, endiandata);
-						if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0])
-							work_set_target_ratio(work, vhash);
-						pdata[19+res*2] = swab32_if(secNonce, !swap);
-						res++;
+						if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
+							work->valid_nonces++;
+							bn_set_target_ratio(work, vhash, 1);
+						}
+						pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+					} else {
+						pdata[19] = work->nonces[0] + 1;
 					}
-					num++;
-					//if (checkSecnonce)
-					//	secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], num);
-					//else
-						break; // only one secNonce...
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor for next scan
 				}
-				if (res > 1 && opt_debug)
-					applog(LOG_BLUE, "GPU #%d: %d/%d valid nonces !!!", device_map[thr_id], res, (int)num);
-				pdata[19] = swab32_if(foundNonce, !swap);
-				return res;
+				return work->valid_nonces;
 			}
-			else {
-				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
+			 else if (vhash[7] > ptarget[7]) {
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				pdata[19] = work->nonces[0] + 1;
+				continue;
 			}
 		}

--- a/skein2.cpp
+++ b/skein2.cpp
@ -35,17 +35,12 @@ void skein2hash(void *output, const void *input)

 static bool init[MAX_GPUS] = { 0 };

-static __inline uint32_t swab32_if(uint32_t val, bool iftrue) {
-	return iftrue ? swab32(val) : val;
-}
-
 int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 {
 	int dev_id = device_map[thr_id];
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
-	const int swap = 1; // to toggle nonce endian

 	uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 256*256*8
 	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
@ -85,35 +80,39 @@ int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned
 		int order = 0;

 		// Hash with CUDA
-		skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], swap);
+		skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
 		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

 		*hashes_done = pdata[19] - first_nonce + throughput;

-		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
-		if (foundNonce != UINT32_MAX)
+		work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		if (work->nonces[0] != UINT32_MAX)
 		{
 			uint32_t _ALIGN(64) vhash[8];

-			endiandata[19] = swab32_if(foundNonce, swap);
+			endiandata[19] = swab32(work->nonces[0]);
 			skein2hash(vhash, endiandata);

 			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
-				int res = 1;
-				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				work->valid_nonces = 1;
+				work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
 				work_set_target_ratio(work, vhash);
-				if (secNonce != 0) {
-					endiandata[19] = swab32_if(secNonce, swap);
+				if (work->nonces[1] != 0) {
+					endiandata[19] = swab32(work->nonces[1]);
 					skein2hash(vhash, endiandata);
+					work->valid_nonces++;
 					bn_set_target_ratio(work, vhash, 1);
-					pdata[21] = work->nonces[1] = swab32_if(secNonce, !swap);
-					gpulog(LOG_DEBUG, thr_id, "found second nonce %08x!", swab32(secNonce));
-					res++;
+					gpulog(LOG_DEBUG, thr_id, "found second nonce %08x!", endiandata[19]);
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor for next scan
 				}
-				pdata[19] = work->nonces[0] = swab32_if(foundNonce, !swap);
-				return res;
-			} else {
-				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > ptarget[7]) {
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				pdata[19] = work->nonces[0] + 1;
+				continue;
 			}
 		}

--- a/x11/c11.cu
+++ b/x11/c11.cu
@ -155,7 +155,6 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u

 	do {
 		int order = 0;
-		uint32_t foundNonce;

 		// Hash with CUDA
 		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
@ -181,32 +180,32 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u

 		*hashes_done = pdata[19] - first_nonce + throughput;

-		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
-		if (foundNonce != UINT32_MAX)
+		work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		if (work->nonces[0] != UINT32_MAX)
 		{
 			const uint32_t Htarg = ptarget[7];
-			uint32_t vhash64[8];
-			be32enc(&endiandata[19], foundNonce);
-			c11hash(vhash64, endiandata);
+			uint32_t _ALIGN(64) vhash[8];
+			be32enc(&endiandata[19], work->nonces[0]);
+			c11hash(vhash, endiandata);

-			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
-				int res = 1;
-				// check if there was some other ones...
-				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
-				work_set_target_ratio(work, vhash64);
-				if (secNonce != 0) {
-					be32enc(&endiandata[19], secNonce);
-					c11hash(vhash64, endiandata);
-					if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio[0])
-						work_set_target_ratio(work, vhash64);
-					pdata[21] = secNonce;
-					res++;
+			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work_set_target_ratio(work, vhash);
+				work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				if (work->nonces[1] != 0) {
+					be32enc(&endiandata[19], work->nonces[1]);
+					c11hash(vhash, endiandata);
+					bn_set_target_ratio(work, vhash, 1);
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
 				}
-				pdata[19] = foundNonce;
-				return res;
-			} else {
-				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
-				pdata[19] = foundNonce + 1;
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > Htarg) {
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				pdata[19] = work->nonces[0] + 1;
 				continue;
 			}
 		}
--- a/x11/fresh.cu
+++ b/x11/fresh.cu
@ -108,7 +108,6 @@ extern "C" int scanhash_fresh(int thr_id, struct work* work, uint32_t max_nonce,
 	x11_shavite512_setBlock_80((void*)endiandata);
 	cuda_check_cpu_setTarget(ptarget);
 	do {
-		uint32_t foundNonce;
 		int order = 0;

 		// GPU Hash
@ -126,35 +125,41 @@ extern "C" int scanhash_fresh(int thr_id, struct work* work, uint32_t max_nonce,
 #endif
 		*hashes_done = pdata[19] - first_nonce + throughput;

-		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
-		if (foundNonce != UINT32_MAX)
+		work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		if (work->nonces[0] != UINT32_MAX)
 		{
-			uint32_t vhash64[8];
-			be32enc(&endiandata[19], foundNonce);
-			fresh_hash(vhash64, endiandata);
+			const uint32_t Htarg = ptarget[7];
+			uint32_t _ALIGN(64) vhash[8];
+			be32enc(&endiandata[19], work->nonces[0]);
+			fresh_hash(vhash, endiandata);

-			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
-				int res = 1;
-				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
-				work_set_target_ratio(work, vhash64);
-				if (secNonce != 0) {
-					be32enc(&endiandata[19], secNonce);
-					fresh_hash(vhash64, endiandata);
-					if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio[0])
-						work_set_target_ratio(work, vhash64);
-					pdata[21] = secNonce;
-					res++;
+			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				work_set_target_ratio(work, vhash);
+				if (work->nonces[1] != 0) {
+					be32enc(&endiandata[19], work->nonces[1]);
+					fresh_hash(vhash, endiandata);
+					bn_set_target_ratio(work, vhash, 1);
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
 				}
-				pdata[19] = foundNonce;
-				return res;
-			} else {
-				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > Htarg) {
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
 			}
 		}

+		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
 		pdata[19] += throughput;

-	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+	} while (!work_restart[thr_id].restart);

 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
--- a/x11/s3.cu
+++ b/x11/s3.cu
@ -107,8 +107,6 @@ extern "C" int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, un
 	cuda_check_cpu_setTarget(ptarget);

 	do {
-		const uint32_t Htarg = ptarget[7];
-		uint32_t foundNonce;
 		int order = 0;

 		x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
@ -120,31 +118,33 @@ extern "C" int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, un

 		*hashes_done = pdata[19] - first_nonce + throughput;

-		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
-
-		if (foundNonce != UINT32_MAX)
+		work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		if (work->nonces[0] != UINT32_MAX)
 		{
-			uint32_t vhash64[8];
-			be32enc(&endiandata[19], foundNonce);
-			s3hash(vhash64, endiandata);
+			const uint32_t Htarg = ptarget[7];
+			uint32_t _ALIGN(64) vhash[8];
+			be32enc(&endiandata[19], work->nonces[0]);
+			s3hash(vhash, endiandata);

-			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
-				int res = 1;
-				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
-				work_set_target_ratio(work, vhash64);
-				if (secNonce != 0) {
-					be32enc(&endiandata[19], secNonce);
-					s3hash(vhash64, endiandata);
-					if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio[0])
-						work_set_target_ratio(work, vhash64);
-					pdata[21] = secNonce;
-					res++;
+			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work_set_target_ratio(work, vhash);
+				work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				if (work->nonces[1] != 0) {
+					be32enc(&endiandata[19], work->nonces[1]);
+					s3hash(vhash, endiandata);
+					bn_set_target_ratio(work, vhash, 1);
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
 				}
-				pdata[19] = foundNonce;
-				return res;
-
-			} else {
-				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > Htarg) {
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				pdata[19] = work->nonces[0] + 1;
+				continue;
 			}
 		}

--- a/x11/sib.cu
+++ b/x11/sib.cu
@ -152,7 +152,6 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u

 	do {
 		int order = 0;
-		uint32_t foundNonce;

 		// Hash with CUDA
 		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
@ -178,32 +177,34 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
 		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		TRACE("echo => ");

-		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
-		if (foundNonce != UINT32_MAX)
+		work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		if (work->nonces[0] != UINT32_MAX)
 		{
 			const uint32_t Htarg = ptarget[7];
-			uint32_t vhash64[8];
-			be32enc(&endiandata[19], foundNonce);
-			sibhash(vhash64, endiandata);
+			uint32_t _ALIGN(64) vhash[8];
+			be32enc(&endiandata[19], work->nonces[0]);
+			sibhash(vhash, endiandata);

-			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
-				int res = 1;
-				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
-				work_set_target_ratio(work, vhash64);
+			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work_set_target_ratio(work, vhash);
+				work->nonces[1] =cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
 				*hashes_done = pdata[19] - first_nonce + throughput;
-				if (secNonce != 0) {
-					be32enc(&endiandata[19], secNonce);
-					sibhash(vhash64, endiandata);
-					if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio[0])
-						work_set_target_ratio(work, vhash64);
-					pdata[21] = secNonce;
-					res++;
+				if (work->nonces[1] != 0) {
+					be32enc(&endiandata[19], work->nonces[1]);
+					sibhash(vhash, endiandata);
+					bn_set_target_ratio(work, vhash, 1);
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
 				}
-				pdata[19] = foundNonce;
-				return res;
-			} else if (vhash64[7] > Htarg && !opt_quiet) {
-				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
-				pdata[19] = foundNonce + 1;
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > Htarg) {
+				if (!opt_quiet)
+					gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				pdata[19] = work->nonces[0] + 1;
 				continue;
 			}
 		}
--- a/x11/veltor.cu
+++ b/x11/veltor.cu
@ -120,31 +120,34 @@ extern "C" int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce
 			veltorhash(vhash, endiandata);
 			if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
 			{
-				int res = 1;
-				work_set_target_ratio(work, vhash);
 				work->nonces[0] = startNounce + h_resNonce[0];
+				work->valid_nonces = 1;
+				work_set_target_ratio(work, vhash);
 				if (h_resNonce[1] != UINT32_MAX)
 				{
 					uint32_t secNonce = work->nonces[1] = startNounce + h_resNonce[1];
-					gpulog(LOG_DEBUG, thr_id, "Found 2nd nonce: %08x", secNonce);
 					be32enc(&endiandata[19], secNonce);
 					veltorhash(vhash, endiandata);
 					work->nonces[1] = secNonce;
-
 					if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0]) {
 						work_set_target_ratio(work, vhash);
 						xchg(work->nonces[1], work->nonces[0]);
 					} else {
-						bn_set_target_ratio(work, vhash, res);
+						bn_set_target_ratio(work, vhash, work->valid_nonces);
 					}
-					res++;
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
 				}
-				pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; // next scan
-				return res;
+				return work->valid_nonces;
 			}
-			else if (vhash[7] > Htarg && !opt_quiet) {
-				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", h_resNonce[0]);
+			else if (vhash[7] > Htarg) { // GPU candidate rejected by the CPU re-hash
+				if (!opt_quiet)
+					gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", startNounce + h_resNonce[0]);
 				cudaMemset(d_resNonce[thr_id], 0xff, NBN*sizeof(uint32_t));
+				pdata[19] = startNounce + h_resNonce[0] + 1; // h_resNonce[] holds offsets from startNounce; the cursor must be absolute
+				continue;
 			}
 		}
 		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
--- a/x11/x11.cu
+++ b/x11/x11.cu
@ -144,7 +144,6 @@ extern "C" int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, u

 	do {
 		int order = 0;
-		uint32_t foundNonce;

 		// Hash with CUDA
 		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
@ -168,33 +167,33 @@ extern "C" int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, u
 		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		TRACE("echo => ");

-		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
-		if (foundNonce != UINT32_MAX)
+		*hashes_done = pdata[19] - first_nonce + throughput;
+
+		work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		if (work->nonces[0] != UINT32_MAX)
 		{
 			const uint32_t Htarg = ptarget[7];
-			uint32_t vhash64[8];
-			be32enc(&endiandata[19], foundNonce);
-			x11hash(vhash64, endiandata);
+			uint32_t _ALIGN(64) vhash[8];
+			be32enc(&endiandata[19], work->nonces[0]);
+			x11hash(vhash, endiandata);

-			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
-				int res = 1;
-				// check if there was some other ones...
-				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
-				work_set_target_ratio(work, vhash64);
-				*hashes_done = pdata[19] - first_nonce + throughput;
-				if (secNonce != 0) {
-					be32enc(&endiandata[19], secNonce);
-					x11hash(vhash64, endiandata);
-					if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio[0])
-						work_set_target_ratio(work, vhash64);
-					pdata[21] = secNonce;
-					res++;
+			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work_set_target_ratio(work, vhash);
+				work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				if (work->nonces[1] != 0) {
+					be32enc(&endiandata[19], work->nonces[1]);
+					x11hash(vhash, endiandata);
+					bn_set_target_ratio(work, vhash, 1);
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
 				}
-				pdata[19] = foundNonce;
-				return res;
+				return work->valid_nonces;
 			} else {
-				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
-				pdata[19] = foundNonce + 1;
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				pdata[19] = work->nonces[0] + 1;
 				continue;
 			}
 		}
--- a/x11/x11evo.cu
+++ b/x11/x11evo.cu
@ -290,7 +290,6 @@ extern "C" int scanhash_x11evo(int thr_id, struct work* work, uint32_t max_nonce

 	do {
 		int order = 1;
-		uint32_t foundNonce;

 		// Hash with CUDA
 		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]);
@ -344,38 +343,38 @@ extern "C" int scanhash_x11evo(int thr_id, struct work* work, uint32_t max_nonce
 				break;
 			case ECHO:
 				x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-				TRACE("echo => ");
+				TRACE("echo   :");
 				break;
 			}
 		}

-		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
-		if (foundNonce != UINT32_MAX)
-		{
-			uint32_t _ALIGN(64) vhash64[8];
-			const uint32_t Htarg = ptarget[7];
-			be32enc(&endiandata[19], foundNonce);
-			x11evo_hash(vhash64, endiandata);
+		*hashes_done = pdata[19] - first_nonce + throughput;

-			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
-				int res = 1;
-				// check if there was some other ones...
-				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
-				work_set_target_ratio(work, vhash64);
-				*hashes_done = pdata[19] - first_nonce + throughput;
-				if (secNonce != 0) {
-					be32enc(&endiandata[19], secNonce);
-					x11evo_hash(vhash64, endiandata);
-					if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio[0])
-						work_set_target_ratio(work, vhash64);
-					pdata[21] = secNonce;
-					res++;
+		work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		if (work->nonces[0] != UINT32_MAX)
+		{
+			const uint32_t Htarg = ptarget[7];
+			uint32_t _ALIGN(64) vhash[8];
+			be32enc(&endiandata[19], work->nonces[0]);
+			x11evo_hash(vhash, endiandata);
+
+			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work_set_target_ratio(work, vhash);
+				work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				pdata[19] = work->nonces[0] + 1; // cursor
+				if (work->nonces[1] != 0) {
+					be32enc(&endiandata[19], work->nonces[1]);
+					x11evo_hash(vhash, endiandata);
+					bn_set_target_ratio(work, vhash, 1);
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+					gpulog(LOG_DEBUG, thr_id, "second nonce %08x! cursor %08x", work->nonces[1], pdata[19]);
+					work->valid_nonces++;
 				}
-				pdata[19] = foundNonce;
-				return res;
-			} else if (vhash64[7] > Htarg) {
-				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
-				pdata[19] = foundNonce + 1;
+				return work->valid_nonces;
+			} else if (vhash[7] > Htarg) {
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				pdata[19] = work->nonces[0] + 1;
 				continue;
 			}
 		}
@ -388,7 +387,7 @@ extern "C" int scanhash_x11evo(int thr_id, struct work* work, uint32_t max_nonce

 	} while (!work_restart[thr_id].restart);

-	*hashes_done = pdata[19] - first_nonce;
+	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }

--- a/x13/x13.cu
+++ b/x13/x13.cu
@ -165,7 +165,6 @@ extern "C" int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, u
 	cuda_check_cpu_setTarget(ptarget);

 	do {
-		uint32_t foundNonce;
 		int order = 0;

 		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
@ -185,31 +184,33 @@ extern "C" int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, u

 		CUDA_LOG_ERROR();

-		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
-		if (foundNonce != UINT32_MAX)
+		work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		if (work->nonces[0] != UINT32_MAX)
 		{
-			uint32_t vhash[8];
-			be32enc(&endiandata[19], foundNonce);
+			const uint32_t Htarg = ptarget[7];
+			uint32_t _ALIGN(64) vhash[8];
+			be32enc(&endiandata[19], work->nonces[0]);
 			x13hash(vhash, endiandata);

 			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
-				int res = 1;
-				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				work->valid_nonces = 1;
+				work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
 				work_set_target_ratio(work, vhash);
-				pdata[19] = foundNonce;
-				if (secNonce != 0) {
-					be32enc(&endiandata[19], secNonce);
+				if (work->nonces[1] != 0) {
+					be32enc(&endiandata[19], work->nonces[1]);
 					x13hash(vhash, endiandata);
-					pdata[21] = secNonce;
-					if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0]) {
-						work_set_target_ratio(work, vhash);
-						xchg(pdata[19], pdata[21]);
-					}
-					res++;
+					bn_set_target_ratio(work, vhash, 1);
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
 				}
-				return res;
-			} else {
-				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > Htarg) {
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				pdata[19] = work->nonces[0] + 1;
+				continue;
 			}
 		}

--- a/x15/whirlpool.cu
+++ b/x15/whirlpool.cu
@ -101,7 +101,6 @@ extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce,
 	whirlpool512_setBlock_80((void*)endiandata, ptarget);

 	do {
-		uint32_t foundNonce;
 		int order = 0;

 		*hashes_done = pdata[19] - first_nonce + throughput;
@ -113,28 +112,24 @@ extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce,
 		x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		TRACE64(" 64 :", d_hash);

-		foundNonce = whirlpool512_cpu_finalhash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		if (foundNonce != UINT32_MAX && bench_algo < 0)
+		work->nonces[0] = whirlpool512_cpu_finalhash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		if (work->nonces[0] != UINT32_MAX && bench_algo < 0)
 		{
 			const uint32_t Htarg = ptarget[7];
-			uint32_t vhash[8];
-			be32enc(&endiandata[19], foundNonce);
+			uint32_t _ALIGN(64) vhash[8];
+			be32enc(&endiandata[19], work->nonces[0]);
 			wcoinhash(vhash, endiandata);

 			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
-				int res = 1;
+				work->valid_nonces = 1;
 				work_set_target_ratio(work, vhash);
-				#if 0
-				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
-				if (secNonce != 0) {
-					pdata[21] = secNonce;
-					res++;
-				}
-				#endif
-				pdata[19] = foundNonce;
-				return res;
-			} else {
-				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
+				pdata[19] = work->nonces[0] + 1; // cursor
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > Htarg) {
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				pdata[19] = work->nonces[0] + 1;
+				continue;
 			}
 		}
 		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
--- a/x15/x14.cu
+++ b/x15/x14.cu
@ -198,32 +198,35 @@ extern "C" int scanhash_x14(int thr_id,  struct work* work, uint32_t max_nonce,

 		*hashes_done = pdata[19] - first_nonce + throughput;

-		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);

-		if (foundNonce != UINT32_MAX)
+		if (work->nonces[0] != UINT32_MAX)
 		{
 			const uint32_t Htarg = ptarget[7];
-			uint32_t vhash64[8];
+			uint32_t _ALIGN(64) vhash[8];
 			/* check now with the CPU to confirm */
-			be32enc(&endiandata[19], foundNonce);
-			x14hash(vhash64, endiandata);
+			be32enc(&endiandata[19], work->nonces[0]);
+			x14hash(vhash, endiandata);

-			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
-				int res = 1;
-				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
-				work_set_target_ratio(work, vhash64);
-				if (secNonce != 0) {
-					be32enc(&endiandata[19], secNonce);
-					x14hash(vhash64, endiandata);
-					if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio[0])
-						work_set_target_ratio(work, vhash64);
-					pdata[21] = secNonce;
-					res++;
+			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				work_set_target_ratio(work, vhash);
+				if (work->nonces[1] != 0) {
+					be32enc(&endiandata[19], work->nonces[1]);
+					x14hash(vhash, endiandata);
+					bn_set_target_ratio(work, vhash, 1);
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
 				}
-				pdata[19] = foundNonce;
-				return res;
-			} else {
-				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > Htarg) {
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				pdata[19] = work->nonces[0] + 1;
+				continue;
 			}
 		}

--- a/x15/x15.cu
+++ b/x15/x15.cu
@ -205,31 +205,34 @@ extern "C" int scanhash_x15(int thr_id,  struct work* work, uint32_t max_nonce,

 		*hashes_done = pdata[19] - first_nonce + throughput;

-		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
-		if (foundNonce != UINT32_MAX)
+		work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		if (work->nonces[0] != UINT32_MAX)
 		{
 			const uint32_t Htarg = ptarget[7];
-			uint32_t vhash64[8];
+			uint32_t _ALIGN(64) vhash[8];
 			/* check now with the CPU to confirm */
-			be32enc(&endiandata[19], foundNonce);
-			x15hash(vhash64, endiandata);
+			be32enc(&endiandata[19], work->nonces[0]);
+			x15hash(vhash, endiandata);

-			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
-				int res = 1;
-				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
-				work_set_target_ratio(work, vhash64);
-				if (secNonce != 0) {
-					be32enc(&endiandata[19], secNonce);
-					x15hash(vhash64, endiandata);
-					if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio[0])
-						work_set_target_ratio(work, vhash64);
-					pdata[21] = secNonce;
-					res++;
+			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				work_set_target_ratio(work, vhash);
+				if (work->nonces[1] != 0) {
+					be32enc(&endiandata[19], work->nonces[1]);
+					x15hash(vhash, endiandata);
+					bn_set_target_ratio(work, vhash, 1);
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
 				}
-				pdata[19] = foundNonce;
-				return res;
-			} else {
-				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > Htarg) {
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				pdata[19] = work->nonces[0] + 1;
+				continue;
 			}
 		}

--- a/x17/x17.cu
+++ b/x17/x17.cu
@ -230,34 +230,37 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u

 		*hashes_done = pdata[19] - first_nonce + throughput;

-		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
-		if (foundNonce != UINT32_MAX)
+		work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		if (work->nonces[0] != UINT32_MAX)
 		{
 			const uint32_t Htarg = ptarget[7];
-			uint32_t _ALIGN(64) vhash64[8];
-			be32enc(&endiandata[19], foundNonce);
-			x17hash(vhash64, endiandata);
+			uint32_t _ALIGN(64) vhash[8];
+			be32enc(&endiandata[19], work->nonces[0]);
+			x17hash(vhash, endiandata);

-			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
-				int res = 1;
-				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
-				work_set_target_ratio(work, vhash64);
-				if (secNonce != 0) {
-					be32enc(&endiandata[19], secNonce);
-					x17hash(vhash64, endiandata);
-					if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio[0])
-						work_set_target_ratio(work, vhash64);
-					pdata[21] = secNonce;
-					res++;
+			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				work_set_target_ratio(work, vhash);
+				if (work->nonces[1] != 0) {
+					be32enc(&endiandata[19], work->nonces[1]);
+					x17hash(vhash, endiandata);
+					bn_set_target_ratio(work, vhash, 1);
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
 				}
-				pdata[19] = foundNonce;
-				return res;
-			} else {
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > Htarg) {
 				// x11+ coins could do some random error, but not on retry
 				if (!warn) {
-					warn++; continue;
+					warn++;
+					pdata[19] = work->nonces[0] + 1;
+					continue;
 				} else {
-					gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
+					gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
 					warn = 0;
 				}
 			}
--- a/zr5.cu
+++ b/zr5.cu
@ -431,12 +431,12 @@ extern "C" int scanhash_zr5(int thr_id, struct work *work,
 		if (work_restart[thr_id].restart)
 			return -1;

-		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
-		if (foundNonce != UINT32_MAX)
+		work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		if (work->nonces[0] != UINT32_MAX)
 		{
-			uint32_t vhash64[8];
+			uint32_t _ALIGN(64) vhash[8];
 			uint32_t oldp19 = pdata[19];
-			uint32_t offset = foundNonce - pdata[19];
+			uint32_t offset = work->nonces[0] - pdata[19];
 			uint32_t pok = 0;
 			uint16_t h_pok;

@ -444,32 +444,31 @@ extern "C" int scanhash_zr5(int thr_id, struct work *work,

 			cudaMemcpy(&h_pok, d_poks[thr_id] + offset, sizeof(uint16_t), cudaMemcpyDeviceToHost);
 			pok = version | (0x10000UL * h_pok);
-			pdata[0] = pok; pdata[19] = foundNonce;
-			zr5hash(vhash64, pdata);
-			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
-				int res = 1;
-				work_set_target_ratio(work, vhash64);
-				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, oldp19, d_hash[thr_id], 1);
-				if (secNonce != 0) {
-					offset = secNonce - oldp19;
+			pdata[0] = pok; pdata[19] = work->nonces[0];
+			zr5hash(vhash, pdata);
+			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work_set_target_ratio(work, vhash);
+				work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, oldp19, d_hash[thr_id], 1);
+				if (work->nonces[1] != 0) {
+					offset = work->nonces[1] - oldp19;
 					cudaMemcpy(&h_pok, d_poks[thr_id] + offset, sizeof(uint16_t), cudaMemcpyDeviceToHost);
 					pok = version | (0x10000UL * h_pok);
 					memcpy(tmpdata, pdata, 80);
-					tmpdata[0] = pok; tmpdata[19] = secNonce;
-					zr5hash(vhash64, tmpdata);
-					if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
-						if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio[0])
-							work_set_target_ratio(work, vhash64);
-						pdata[21] = secNonce;
-						pdata[22] = pok;
-						res++;
+					tmpdata[0] = pok; tmpdata[19] = work->nonces[1];
+					zr5hash(vhash, tmpdata);
+					if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
+						bn_set_target_ratio(work, vhash, 1);
+						pdata[19] = max(pdata[19], work->nonces[1]); // cursor
+						pdata[20] = pok; // second nonce "pok"
+						work->valid_nonces++;
 					}
+					pdata[19]++;
 				}
-				return res;
-			} else {
-				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
-
-				pdata[19]++;
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > ptarget[7]) {
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
 				pdata[0] = oldp0;
 			}
 		} else