From 61ff92b5b472308c30af1e7b83022eff804bf3af Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Sun, 1 Nov 2015 15:36:26 +0100
Subject: [PATCH] never interrupt global benchmark with found nonces

fix some algo weird hashrates (like blake)
and reset device between algos, for better accuracy

but this reset doesnt seems enough to bench all algos correctly...

to test on linux, could be a driver issue...

heavy: fix first alloc and indent with tabs...
---
 Algo256/blake256.cu     |   5 +-
 Algo256/bmw.cu          |   2 +-
 Algo256/keccak256.cu    |   8 +-
 JHA/jackpotcoin.cu      |   7 +-
 Makefile.am             |   2 +-
 bench.cpp               |  10 +-
 ccminer.cpp             |  15 +-
 cuda.cpp                |   4 +-
 cuda_checkhash.cu       |  11 +-
 cuda_nist5.cu           |   8 +-
 fuguecoin.cpp           |   9 +-
 groestlcoin.cpp         |  11 +-
 heavy/heavy.cu          | 575 ++++++++++++++++++++--------------------
 lyra2/lyra2RE.cu        |  11 +-
 lyra2/lyra2REv2.cu      |  11 +-
 myriadgroestl.cpp       |  13 +-
 neoscrypt/neoscrypt.cpp |   9 +-
 pentablake.cu           |   7 +-
 qubit/deep.cu           |   7 +-
 qubit/luffa.cu          |  25 +-
 skein.cu                |   6 +-
 skein2.cpp              |   5 +-
 x11/c11.cu              |  12 +-
 x11/fresh.cu            |  11 +-
 x11/s3.cu               |   9 +-
 x11/x11.cu              |  11 +-
 x13/x13.cu              |   4 +-
 x15/whirlpool.cu        |  11 +-
 x15/whirlpoolx.cu       |  20 +-
 x15/x14.cu              |  10 +-
 x15/x15.cu              |   9 +-
 x17/x17.cu              |   7 +-
 32 files changed, 485 insertions(+), 380 deletions(-)

diff --git a/Algo256/blake256.cu b/Algo256/blake256.cu
index 4a08410..09184ef 100644
--- a/Algo256/blake256.cu
+++ b/Algo256/blake256.cu
@@ -439,7 +439,7 @@ extern "C" int scanhash_blake256(int thr_id, struct work* work, uint32_t max_non
 #endif
 		*hashes_done = pdata[19] - first_nonce + throughput;
 
-		if (foundNonce != UINT32_MAX)
+		if (foundNonce != UINT32_MAX && bench_algo == -1)
 		{
 			uint32_t vhashcpu[8];
 			uint32_t Htarg = (uint32_t)targetHigh;
@@ -478,7 +478,8 @@ extern "C" int scanhash_blake256(int thr_id, struct work* work, uint32_t max_non
 			}
 		}
 
-		if ((uint64_t) pdata[19] + throughput > (uint64_t) max_nonce) {
+		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
 			break;
 		}
 
diff --git a/Algo256/bmw.cu b/Algo256/bmw.cu
index c578d98..fc05db2 100644
--- a/Algo256/bmw.cu
+++ b/Algo256/bmw.cu
@@ -91,7 +91,7 @@ extern "C" int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, u
 			}
 		}
 
-		if ((uint64_t) throughput + pdata[19] > max_nonce) {
+		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
 			pdata[19] = max_nonce;
 			break;
 		}
diff --git a/Algo256/keccak256.cu b/Algo256/keccak256.cu
index d2e3ec1..4d5f315 100644
--- a/Algo256/keccak256.cu
+++ b/Algo256/keccak256.cu
@@ -46,7 +46,7 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no
 	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
 
 	if (opt_benchmark)
-		ptarget[7] = 0x00ff;
+		ptarget[7] = 0x000f;
 
 	if (!init[thr_id]) {
 		cudaSetDevice(device_map[thr_id]);
@@ -68,7 +68,7 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no
 		*hashes_done = pdata[19] - first_nonce + throughput;
 
 		uint32_t foundNonce = keccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
-		if (foundNonce != UINT32_MAX)
+		if (foundNonce != UINT32_MAX && bench_algo < 0)
 		{
 			uint32_t _ALIGN(64) vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
@@ -84,7 +84,8 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no
 			}
 		}
 
-		if ((uint64_t) pdata[19] + throughput > max_nonce) {
+		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
 			break;
 		}
 
@@ -92,6 +93,7 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no
 
 	} while (!work_restart[thr_id].restart);
 
+	*hashes_done = pdata[19] - first_nonce;
 	return 0;
 }
 
diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu
index 64d7aab..495057f 100644
--- a/JHA/jackpotcoin.cu
+++ b/JHA/jackpotcoin.cu
@@ -238,16 +238,17 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc
 			}
 		}
 
-		if ((uint64_t) pdata[19] + throughput > max_nonce) {
-			*hashes_done = pdata[19] - first_nonce;
+		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
 			pdata[19] = max_nonce;
-			return 0;
+			break;
 		}
 
 		pdata[19] += throughput;
 
 	} while (!work_restart[thr_id].restart);
 
+	*hashes_done = pdata[19] - first_nonce;
+
 	CUDA_LOG_ERROR();
 
 	return 0;
diff --git a/Makefile.am b/Makefile.am
index e53fb46..d1af8f9 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -81,7 +81,7 @@ ccminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ @PCIFLAGS@ @OPENMP_CFLAGS@ $(CPPFLAGS) $(P
 
 nvcc_ARCH  = -gencode=arch=compute_50,code=\"sm_50,compute_50\"
 
-#nvcc_ARCH += -gencode=arch=compute_52,code=\"sm_52,compute_52\"
+nvcc_ARCH += -gencode=arch=compute_52,code=\"sm_52,compute_52\"
 #nvcc_ARCH += -gencode=arch=compute_35,code=\"sm_35,compute_35\"
 #nvcc_ARCH += -gencode=arch=compute_30,code=\"sm_30,compute_30\"
 #nvcc_ARCH += -gencode=arch=compute_20,code=\"sm_21,compute_20\"
diff --git a/bench.cpp b/bench.cpp
index 6e94e31..0a3fdf3 100644
--- a/bench.cpp
+++ b/bench.cpp
@@ -89,6 +89,9 @@ bool bench_algo_switch_next(int thr_id)
 	int prev_algo = algo;
 	int dev_id = device_map[thr_id % MAX_GPUS];
 	int mfree, mused;
+	// doesnt seems enough to prevent device slow down
+	// after some algo switchs
+	bool need_reset = (gpu_threads == 1);
 
 	algo++;
 
@@ -143,6 +146,7 @@ bool bench_algo_switch_next(int thr_id)
 		gpulog(LOG_WARNING, thr_id, "possible %d MB memory leak in %s! %d MB free",
 			(device_mem_free[thr_id] - mfree), algo_names[prev_algo], mfree);
 		cuda_reset_device(thr_id, NULL); // force to free the leak
+		need_reset = false;
 		mfree = cuda_available_memory(thr_id);
 	}
 	// store used memory per algo
@@ -152,14 +156,13 @@ bool bench_algo_switch_next(int thr_id)
 	// store to dump a table per gpu later
 	algo_hashrates[thr_id][prev_algo] = hashrate;
 
-
 	// wait the other threads to display logs correctly
 	if (opt_n_threads > 1) {
 		pthread_barrier_wait(&algo_barr);
 	}
 
 	if (algo == ALGO_AUTO)
-		return false;
+		return false; // all algos done
 
 	// mutex primary used for the stats purge
 	pthread_mutex_lock(&bench_lock);
@@ -170,6 +173,9 @@ bool bench_algo_switch_next(int thr_id)
 	thr_hashrates[thr_id] = 0; // reset for minmax64
 	pthread_mutex_unlock(&bench_lock);
 
+	if (need_reset)
+		cuda_reset_device(thr_id, NULL);
+
 	if (thr_id == 0)
 		applog(LOG_BLUE, "Benchmark algo %s...", algo_names[algo]);
 
diff --git a/ccminer.cpp b/ccminer.cpp
index 1ba68c1..b995d33 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -479,9 +479,11 @@ void proper_exit(int reason)
 		reason = app_exit_code;
 	}
 
+	pthread_mutex_lock(&stats_lock);
 	if (check_dups)
 		hashlog_purge_all();
 	stats_purge_all();
+	pthread_mutex_unlock(&stats_lock);
 
 #ifdef WIN32
 	timeEndPeriod(1); // else never executed
@@ -496,7 +498,7 @@ void proper_exit(int reason)
 #endif
 	free(opt_syslog_pfx);
 	free(opt_api_allow);
-	free(work_restart);
+	//free(work_restart);
 	//free(thr_info);
 	exit(reason);
 }
@@ -1709,18 +1711,22 @@ static void *miner_thread(void *userdata)
 		if (max64 < minmax) {
 			switch (opt_algo) {
 			case ALGO_BLAKECOIN:
-			case ALGO_BLAKE:
-			case ALGO_WHIRLPOOLX:
 				minmax = 0x80000000U;
 				break;
+			case ALGO_BLAKE:
 			case ALGO_BMW:
+			case ALGO_WHIRLPOOLX:
 				minmax = 0x40000000U;
 				break;
+			case ALGO_KECCAK:
 			case ALGO_LUFFA:
-				minmax = 0x2000000;
+			case ALGO_SKEIN:
+			case ALGO_SKEIN2:
+				minmax = 0x1000000;
 				break;
 			case ALGO_C11:
 			case ALGO_DEEP:
+			case ALGO_HEAVY:
 			case ALGO_LYRA2v2:
 			case ALGO_S3:
 			case ALGO_X11:
@@ -1729,7 +1735,6 @@ static void *miner_thread(void *userdata)
 			case ALGO_WHIRLPOOL:
 				minmax = 0x400000;
 				break;
-			case ALGO_KECCAK:
 			case ALGO_JACKPOT:
 			case ALGO_X14:
 			case ALGO_X15:
diff --git a/cuda.cpp b/cuda.cpp
index fb32cfa..a9ba63b 100644
--- a/cuda.cpp
+++ b/cuda.cpp
@@ -176,9 +176,11 @@ void cuda_reset_device(int thr_id, bool *init)
 	}
 	cudaDeviceReset();
 	if (opt_cudaschedule >= 0) {
-		cudaSetDevice(dev_id);
 		cudaSetDeviceFlags((unsigned)(opt_cudaschedule & cudaDeviceScheduleMask));
+	} else {
+		cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 	}
+	cudaDeviceSynchronize();
 }
 
 // return free memory in megabytes
diff --git a/cuda_checkhash.cu b/cuda_checkhash.cu
index 20d9e45..7fdbc41 100644
--- a/cuda_checkhash.cu
+++ b/cuda_checkhash.cu
@@ -121,6 +121,9 @@ uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uin
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 
+	if (bench_algo >= 0) // dont interrupt the global benchmark
+		return UINT32_MAX;
+
 	if (!init_done) {
 		applog(LOG_ERR, "missing call to cuda_check_cpu_init");
 		return UINT32_MAX;
@@ -143,6 +146,9 @@ uint32_t cuda_check_hash_32(int thr_id, uint32_t threads, uint32_t startNounce,
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 
+	if (bench_algo >= 0) // dont interrupt the global benchmark
+		return UINT32_MAX;
+
 	if (!init_done) {
 		applog(LOG_ERR, "missing call to cuda_check_cpu_init");
 		return UINT32_MAX;
@@ -237,9 +243,12 @@ uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNoun
 
 	uint32_t result = UINT32_MAX;
 
+	if (bench_algo >= 0) // dont interrupt the global benchmark
+		return result;
+
 	if (!init_done) {
 		applog(LOG_ERR, "missing call to cuda_check_cpu_init");
-		return UINT32_MAX;
+		return result;
 	}
 
 	cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t));
diff --git a/cuda_nist5.cu b/cuda_nist5.cu
index 744f844..bcc8b3a 100644
--- a/cuda_nist5.cu
+++ b/cuda_nist5.cu
@@ -135,11 +135,17 @@ extern "C" int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce,
 			}
 		}
 
+		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
+
 		pdata[19] += throughput;
 
-	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+	} while (!work_restart[thr_id].restart);
 
 out:
+	*hashes_done = pdata[19] - first_nonce;
 #ifdef USE_STREAMS
 	for (int i = 0; i < 5; i++)
 		cudaStreamDestroy(stream[i]);
diff --git a/fuguecoin.cpp b/fuguecoin.cpp
index d7681c8..7c703ce 100644
--- a/fuguecoin.cpp
+++ b/fuguecoin.cpp
@@ -65,7 +65,9 @@ int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigne
 		uint32_t foundNounce = UINT32_MAX;
 		fugue256_cpu_hash(thr_id, throughput, pdata[19], NULL, &foundNounce);
 
-		if (foundNounce < UINT32_MAX)
+		*hashes_done = pdata[19] - start_nonce + throughput;
+
+		if (foundNounce < UINT32_MAX && bench_algo < 0)
 		{
 			uint32_t vhash[8];
 			sph_fugue256_context ctx_fugue;
@@ -79,14 +81,13 @@ int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigne
 			{
 				work_set_target_ratio(work, vhash);
 				pdata[19] = foundNounce;
-				*hashes_done = foundNounce - start_nonce + 1;
 				return 1;
 			} else {
 				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNounce);
 			}
 		}
 
-		if ((uint64_t) pdata[19] + throughput > (uint64_t) max_nonce) {
+		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
 			pdata[19] = max_nonce;
 			break;
 		}
@@ -95,7 +96,7 @@ int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigne
 
 	} while (!work_restart[thr_id].restart);
 
-	*hashes_done = pdata[19] - start_nonce + 1;
+	*hashes_done = pdata[19] - start_nonce;
 	return 0;
 }
 
diff --git a/groestlcoin.cpp b/groestlcoin.cpp
index c3f803a..89f178a 100644
--- a/groestlcoin.cpp
+++ b/groestlcoin.cpp
@@ -39,7 +39,7 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi
 	uint32_t *outputHash = (uint32_t*)malloc((size_t) 64* throughput);
 
 	if (opt_benchmark)
-		ptarget[7] = 0x000ff;
+		ptarget[7] = 0x001f;
 
 	if (!init[thr_id])
 	{
@@ -62,7 +62,7 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi
 		// GPU hash
 		groestlcoin_cpu_hash(thr_id, throughput, pdata[19], outputHash, &foundNounce);
 
-		if (foundNounce < UINT32_MAX)
+		if (foundNounce < UINT32_MAX && bench_algo < 0)
 		{
 			uint32_t _ALIGN(64) vhash[8];
 			endiandata[19] = swab32(foundNounce);
@@ -78,14 +78,15 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi
 			}
 		}
 
-		if ((uint64_t) pdata[19] + throughput > max_nonce) {
-			*hashes_done = pdata[19] - start_nonce + 1;
+		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
 			pdata[19] = max_nonce;
 			break;
 		}
 		pdata[19] += throughput;
 
-	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+	} while (!work_restart[thr_id].restart);
+
+	*hashes_done = pdata[19] - start_nonce;
 
 	free(outputHash);
 	return 0;
diff --git a/heavy/heavy.cu b/heavy/heavy.cu
index 936006a..40fb0c9 100644
--- a/heavy/heavy.cu
+++ b/heavy/heavy.cu
@@ -43,99 +43,99 @@ extern uint32_t *heavy_heftyHashes[MAX_GPUS];
 /* Combines top 64-bits from each hash into a single hash */
 static void combine_hashes(uint32_t *out, const uint32_t *hash1, const uint32_t *hash2, const uint32_t *hash3, const uint32_t *hash4)
 {
-    const uint32_t *hash[4] = { hash1, hash2, hash3, hash4 };
-    int bits;
-    unsigned int i;
-    uint32_t mask;
-    unsigned int k;
-
-    /* Transpose first 64 bits of each hash into out */
-    memset(out, 0, 32);
-    bits = 0;
-    for (i = 7; i >= 6; i--) {
-        for (mask = 0x80000000; mask; mask >>= 1) {
-            for (k = 0; k < 4; k++) {
-                out[(255 - bits)/32] <<= 1;
-                if ((hash[k][i] & mask) != 0)
-                    out[(255 - bits)/32] |= 1;
-                bits++;
-            }
-        }
-    }
+	const uint32_t *hash[4] = { hash1, hash2, hash3, hash4 };
+	int bits;
+	unsigned int i;
+	uint32_t mask;
+	unsigned int k;
+
+	/* Transpose first 64 bits of each hash into out */
+	memset(out, 0, 32);
+	bits = 0;
+	for (i = 7; i >= 6; i--) {
+		for (mask = 0x80000000; mask; mask >>= 1) {
+			for (k = 0; k < 4; k++) {
+				out[(255 - bits)/32] <<= 1;
+				if ((hash[k][i] & mask) != 0)
+					out[(255 - bits)/32] |= 1;
+				bits++;
+			}
+		}
+	}
 }
 
 #ifdef _MSC_VER
 #include <intrin.h>
 static uint32_t __inline bitsset( uint32_t x )
 {
-    DWORD r = 0;
-    _BitScanReverse(&r, x);
-    return r;
+	DWORD r = 0;
+	_BitScanReverse(&r, x);
+	return r;
 }
 #else
 static uint32_t bitsset( uint32_t x )
 {
-    return 31-__builtin_clz(x);
+	return 31-__builtin_clz(x);
 }
 #endif
 
 // Finde das high bit in einem Multiword-Integer.
 static int findhighbit(const uint32_t *ptarget, int words)
 {
-    int i;
-    int highbit = 0;
-    for (i=words-1; i >= 0; --i)
-    {
-        if (ptarget[i] != 0) {
-            highbit = i*32 + bitsset(ptarget[i])+1;
-            break;
-        }
-    }
-    return highbit;
+	int i;
+	int highbit = 0;
+	for (i=words-1; i >= 0; --i)
+	{
+		if (ptarget[i] != 0) {
+			highbit = i*32 + bitsset(ptarget[i])+1;
+			break;
+		}
+	}
+	return highbit;
 }
 
 // Generiere ein Multiword-Integer das die Zahl
 // (2 << highbit) - 1 repräsentiert.
 static void genmask(uint32_t *ptarget, int words, int highbit)
 {
-    int i;
-    for (i=words-1; i >= 0; --i)
-    {
-        if ((i+1)*32 <= highbit)
-            ptarget[i] = UINT32_MAX;
-        else if (i*32 > highbit)
-            ptarget[i] = 0x00000000;
-        else
-            ptarget[i] = (1 << (highbit-i*32)) - 1;
-    }
+	int i;
+	for (i=words-1; i >= 0; --i)
+	{
+		if ((i+1)*32 <= highbit)
+			ptarget[i] = UINT32_MAX;
+		else if (i*32 > highbit)
+			ptarget[i] = 0x00000000;
+		else
+			ptarget[i] = (1 << (highbit-i*32)) - 1;
+	}
 }
 
 struct check_nonce_for_remove
 {
-    check_nonce_for_remove(uint64_t target, uint32_t *hashes, uint32_t hashlen, uint32_t startNonce) :
-        m_target(target),
-        m_hashes(hashes),
-        m_hashlen(hashlen),
-        m_startNonce(startNonce) { }
-
-    uint64_t  m_target;
-    uint32_t *m_hashes;
-    uint32_t  m_hashlen;
-    uint32_t  m_startNonce;
-
-    __device__
-    bool operator()(const uint32_t x)
-    {
-        // Position im Hash Buffer
-        uint32_t hashIndex = x - m_startNonce;
-        // Wert des Hashes (als uint64_t) auslesen.
-        // Steht im 6. und 7. Wort des Hashes (jeder dieser Hashes hat 512 Bits)
-        uint64_t hashValue = *((uint64_t*)(&m_hashes[m_hashlen*hashIndex + 6]));
-        bool res = (hashValue & m_target) != hashValue;
-        //printf("ndx=%x val=%08x target=%lx\n", hashIndex, hashValue, m_target);
-        // gegen das Target prüfen. Es dürfen nur Bits aus dem Target gesetzt sein.
-        return res;
-    }
+	check_nonce_for_remove(uint64_t target, uint32_t *hashes, uint32_t hashlen, uint32_t startNonce) :
+		m_target(target),
+		m_hashes(hashes),
+		m_hashlen(hashlen),
+		m_startNonce(startNonce) { }
+
+	uint64_t  m_target;
+	uint32_t *m_hashes;
+	uint32_t  m_hashlen;
+	uint32_t  m_startNonce;
+
+	__device__
+	bool operator()(const uint32_t x)
+	{
+		// Position im Hash Buffer
+		uint32_t hashIndex = x - m_startNonce;
+		// Wert des Hashes (als uint64_t) auslesen.
+		// Steht im 6. und 7. Wort des Hashes (jeder dieser Hashes hat 512 Bits)
+		uint64_t hashValue = *((uint64_t*)(&m_hashes[m_hashlen*hashIndex + 6]));
+		bool res = (hashValue & m_target) != hashValue;
+		//printf("ndx=%x val=%08x target=%lx\n", hashIndex, hashValue, m_target);
+		// gegen das Target prüfen. Es dürfen nur Bits aus dem Target gesetzt sein.
+		return res;
+	}
 };
 
 static bool init[MAX_GPUS] = { 0 };
@@ -143,245 +143,252 @@ static bool init[MAX_GPUS] = { 0 };
 __host__
 int scanhash_heavy(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done, uint32_t maxvote, int blocklen)
 {
-    uint32_t *pdata = work->data;
-    uint32_t *ptarget = work->target;
-    const uint32_t first_nonce = pdata[19];
-    // CUDA will process thousands of threads.
-    uint32_t throughput = cuda_default_throughput(thr_id, (1U << 19) - 256);
-    if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
-
-    int rc = 0;
-    uint32_t *hash = NULL;
-    uint32_t *cpu_nonceVector = NULL;
-    CUDA_SAFE_CALL(cudaMallocHost(&hash, throughput*8*sizeof(uint32_t)));
-    CUDA_SAFE_CALL(cudaMallocHost(&cpu_nonceVector, throughput*sizeof(uint32_t)));
-
-    int nrmCalls[6];
-    memset(nrmCalls, 0, sizeof(int) * 6);
-
-    if (opt_benchmark)
-        ((uint32_t*)ptarget)[7] = 0x00ff;
-
-    // für jeden Hash ein individuelles Target erstellen basierend
-    // auf dem höchsten Bit, das in ptarget gesetzt ist.
-    int highbit = findhighbit(ptarget, 8);
-    uint32_t target2[2], target3[2], target4[2], target5[2];
-    genmask(target2, 2, highbit/4+(((highbit%4)>3)?1:0) ); // SHA256
-    genmask(target3, 2, highbit/4+(((highbit%4)>2)?1:0) ); // keccak512
-    genmask(target4, 2, highbit/4+(((highbit%4)>1)?1:0) ); // groestl512
-    genmask(target5, 2, highbit/4+(((highbit%4)>0)?1:0) ); // blake512
-
-    if (!init[thr_id])
-    {
-        hefty_cpu_init(thr_id, throughput);
-        sha256_cpu_init(thr_id, throughput);
-        keccak512_cpu_init(thr_id, throughput);
-        groestl512_cpu_init(thr_id, throughput);
-        blake512_cpu_init(thr_id, throughput);
-        combine_cpu_init(thr_id, throughput);
-
-        CUDA_SAFE_CALL(cudaMalloc(&heavy_nonceVector[thr_id], sizeof(uint32_t) * throughput));
-
-        init[thr_id] = true;
-    }
-
-    if (blocklen == HEAVYCOIN_BLKHDR_SZ)
-    {
-        uint16_t *ext = (uint16_t *)&pdata[20];
-
-        if (opt_vote > maxvote && !opt_benchmark) {
-            applog(LOG_WARNING, "Your block reward vote (%hu) exceeds "
-                    "the maxvote reported by the pool (%hu).",
-                    opt_vote, maxvote);
-        }
-
-        if (opt_trust_pool && opt_vote > maxvote) {
-            applog(LOG_WARNING, "Capping block reward vote to maxvote reported by pool.");
-            ext[0] = maxvote;
-        }
-        else
-            ext[0] = opt_vote;
-    }
-
-    // Setze die Blockdaten
-    hefty_cpu_setBlock(thr_id, throughput, pdata, blocklen);
-    sha256_cpu_setBlock(pdata, blocklen);
-    keccak512_cpu_setBlock(pdata, blocklen);
-    groestl512_cpu_setBlock(pdata, blocklen);
-    blake512_cpu_setBlock(pdata, blocklen);
-
-    do {
-        uint32_t actualNumberOfValuesInNonceVectorGPU = throughput;
-
-        ////// Compaction init
-
-        hefty_cpu_hash(thr_id, throughput, pdata[19]);
-        sha256_cpu_hash(thr_id, throughput, pdata[19]);
-
-        // Hier ist die längste CPU Wartephase. Deshalb ein strategisches MyStreamSynchronize() hier.
-        MyStreamSynchronize(NULL, 1, thr_id);
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	const uint32_t first_nonce = pdata[19];
+	// CUDA will process thousands of threads.
+	uint32_t throughput = cuda_default_throughput(thr_id, (1U << 19) - 256);
+	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
+
+	int rc = 0;
+	uint32_t *hash = NULL;
+	uint32_t *cpu_nonceVector = NULL;
+
+	int nrmCalls[6];
+	memset(nrmCalls, 0, sizeof(int) * 6);
+
+	if (opt_benchmark)
+	   ptarget[7] = 0x000f;
+
+	// für jeden Hash ein individuelles Target erstellen basierend
+	// auf dem höchsten Bit, das in ptarget gesetzt ist.
+	int highbit = findhighbit(ptarget, 8);
+	uint32_t target2[2], target3[2], target4[2], target5[2];
+	genmask(target2, 2, highbit/4+(((highbit%4)>3)?1:0) ); // SHA256
+	genmask(target3, 2, highbit/4+(((highbit%4)>2)?1:0) ); // keccak512
+	genmask(target4, 2, highbit/4+(((highbit%4)>1)?1:0) ); // groestl512
+	genmask(target5, 2, highbit/4+(((highbit%4)>0)?1:0) ); // blake512
+
+	if (!init[thr_id])
+	{
+		cudaSetDevice(device_map[thr_id]);
+
+		hefty_cpu_init(thr_id, throughput);
+		sha256_cpu_init(thr_id, throughput);
+		keccak512_cpu_init(thr_id, throughput);
+		groestl512_cpu_init(thr_id, throughput);
+		blake512_cpu_init(thr_id, throughput);
+		combine_cpu_init(thr_id, throughput);
+
+		CUDA_SAFE_CALL(cudaMalloc(&heavy_nonceVector[thr_id], sizeof(uint32_t) * throughput));
+
+		init[thr_id] = true;
+	}
+
+	// weird but require at least one cudaSetDevice first
+	CUDA_SAFE_CALL(cudaMallocHost(&hash, (size_t) 32 * throughput));
+	CUDA_SAFE_CALL(cudaMallocHost(&cpu_nonceVector, sizeof(uint32_t) * throughput));
+
+	if (blocklen == HEAVYCOIN_BLKHDR_SZ)
+	{
+		uint16_t *ext = (uint16_t*) &pdata[20];
+
+		if (opt_vote > maxvote && !opt_benchmark) {
+			applog(LOG_WARNING, "Your block reward vote (%hu) exceeds the maxvote reported by the pool (%hu).",
+					opt_vote, maxvote);
+		}
+
+		if (opt_trust_pool && opt_vote > maxvote) {
+			applog(LOG_WARNING, "Capping block reward vote to maxvote reported by pool.");
+			ext[0] = maxvote;
+		}
+		else
+			ext[0] = opt_vote;
+	}
+
+	// Setze die Blockdaten
+	hefty_cpu_setBlock(thr_id, throughput, pdata, blocklen);
+	sha256_cpu_setBlock(pdata, blocklen);
+	keccak512_cpu_setBlock(pdata, blocklen);
+	groestl512_cpu_setBlock(pdata, blocklen);
+	blake512_cpu_setBlock(pdata, blocklen);
+
+	do {
+		uint32_t actualNumberOfValuesInNonceVectorGPU = throughput;
+
+		////// Compaction init
+
+		hefty_cpu_hash(thr_id, throughput, pdata[19]);
+		sha256_cpu_hash(thr_id, throughput, pdata[19]);
+
+		// Hier ist die längste CPU Wartephase. Deshalb ein strategisches MyStreamSynchronize() hier.
+		MyStreamSynchronize(NULL, 1, thr_id);
 
 #if USE_THRUST
-        thrust::device_ptr<uint32_t> devNoncePtr(heavy_nonceVector[thr_id]);
-        thrust::device_ptr<uint32_t> devNoncePtrEnd((heavy_nonceVector[thr_id]) + throughput);
-
-        ////// Compaction
-        uint64_t *t = (uint64_t*) target2;
-        devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash2output[thr_id], 8, pdata[19]));
-        actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
-        if(actualNumberOfValuesInNonceVectorGPU == 0)
-            goto emptyNonceVector;
-
-        keccak512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]);
-
-        ////// Compaction
-        t = (uint64_t*) target3;
-        devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash3output[thr_id], 16, pdata[19]));
-        actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
-        if(actualNumberOfValuesInNonceVectorGPU == 0)
-            goto emptyNonceVector;
-
-        blake512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]);
-
-        ////// Compaction
-        t = (uint64_t*) target5;
-        devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash5output[thr_id], 16, pdata[19]));
-        actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
-        if(actualNumberOfValuesInNonceVectorGPU == 0)
-            goto emptyNonceVector;
-
-        groestl512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]);
-
-        ////// Compaction
-        t = (uint64_t*) target4;
-        devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash4output[thr_id], 16, pdata[19]));
-        actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
+		thrust::device_ptr<uint32_t> devNoncePtr(heavy_nonceVector[thr_id]);
+		thrust::device_ptr<uint32_t> devNoncePtrEnd((heavy_nonceVector[thr_id]) + throughput);
+
+		////// Compaction
+		uint64_t *t = (uint64_t*) target2;
+		devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash2output[thr_id], 8, pdata[19]));
+		actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
+		if(actualNumberOfValuesInNonceVectorGPU == 0)
+			goto emptyNonceVector;
+
+		keccak512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]);
+
+		////// Compaction
+		t = (uint64_t*) target3;
+		devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash3output[thr_id], 16, pdata[19]));
+		actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
+		if(actualNumberOfValuesInNonceVectorGPU == 0)
+			goto emptyNonceVector;
+
+		blake512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]);
+
+		////// Compaction
+		t = (uint64_t*) target5;
+		devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash5output[thr_id], 16, pdata[19]));
+		actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
+		if(actualNumberOfValuesInNonceVectorGPU == 0)
+			goto emptyNonceVector;
+
+		groestl512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]);
+
+		////// Compaction
+		t = (uint64_t*) target4;
+		devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash4output[thr_id], 16, pdata[19]));
+		actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr);
 #else
-        // todo
-        actualNumberOfValuesInNonceVectorGPU = 0;
+		// todo (nvlabs cub ?)
+		actualNumberOfValuesInNonceVectorGPU = 0;
 #endif
-        if(actualNumberOfValuesInNonceVectorGPU == 0)
-            goto emptyNonceVector;
-
-        // combine
-        combine_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19], hash);
-
-        if (opt_tracegpu) {
-            applog(LOG_BLUE, "heavy GPU hash:");
-            applog_hash((uchar*)hash);
-        }
-
-        // Ergebnisse kopieren
-        if(actualNumberOfValuesInNonceVectorGPU > 0)
-        {
-            size_t size = sizeof(uint32_t) * actualNumberOfValuesInNonceVectorGPU;
-            CUDA_SAFE_CALL(cudaMemcpy(cpu_nonceVector, heavy_nonceVector[thr_id], size, cudaMemcpyDeviceToHost));
-            cudaThreadSynchronize();
-
-            for (uint32_t i=0; i < actualNumberOfValuesInNonceVectorGPU; i++)
-            {
-                uint32_t nonce = cpu_nonceVector[i];
-                uint32_t *foundhash = &hash[8*i];
-                if (foundhash[7] <= ptarget[7] && fulltest(foundhash, ptarget)) {
-                    uint32_t vhash[8];
-                    pdata[19] += nonce - pdata[19];
-                    heavycoin_hash((uchar*)vhash, (uchar*)pdata, blocklen);
-                    if (memcmp(vhash, foundhash, 32)) {
-                        gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", nonce);
-                    } else {
-                        *hashes_done = pdata[19] - first_nonce;
-                        work_set_target_ratio(work, vhash);
-                        rc = 1;
-                        goto exit;
-                    }
-                }
-            }
-        }
+		if(actualNumberOfValuesInNonceVectorGPU == 0)
+			goto emptyNonceVector;
+
+		// combine
+		combine_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19], hash);
+
+		if (opt_tracegpu) {
+			applog(LOG_BLUE, "heavy GPU hash:");
+			applog_hash((uchar*)hash);
+		}
+
+		// Ergebnisse kopieren
+		if(actualNumberOfValuesInNonceVectorGPU > 0)
+		{
+			size_t size = sizeof(uint32_t) * actualNumberOfValuesInNonceVectorGPU;
+			cudaMemcpy(cpu_nonceVector, heavy_nonceVector[thr_id], size, cudaMemcpyDeviceToHost);
+
+			for (uint32_t i=0; i < actualNumberOfValuesInNonceVectorGPU; i++)
+			{
+				uint32_t nonce = cpu_nonceVector[i];
+				uint32_t *foundhash = &hash[8*i];
+				if (foundhash[7] <= ptarget[7] && fulltest(foundhash, ptarget)) {
+					uint32_t vhash[8];
+					pdata[19] += nonce - pdata[19];
+					heavycoin_hash((uchar*)vhash, (uchar*)pdata, blocklen);
+					if (memcmp(vhash, foundhash, 32)) {
+						gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", nonce);
+					} else {
+						work_set_target_ratio(work, vhash);
+						rc = 1;
+						goto exit;
+					}
+				}
+			}
+		}
 
 emptyNonceVector:
+		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
+		pdata[19] += throughput;
 
-        pdata[19] += throughput;
-
-    } while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
-    *hashes_done = pdata[19] - first_nonce;
+	} while (!work_restart[thr_id].restart);
 
 exit:
-    cudaFreeHost(cpu_nonceVector);
-    cudaFreeHost(hash);
-    return rc;
+	*hashes_done = pdata[19] - first_nonce;
+
+	cudaFreeHost(cpu_nonceVector);
+	cudaFreeHost(hash);
+	CUDA_LOG_ERROR();
+
+	return rc;
 }
 
 // cleanup
 extern "C" void free_heavy(int thr_id)
 {
-    if (!init[thr_id])
-        return;
+	if (!init[thr_id])
+		return;
 
-    cudaThreadSynchronize();
+	cudaThreadSynchronize();
 
-    cudaFree(heavy_nonceVector[thr_id]);
+	cudaFree(heavy_nonceVector[thr_id]);
 
-    blake512_cpu_free(thr_id);
-    groestl512_cpu_free(thr_id);
-    hefty_cpu_free(thr_id);
-    keccak512_cpu_free(thr_id);
-    sha256_cpu_free(thr_id);
-    combine_cpu_free(thr_id);
+	blake512_cpu_free(thr_id);
+	groestl512_cpu_free(thr_id);
+	hefty_cpu_free(thr_id);
+	keccak512_cpu_free(thr_id);
+	sha256_cpu_free(thr_id);
+	combine_cpu_free(thr_id);
 
-    init[thr_id] = false;
+	init[thr_id] = false;
 
-    cudaDeviceSynchronize();
+	cudaDeviceSynchronize();
 }
 
 __host__
 void heavycoin_hash(uchar* output, const uchar* input, int len)
 {
-    unsigned char hash1[32];
-    unsigned char hash2[32];
-    uint32_t hash3[16];
-    uint32_t hash4[16];
-    uint32_t hash5[16];
-    uint32_t *final;
-    SHA256_CTX ctx;
-    sph_keccak512_context keccakCtx;
-    sph_groestl512_context groestlCtx;
-    sph_blake512_context blakeCtx;
-
-    HEFTY1(input, len, hash1);
-
-    /* HEFTY1 is new, so take an extra security measure to eliminate
-     * the possiblity of collisions:
-     *
-     *     Hash(x) = SHA256(x + HEFTY1(x))
-     *
-     * N.B. '+' is concatenation.
-     */
-    SHA256_Init(&ctx);
-    SHA256_Update(&ctx, input, len);
-    SHA256_Update(&ctx, hash1, sizeof(hash1));
-    SHA256_Final(hash2, &ctx);
-
-    /* Additional security: Do not rely on a single cryptographic hash
-     * function.  Instead, combine the outputs of 4 of the most secure
-     * cryptographic hash functions-- SHA256, KECCAK512, GROESTL512
-     * and BLAKE512.
-     */
-
-    sph_keccak512_init(&keccakCtx);
-    sph_keccak512(&keccakCtx, input, len);
-    sph_keccak512(&keccakCtx, hash1, sizeof(hash1));
-    sph_keccak512_close(&keccakCtx, (void *)&hash3);
-
-    sph_groestl512_init(&groestlCtx);
-    sph_groestl512(&groestlCtx, input, len);
-    sph_groestl512(&groestlCtx, hash1, sizeof(hash1));
-    sph_groestl512_close(&groestlCtx, (void *)&hash4);
-
-    sph_blake512_init(&blakeCtx);
-    sph_blake512(&blakeCtx, input, len);
-    sph_blake512(&blakeCtx, (unsigned char *)&hash1, sizeof(hash1));
-    sph_blake512_close(&blakeCtx, (void *)&hash5);
-
-    final = (uint32_t *)output;
-    combine_hashes(final, (uint32_t *)hash2, hash3, hash4, hash5);
+	unsigned char hash1[32];
+	unsigned char hash2[32];
+	uint32_t hash3[16];
+	uint32_t hash4[16];
+	uint32_t hash5[16];
+	uint32_t *final;
+	SHA256_CTX ctx;
+	sph_keccak512_context keccakCtx;
+	sph_groestl512_context groestlCtx;
+	sph_blake512_context blakeCtx;
+
+	HEFTY1(input, len, hash1);
+
+	/* HEFTY1 is new, so take an extra security measure to eliminate
+	 * the possiblity of collisions:
+	 *
+	 *     Hash(x) = SHA256(x + HEFTY1(x))
+	 *
+	 * N.B. '+' is concatenation.
+	 */
+	SHA256_Init(&ctx);
+	SHA256_Update(&ctx, input, len);
+	SHA256_Update(&ctx, hash1, sizeof(hash1));
+	SHA256_Final(hash2, &ctx);
+
+	/* Additional security: Do not rely on a single cryptographic hash
+	 * function.  Instead, combine the outputs of 4 of the most secure
+	 * cryptographic hash functions-- SHA256, KECCAK512, GROESTL512
+	 * and BLAKE512.
+	 */
+
+	sph_keccak512_init(&keccakCtx);
+	sph_keccak512(&keccakCtx, input, len);
+	sph_keccak512(&keccakCtx, hash1, sizeof(hash1));
+	sph_keccak512_close(&keccakCtx, (void *)&hash3);
+
+	sph_groestl512_init(&groestlCtx);
+	sph_groestl512(&groestlCtx, input, len);
+	sph_groestl512(&groestlCtx, hash1, sizeof(hash1));
+	sph_groestl512_close(&groestlCtx, (void *)&hash4);
+
+	sph_blake512_init(&blakeCtx);
+	sph_blake512(&blakeCtx, input, len);
+	sph_blake512(&blakeCtx, (unsigned char *)&hash1, sizeof(hash1));
+	sph_blake512_close(&blakeCtx, (void *)&hash5);
+
+	final = (uint32_t *)output;
+	combine_hashes(final, (uint32_t *)hash2, hash3, hash4, hash5);
 }
diff --git a/lyra2/lyra2RE.cu b/lyra2/lyra2RE.cu
index 05ffb52..d74bb16 100644
--- a/lyra2/lyra2RE.cu
+++ b/lyra2/lyra2RE.cu
@@ -122,14 +122,14 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
 		int order = 0;
 		uint32_t foundNonce;
 
-		*hashes_done = pdata[19] - first_nonce + throughput;
-
 		blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		TRACE("S")
 
+		*hashes_done = pdata[19] - first_nonce + throughput;
+
 		foundNonce = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		if (foundNonce != UINT32_MAX)
 		{
@@ -162,10 +162,15 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
 			}
 		}
 
+		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
 		pdata[19] += throughput;
 
-	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+	} while (!work_restart[thr_id].restart);
 
+	*hashes_done = pdata[19] - first_nonce;
 	return 0;
 }
 
diff --git a/lyra2/lyra2REv2.cu b/lyra2/lyra2REv2.cu
index 67e2dc9..bcc39d6 100644
--- a/lyra2/lyra2REv2.cu
+++ b/lyra2/lyra2REv2.cu
@@ -96,14 +96,13 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
 			cudaDeviceReset();
 			// reduce cpu usage
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
 		}
-		CUDA_LOG_ERROR();
 
 		blake256_cpu_init(thr_id, throughput);
 		keccak256_cpu_init(thr_id,throughput);
 		skein256_cpu_init(thr_id, throughput);
 		bmw256_cpu_init(thr_id, throughput);
-		CUDA_LOG_ERROR();
 
 		// SM 3 implentation requires a bit more memory
 		if (device_sm[dev_id] < 500 || cuda_arch[dev_id] < 500)
@@ -175,11 +174,15 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
 			}
 		}
 
+		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
 		pdata[19] += throughput;
 
-	} while (!work_restart[thr_id].restart && (max_nonce > ((uint64_t)(pdata[19]) + throughput)));
+	} while (!work_restart[thr_id].restart);
 
-	*hashes_done = pdata[19] - first_nonce + 1;
+	*hashes_done = pdata[19] - first_nonce;
 	return 0;
 }
 
diff --git a/myriadgroestl.cpp b/myriadgroestl.cpp
index f27d060..4311120 100644
--- a/myriadgroestl.cpp
+++ b/myriadgroestl.cpp
@@ -43,7 +43,7 @@ int scanhash_myriad(int thr_id, struct work *work, uint32_t max_nonce, unsigned
 	uint32_t *outputHash = (uint32_t*)malloc(throughput * 64);
 
 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x0000ff;
+		ptarget[7] = 0x0000ff;
 
 	// init
 	if(!init[thr_id])
@@ -63,11 +63,11 @@ int scanhash_myriad(int thr_id, struct work *work, uint32_t max_nonce, unsigned
 		// GPU
 		uint32_t foundNounce = UINT32_MAX;
 
-		*hashes_done = pdata[19] - start_nonce + throughput;
-
 		myriadgroestl_cpu_hash(thr_id, throughput, pdata[19], outputHash, &foundNounce);
 
-		if (foundNounce < UINT32_MAX)
+		*hashes_done = pdata[19] - start_nonce + throughput;
+
+		if (foundNounce < UINT32_MAX && bench_algo < 0)
 		{
 			uint32_t _ALIGN(64) vhash[8];
 			endiandata[19] = swab32(foundNounce);
@@ -82,8 +82,7 @@ int scanhash_myriad(int thr_id, struct work *work, uint32_t max_nonce, unsigned
 			}
 		}
 
-		if ((uint64_t) pdata[19] + throughput > max_nonce) {
-			*hashes_done = pdata[19] - start_nonce;
+		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
 			pdata[19] = max_nonce;
 			break;
 		}
@@ -91,6 +90,8 @@ int scanhash_myriad(int thr_id, struct work *work, uint32_t max_nonce, unsigned
 
 	} while (!work_restart[thr_id].restart);
 
+	*hashes_done = max_nonce - start_nonce;
+
 	free(outputHash);
 	return 0;
 }
diff --git a/neoscrypt/neoscrypt.cpp b/neoscrypt/neoscrypt.cpp
index 0e07845..b0cb1cb 100644
--- a/neoscrypt/neoscrypt.cpp
+++ b/neoscrypt/neoscrypt.cpp
@@ -79,11 +79,16 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign
 			}
 		}
 
+		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
+
 		pdata[19] += throughput;
 
-	} while (!work_restart[thr_id].restart && (max_nonce > ((uint64_t)(pdata[19]) + throughput)));
+	} while (!work_restart[thr_id].restart);
 
-	*hashes_done = pdata[19] - first_nonce + 1;
+	*hashes_done = pdata[19] - first_nonce;
 	return 0;
 }
 
diff --git a/pentablake.cu b/pentablake.cu
index 96eb2a1..2abdeb7 100644
--- a/pentablake.cu
+++ b/pentablake.cu
@@ -110,9 +110,14 @@ extern "C" int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_n
 			}
 		}
 
+		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
+
 		pdata[19] += throughput;
 
-	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+	} while (!work_restart[thr_id].restart);
 
 	return rc;
 }
diff --git a/qubit/deep.cu b/qubit/deep.cu
index afbe9fb..a5d4c63 100644
--- a/qubit/deep.cu
+++ b/qubit/deep.cu
@@ -117,9 +117,14 @@ extern "C" int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce,
 			}
 		}
 
+		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
+
 		pdata[19] += throughput;
 
-	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+	} while (!work_restart[thr_id].restart);
 
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
diff --git a/qubit/luffa.cu b/qubit/luffa.cu
index afbf545..db283cb 100644
--- a/qubit/luffa.cu
+++ b/qubit/luffa.cu
@@ -36,19 +36,21 @@ extern "C" int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce,
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
-	uint32_t throughput = cuda_default_throughput(thr_id, 1U << 22); // 256*256*8*8
+	uint32_t throughput = cuda_default_throughput(thr_id, 1U << 21);
 	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
 
 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x0000f;
+		ptarget[7] = 0x0000f;
 
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
-		CUDA_LOG_ERROR();
-		//if (opt_cudaschedule == -1) // to reduce cpu usage...
-		//	cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
-		//CUDA_LOG_ERROR();
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
 
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
 
@@ -65,10 +67,9 @@ extern "C" int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce,
 	cuda_check_cpu_setTarget(ptarget);
 
 	do {
-		int order = 0;
-		*hashes_done = pdata[19] - first_nonce + throughput;
+		qubit_luffa512_cpu_hash_80(thr_id, (int) throughput, pdata[19], d_hash[thr_id], 0);
 
-		qubit_luffa512_cpu_hash_80(thr_id, (int) throughput, pdata[19], d_hash[thr_id], order++);
+		*hashes_done = pdata[19] - first_nonce + throughput;
 
 		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != UINT32_MAX)
@@ -86,8 +87,8 @@ extern "C" int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce,
 			}
 		}
 
-		if ((uint64_t) throughput + pdata[19] > max_nonce) {
-			// pdata[19] = max_nonce;
+		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
 			break;
 		}
 
@@ -95,7 +96,7 @@ extern "C" int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce,
 
 	} while (!work_restart[thr_id].restart);
 
-	*hashes_done = pdata[19] - first_nonce + 1;
+	*hashes_done = pdata[19] - first_nonce;
 	return 0;
 }
 
diff --git a/skein.cu b/skein.cu
index f178a78..97a7c0b 100644
--- a/skein.cu
+++ b/skein.cu
@@ -453,9 +453,7 @@ extern "C" int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_no
 			}
 		}
 
-		if ((uint64_t) throughput + pdata[19] > max_nonce) {
-			//applog(LOG_DEBUG, "done... max=%u", max_nonce);
-			*hashes_done = pdata[19] - first_nonce;
+		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
 			pdata[19] = max_nonce;
 			break;
 		}
@@ -464,6 +462,8 @@ extern "C" int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_no
 
 	} while (!work_restart[thr_id].restart);
 
+	*hashes_done = pdata[19] - first_nonce;
+
 	return 0;
 }
 
diff --git a/skein2.cpp b/skein2.cpp
index b05ccb3..38c0a5b 100644
--- a/skein2.cpp
+++ b/skein2.cpp
@@ -110,8 +110,7 @@ int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned
 			}
 		}
 
-		if ((uint64_t) throughput + pdata[19] > max_nonce) {
-			*hashes_done = pdata[19] - first_nonce;
+		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
 			pdata[19] = max_nonce;
 			break;
 		}
@@ -120,6 +119,8 @@ int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned
 
 	} while (!work_restart[thr_id].restart);
 
+	*hashes_done = pdata[19] - first_nonce;
+
 	return 0;
 }
 
diff --git a/x11/c11.cu b/x11/c11.cu
index 0c7e25d..b48e85e 100644
--- a/x11/c11.cu
+++ b/x11/c11.cu
@@ -200,14 +200,20 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u
 			} else {
 				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
 				pdata[19] = foundNonce + 1;
+				continue;
 			}
 		}
 
+		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
+
 		pdata[19] += throughput;
 
-	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+	} while (!work_restart[thr_id].restart);
 
-	*hashes_done = pdata[19] - first_nonce + 1;
+	*hashes_done = pdata[19] - first_nonce;
 	return 0;
 }
 
@@ -228,4 +234,4 @@ extern "C" void free_c11(int thr_id)
 	init[thr_id] = false;
 
 	cudaDeviceSynchronize();
-}
\ No newline at end of file
+}
diff --git a/x11/fresh.cu b/x11/fresh.cu
index 8e2b42a..5e3bf38 100644
--- a/x11/fresh.cu
+++ b/x11/fresh.cu
@@ -78,18 +78,19 @@ extern "C" int scanhash_fresh(int thr_id, struct work* work, uint32_t max_nonce,
 	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
 
 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x00ff;
+		ptarget[7] = 0x00ff;
 
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
+		CUDA_LOG_ERROR();
+
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t)64 * throughput + 4), -1);
 
 		x11_shavite512_cpu_init(thr_id, throughput);
 		x11_simd512_cpu_init(thr_id, throughput);
 		x11_echo512_cpu_init(thr_id, throughput);
 
-		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput + 4), 0);
-
 		cuda_check_cpu_init(thr_id, throughput);
 
 		init[thr_id] = true;
@@ -101,8 +102,6 @@ extern "C" int scanhash_fresh(int thr_id, struct work* work, uint32_t max_nonce,
 	x11_shavite512_setBlock_80((void*)endiandata);
 	cuda_check_cpu_setTarget(ptarget);
 	do {
-		uint32_t Htarg = ptarget[7];
-
 		uint32_t foundNonce;
 		int order = 0;
 
@@ -128,7 +127,7 @@ extern "C" int scanhash_fresh(int thr_id, struct work* work, uint32_t max_nonce,
 			be32enc(&endiandata[19], foundNonce);
 			fresh_hash(vhash64, endiandata);
 
-			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
+			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
 				int res = 1;
 				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
 				work_set_target_ratio(work, vhash64);
diff --git a/x11/s3.cu b/x11/s3.cu
index 45609c8..97e6068 100644
--- a/x11/s3.cu
+++ b/x11/s3.cu
@@ -147,11 +147,16 @@ extern "C" int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, un
 			}
 		}
 
+		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
+
 		pdata[19] += throughput;
 
-	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+	} while (!work_restart[thr_id].restart);
 
-	*hashes_done = pdata[19] - first_nonce + 1;
+	*hashes_done = pdata[19] - first_nonce;
 	return 0;
 }
 
diff --git a/x11/x11.cu b/x11/x11.cu
index d2e93ca..367aa00 100644
--- a/x11/x11.cu
+++ b/x11/x11.cu
@@ -138,7 +138,7 @@ extern "C" int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, u
 		if (x11_simd512_cpu_init(thr_id, throughput) != 0) {
 			return 0;
 		}
-		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 64 * throughput), 0); // why 64 ?
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
 
 		cuda_check_cpu_init(thr_id, throughput);
 
@@ -205,14 +205,19 @@ extern "C" int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, u
 			} else {
 				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
 				pdata[19] = foundNonce + 1;
+				continue;
 			}
 		}
 
+		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
 		pdata[19] += throughput;
 
-	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+	} while (!work_restart[thr_id].restart);
 
-	*hashes_done = pdata[19] - first_nonce + 1;
+	*hashes_done = pdata[19] - first_nonce;
 	return 0;
 }
 
diff --git a/x13/x13.cu b/x13/x13.cu
index faa73ab..39e0fe2 100644
--- a/x13/x13.cu
+++ b/x13/x13.cu
@@ -206,7 +206,7 @@ extern "C" int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, u
 			}
 		}
 
-		if ((uint64_t)pdata[19] + throughput > (uint64_t)max_nonce) {
+		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
 			pdata[19] = max_nonce;
 			break;
 		}
@@ -214,7 +214,7 @@ extern "C" int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, u
 
 	} while (!work_restart[thr_id].restart);
 
-	*hashes_done = pdata[19] - first_nonce + 1;
+	*hashes_done = pdata[19] - first_nonce;
 
 	CUDA_LOG_ERROR();
 
diff --git a/x15/whirlpool.cu b/x15/whirlpool.cu
index 31f6466..4fac950 100644
--- a/x15/whirlpool.cu
+++ b/x15/whirlpool.cu
@@ -83,12 +83,14 @@ extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce,
 		uint32_t foundNonce;
 		int order = 0;
 
+		*hashes_done = pdata[19] - first_nonce + throughput;
+
 		whirlpool512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 
 		foundNonce = whirlpool512_cpu_finalhash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
-		if (foundNonce != UINT32_MAX)
+		if (foundNonce != UINT32_MAX && bench_algo < 0)
 		{
 			const uint32_t Htarg = ptarget[7];
 			uint32_t vhash[8];
@@ -97,7 +99,6 @@ extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce,
 
 			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
 				int res = 1;
-				*hashes_done = pdata[19] - first_nonce + throughput;
 				work_set_target_ratio(work, vhash);
 				#if 0
 				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
@@ -112,9 +113,13 @@ extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce,
 				applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce);
 			}
 		}
+		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
 		pdata[19] += throughput;
 
-	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+	} while (!work_restart[thr_id].restart);
 
 	*hashes_done = pdata[19] - first_nonce;
 	return 0;
diff --git a/x15/whirlpoolx.cu b/x15/whirlpoolx.cu
index 4416512..517d5ca 100644
--- a/x15/whirlpoolx.cu
+++ b/x15/whirlpoolx.cu
@@ -49,12 +49,12 @@ extern "C" int scanhash_whirlx(int thr_id,  struct work* work, uint32_t max_nonc
 	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
 
 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x0000ff;
+		ptarget[7] = 0x000f;
 
 	if (!init[thr_id]) {
 		cudaSetDevice(device_map[thr_id]);
 
-		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), -1);
 
 		whirlpoolx_cpu_init(thr_id, throughput);
 
@@ -70,15 +70,16 @@ extern "C" int scanhash_whirlx(int thr_id,  struct work* work, uint32_t max_nonc
 
 	do {
 		uint32_t foundNonce = whirlpoolx_cpu_hash(thr_id, throughput, pdata[19]);
-		if (foundNonce != UINT32_MAX)
+
+		*(hashes_done) = pdata[19] - first_nonce + throughput;
+
+		if (foundNonce != UINT32_MAX && bench_algo < 0)
 		{
 			const uint32_t Htarg = ptarget[7];
 			uint32_t vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
 			whirlxHash(vhash64, endiandata);
 
-			*hashes_done = pdata[19] - first_nonce + throughput;
-
 			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
 				work_set_target_ratio(work, vhash64);
 				pdata[19] = foundNonce;
@@ -88,15 +89,16 @@ extern "C" int scanhash_whirlx(int thr_id,  struct work* work, uint32_t max_nonc
 			}
 		}
 
-		pdata[19] += throughput;
-
-		if (((uint64_t)pdata[19]+throughput) >= max_nonce) {
+		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
 			break;
 		}
 
+		pdata[19] += throughput;
+
 	} while (!work_restart[thr_id].restart);
 
-	*(hashes_done) = pdata[19] - first_nonce + 1;
+	*(hashes_done) = pdata[19] - first_nonce;
 
 	return 0;
 }
diff --git a/x15/x14.cu b/x15/x14.cu
index feaf6d4..d020f5c 100644
--- a/x15/x14.cu
+++ b/x15/x14.cu
@@ -219,13 +219,19 @@ extern "C" int scanhash_x14(int thr_id,  struct work* work, uint32_t max_nonce,
 				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
 			}
 		}
+
+		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
+
 		pdata[19] += throughput;
 
-	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+	} while (!work_restart[thr_id].restart);
 
 	CUDA_LOG_ERROR();
 
-	*hashes_done = pdata[19] - first_nonce + 1;
+	*hashes_done = pdata[19] - first_nonce;
 	return 0;
 }
 
diff --git a/x15/x15.cu b/x15/x15.cu
index d74dfa6..6d4799d 100644
--- a/x15/x15.cu
+++ b/x15/x15.cu
@@ -226,11 +226,16 @@ extern "C" int scanhash_x15(int thr_id,  struct work* work, uint32_t max_nonce,
 			}
 		}
 
+		if ((uint64_t) throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
+
 		pdata[19] += throughput;
 
-	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+	} while (!work_restart[thr_id].restart);
 
-	*hashes_done = pdata[19] - first_nonce + 1;
+	*hashes_done = pdata[19] - first_nonce;
 
 	return 0;
 }
diff --git a/x17/x17.cu b/x17/x17.cu
index ebf2b4f..3be51d1 100644
--- a/x17/x17.cu
+++ b/x17/x17.cu
@@ -251,11 +251,16 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u
 			}
 		}
 
+		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
+
 		pdata[19] += throughput;
 
 	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
 
-	*hashes_done = pdata[19] - first_nonce + 1;
+	*hashes_done = pdata[19] - first_nonce;
 	return 0;
 }