diff --git a/Algo256/blake256.cu b/Algo256/blake256.cu index 4a08410..09184ef 100644 --- a/Algo256/blake256.cu +++ b/Algo256/blake256.cu @@ -439,7 +439,7 @@ extern "C" int scanhash_blake256(int thr_id, struct work* work, uint32_t max_non #endif *hashes_done = pdata[19] - first_nonce + throughput; - if (foundNonce != UINT32_MAX) + if (foundNonce != UINT32_MAX && bench_algo == -1) { uint32_t vhashcpu[8]; uint32_t Htarg = (uint32_t)targetHigh; @@ -478,7 +478,8 @@ extern "C" int scanhash_blake256(int thr_id, struct work* work, uint32_t max_non } } - if ((uint64_t) pdata[19] + throughput > (uint64_t) max_nonce) { + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; break; } diff --git a/Algo256/bmw.cu b/Algo256/bmw.cu index c578d98..fc05db2 100644 --- a/Algo256/bmw.cu +++ b/Algo256/bmw.cu @@ -91,7 +91,7 @@ extern "C" int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, u } } - if ((uint64_t) throughput + pdata[19] > max_nonce) { + if ((uint64_t) throughput + pdata[19] >= max_nonce) { pdata[19] = max_nonce; break; } diff --git a/Algo256/keccak256.cu b/Algo256/keccak256.cu index d2e3ec1..4d5f315 100644 --- a/Algo256/keccak256.cu +++ b/Algo256/keccak256.cu @@ -46,7 +46,7 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) - ptarget[7] = 0x00ff; + ptarget[7] = 0x000f; if (!init[thr_id]) { cudaSetDevice(device_map[thr_id]); @@ -68,7 +68,7 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no *hashes_done = pdata[19] - first_nonce + throughput; uint32_t foundNonce = keccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - if (foundNonce != UINT32_MAX) + if (foundNonce != UINT32_MAX && bench_algo < 0) { uint32_t _ALIGN(64) vhash64[8]; be32enc(&endiandata[19], foundNonce); @@ -84,7 +84,8 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no } } - if ((uint64_t) pdata[19] + throughput > max_nonce) { + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; break; } @@ -92,6 +93,7 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no } while (!work_restart[thr_id].restart); + *hashes_done = pdata[19] - first_nonce; return 0; } diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu index 64d7aab..495057f 100644 --- a/JHA/jackpotcoin.cu +++ b/JHA/jackpotcoin.cu @@ -238,16 +238,17 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc } } - if ((uint64_t) pdata[19] + throughput > max_nonce) { - *hashes_done = pdata[19] - first_nonce; + if ((uint64_t) throughput + pdata[19] >= max_nonce) { pdata[19] = max_nonce; - return 0; + break; } pdata[19] += throughput; } while (!work_restart[thr_id].restart); + *hashes_done = pdata[19] - first_nonce; + CUDA_LOG_ERROR(); return 0; diff --git a/Makefile.am b/Makefile.am index e53fb46..d1af8f9 100644 --- a/Makefile.am +++ b/Makefile.am @@ -81,7 +81,7 @@ ccminer_CPPFLAGS = @LIBCURL_CPPFLAGS@ @PCIFLAGS@ @OPENMP_CFLAGS@ $(CPPFLAGS) $(P nvcc_ARCH = -gencode=arch=compute_50,code=\"sm_50,compute_50\" -#nvcc_ARCH += -gencode=arch=compute_52,code=\"sm_52,compute_52\" +nvcc_ARCH += -gencode=arch=compute_52,code=\"sm_52,compute_52\" #nvcc_ARCH += -gencode=arch=compute_35,code=\"sm_35,compute_35\" #nvcc_ARCH += -gencode=arch=compute_30,code=\"sm_30,compute_30\" #nvcc_ARCH += -gencode=arch=compute_20,code=\"sm_21,compute_20\" diff --git a/bench.cpp 
b/bench.cpp index 6e94e31..0a3fdf3 100644 --- a/bench.cpp +++ b/bench.cpp @@ -89,6 +89,9 @@ bool bench_algo_switch_next(int thr_id) int prev_algo = algo; int dev_id = device_map[thr_id % MAX_GPUS]; int mfree, mused; + // doesnt seems enough to prevent device slow down + // after some algo switchs + bool need_reset = (gpu_threads == 1); algo++; @@ -143,6 +146,7 @@ bool bench_algo_switch_next(int thr_id) gpulog(LOG_WARNING, thr_id, "possible %d MB memory leak in %s! %d MB free", (device_mem_free[thr_id] - mfree), algo_names[prev_algo], mfree); cuda_reset_device(thr_id, NULL); // force to free the leak + need_reset = false; mfree = cuda_available_memory(thr_id); } // store used memory per algo @@ -152,14 +156,13 @@ bool bench_algo_switch_next(int thr_id) // store to dump a table per gpu later algo_hashrates[thr_id][prev_algo] = hashrate; - // wait the other threads to display logs correctly if (opt_n_threads > 1) { pthread_barrier_wait(&algo_barr); } if (algo == ALGO_AUTO) - return false; + return false; // all algos done // mutex primary used for the stats purge pthread_mutex_lock(&bench_lock); @@ -170,6 +173,9 @@ bool bench_algo_switch_next(int thr_id) thr_hashrates[thr_id] = 0; // reset for minmax64 pthread_mutex_unlock(&bench_lock); + if (need_reset) + cuda_reset_device(thr_id, NULL); + if (thr_id == 0) applog(LOG_BLUE, "Benchmark algo %s...", algo_names[algo]); diff --git a/ccminer.cpp b/ccminer.cpp index 1ba68c1..b995d33 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -479,9 +479,11 @@ void proper_exit(int reason) reason = app_exit_code; } + pthread_mutex_lock(&stats_lock); if (check_dups) hashlog_purge_all(); stats_purge_all(); + pthread_mutex_unlock(&stats_lock); #ifdef WIN32 timeEndPeriod(1); // else never executed @@ -496,7 +498,7 @@ void proper_exit(int reason) #endif free(opt_syslog_pfx); free(opt_api_allow); - free(work_restart); + //free(work_restart); //free(thr_info); exit(reason); } @@ -1709,18 +1711,22 @@ static void *miner_thread(void *userdata) if (max64 < minmax) { switch (opt_algo) { case ALGO_BLAKECOIN: - case ALGO_BLAKE: - case ALGO_WHIRLPOOLX: minmax = 0x80000000U; break; + case ALGO_BLAKE: case ALGO_BMW: + case ALGO_WHIRLPOOLX: minmax = 0x40000000U; break; + case ALGO_KECCAK: case ALGO_LUFFA: - minmax = 0x2000000; + case ALGO_SKEIN: + case ALGO_SKEIN2: + minmax = 0x1000000; break; case ALGO_C11: case ALGO_DEEP: + case ALGO_HEAVY: case ALGO_LYRA2v2: case ALGO_S3: case ALGO_X11: @@ -1729,7 +1735,6 @@ static void *miner_thread(void *userdata) case ALGO_WHIRLPOOL: minmax = 0x400000; break; - case ALGO_KECCAK: case ALGO_JACKPOT: case ALGO_X14: case ALGO_X15: diff --git a/cuda.cpp b/cuda.cpp index fb32cfa..a9ba63b 100644 --- a/cuda.cpp +++ b/cuda.cpp @@ -176,9 +176,11 @@ void cuda_reset_device(int thr_id, bool *init) } cudaDeviceReset(); if (opt_cudaschedule >= 0) { - cudaSetDevice(dev_id); cudaSetDeviceFlags((unsigned)(opt_cudaschedule & cudaDeviceScheduleMask)); + } else { + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); } + cudaDeviceSynchronize(); } // return free memory in megabytes diff --git a/cuda_checkhash.cu b/cuda_checkhash.cu index 20d9e45..7fdbc41 100644 --- a/cuda_checkhash.cu +++ b/cuda_checkhash.cu @@ -121,6 +121,9 @@ uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uin dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); + if (bench_algo >= 0) // dont interrupt the global benchmark + return UINT32_MAX; + if (!init_done) { applog(LOG_ERR, "missing call to cuda_check_cpu_init"); return 
UINT32_MAX; @@ -143,6 +146,9 @@ uint32_t cuda_check_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); + if (bench_algo >= 0) // dont interrupt the global benchmark + return UINT32_MAX; + if (!init_done) { applog(LOG_ERR, "missing call to cuda_check_cpu_init"); return UINT32_MAX; @@ -237,9 +243,12 @@ uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNoun uint32_t result = UINT32_MAX; + if (bench_algo >= 0) // dont interrupt the global benchmark + return result; + if (!init_done) { applog(LOG_ERR, "missing call to cuda_check_cpu_init"); - return UINT32_MAX; + return result; } cudaMemset(d_resNonces[thr_id], 0xff, sizeof(uint32_t)); diff --git a/cuda_nist5.cu b/cuda_nist5.cu index 744f844..bcc8b3a 100644 --- a/cuda_nist5.cu +++ b/cuda_nist5.cu @@ -135,11 +135,17 @@ extern "C" int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce, } } + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + } while (!work_restart[thr_id].restart); out: + *hashes_done = pdata[19] - first_nonce; #ifdef USE_STREAMS for (int i = 0; i < 5; i++) cudaStreamDestroy(stream[i]); diff --git a/fuguecoin.cpp b/fuguecoin.cpp index d7681c8..7c703ce 100644 --- a/fuguecoin.cpp +++ b/fuguecoin.cpp @@ -65,7 +65,9 @@ int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigne uint32_t foundNounce = UINT32_MAX; fugue256_cpu_hash(thr_id, throughput, pdata[19], NULL, &foundNounce); - if (foundNounce < UINT32_MAX) + *hashes_done = pdata[19] - start_nonce + throughput; + + if (foundNounce < UINT32_MAX && bench_algo < 0) { uint32_t vhash[8]; sph_fugue256_context ctx_fugue; @@ -79,14 +81,13 @@ int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigne { work_set_target_ratio(work, vhash); pdata[19] = foundNounce; - *hashes_done = foundNounce - start_nonce + 1; return 1; } else { gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNounce); } } - if ((uint64_t) pdata[19] + throughput > (uint64_t) max_nonce) { + if ((uint64_t) throughput + pdata[19] >= max_nonce) { pdata[19] = max_nonce; break; } @@ -95,7 +96,7 @@ int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigne } while (!work_restart[thr_id].restart); - *hashes_done = pdata[19] - start_nonce + 1; + *hashes_done = pdata[19] - start_nonce; return 0; } diff --git a/groestlcoin.cpp b/groestlcoin.cpp index c3f803a..89f178a 100644 --- a/groestlcoin.cpp +++ b/groestlcoin.cpp @@ -39,7 +39,7 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi uint32_t *outputHash = (uint32_t*)malloc((size_t) 64* throughput); if (opt_benchmark) - ptarget[7] = 0x000ff; + ptarget[7] = 0x001f; if (!init[thr_id]) { @@ -62,7 +62,7 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi // GPU hash groestlcoin_cpu_hash(thr_id, throughput, pdata[19], outputHash, &foundNounce); - if (foundNounce < UINT32_MAX) + if (foundNounce < UINT32_MAX && bench_algo < 0) { uint32_t _ALIGN(64) vhash[8]; endiandata[19] = swab32(foundNounce); @@ -78,14 +78,15 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi } } - if ((uint64_t) pdata[19] + throughput > max_nonce) { - *hashes_done = pdata[19] - start_nonce + 1; + if ((uint64_t) throughput + pdata[19] >= max_nonce) { pdata[19] 
= max_nonce; break; } pdata[19] += throughput; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + } while (!work_restart[thr_id].restart); + + *hashes_done = pdata[19] - start_nonce; free(outputHash); return 0; diff --git a/heavy/heavy.cu b/heavy/heavy.cu index 936006a..40fb0c9 100644 --- a/heavy/heavy.cu +++ b/heavy/heavy.cu @@ -43,99 +43,99 @@ extern uint32_t *heavy_heftyHashes[MAX_GPUS]; /* Combines top 64-bits from each hash into a single hash */ static void combine_hashes(uint32_t *out, const uint32_t *hash1, const uint32_t *hash2, const uint32_t *hash3, const uint32_t *hash4) { - const uint32_t *hash[4] = { hash1, hash2, hash3, hash4 }; - int bits; - unsigned int i; - uint32_t mask; - unsigned int k; - - /* Transpose first 64 bits of each hash into out */ - memset(out, 0, 32); - bits = 0; - for (i = 7; i >= 6; i--) { - for (mask = 0x80000000; mask; mask >>= 1) { - for (k = 0; k < 4; k++) { - out[(255 - bits)/32] <<= 1; - if ((hash[k][i] & mask) != 0) - out[(255 - bits)/32] |= 1; - bits++; - } - } - } + const uint32_t *hash[4] = { hash1, hash2, hash3, hash4 }; + int bits; + unsigned int i; + uint32_t mask; + unsigned int k; + + /* Transpose first 64 bits of each hash into out */ + memset(out, 0, 32); + bits = 0; + for (i = 7; i >= 6; i--) { + for (mask = 0x80000000; mask; mask >>= 1) { + for (k = 0; k < 4; k++) { + out[(255 - bits)/32] <<= 1; + if ((hash[k][i] & mask) != 0) + out[(255 - bits)/32] |= 1; + bits++; + } + } + } } #ifdef _MSC_VER #include static uint32_t __inline bitsset( uint32_t x ) { - DWORD r = 0; - _BitScanReverse(&r, x); - return r; + DWORD r = 0; + _BitScanReverse(&r, x); + return r; } #else static uint32_t bitsset( uint32_t x ) { - return 31-__builtin_clz(x); + return 31-__builtin_clz(x); } #endif // Finde das high bit in einem Multiword-Integer. static int findhighbit(const uint32_t *ptarget, int words) { - int i; - int highbit = 0; - for (i=words-1; i >= 0; --i) - { - if (ptarget[i] != 0) { - highbit = i*32 + bitsset(ptarget[i])+1; - break; - } - } - return highbit; + int i; + int highbit = 0; + for (i=words-1; i >= 0; --i) + { + if (ptarget[i] != 0) { + highbit = i*32 + bitsset(ptarget[i])+1; + break; + } + } + return highbit; } // Generiere ein Multiword-Integer das die Zahl // (2 << highbit) - 1 repräsentiert. static void genmask(uint32_t *ptarget, int words, int highbit) { - int i; - for (i=words-1; i >= 0; --i) - { - if ((i+1)*32 <= highbit) - ptarget[i] = UINT32_MAX; - else if (i*32 > highbit) - ptarget[i] = 0x00000000; - else - ptarget[i] = (1 << (highbit-i*32)) - 1; - } + int i; + for (i=words-1; i >= 0; --i) + { + if ((i+1)*32 <= highbit) + ptarget[i] = UINT32_MAX; + else if (i*32 > highbit) + ptarget[i] = 0x00000000; + else + ptarget[i] = (1 << (highbit-i*32)) - 1; + } } struct check_nonce_for_remove { - check_nonce_for_remove(uint64_t target, uint32_t *hashes, uint32_t hashlen, uint32_t startNonce) : - m_target(target), - m_hashes(hashes), - m_hashlen(hashlen), - m_startNonce(startNonce) { } - - uint64_t m_target; - uint32_t *m_hashes; - uint32_t m_hashlen; - uint32_t m_startNonce; - - __device__ - bool operator()(const uint32_t x) - { - // Position im Hash Buffer - uint32_t hashIndex = x - m_startNonce; - // Wert des Hashes (als uint64_t) auslesen. - // Steht im 6. und 7. 
Wort des Hashes (jeder dieser Hashes hat 512 Bits) - uint64_t hashValue = *((uint64_t*)(&m_hashes[m_hashlen*hashIndex + 6])); - bool res = (hashValue & m_target) != hashValue; - //printf("ndx=%x val=%08x target=%lx\n", hashIndex, hashValue, m_target); - // gegen das Target prüfen. Es dürfen nur Bits aus dem Target gesetzt sein. - return res; - } + check_nonce_for_remove(uint64_t target, uint32_t *hashes, uint32_t hashlen, uint32_t startNonce) : + m_target(target), + m_hashes(hashes), + m_hashlen(hashlen), + m_startNonce(startNonce) { } + + uint64_t m_target; + uint32_t *m_hashes; + uint32_t m_hashlen; + uint32_t m_startNonce; + + __device__ + bool operator()(const uint32_t x) + { + // Position im Hash Buffer + uint32_t hashIndex = x - m_startNonce; + // Wert des Hashes (als uint64_t) auslesen. + // Steht im 6. und 7. Wort des Hashes (jeder dieser Hashes hat 512 Bits) + uint64_t hashValue = *((uint64_t*)(&m_hashes[m_hashlen*hashIndex + 6])); + bool res = (hashValue & m_target) != hashValue; + //printf("ndx=%x val=%08x target=%lx\n", hashIndex, hashValue, m_target); + // gegen das Target prüfen. Es dürfen nur Bits aus dem Target gesetzt sein. + return res; + } }; static bool init[MAX_GPUS] = { 0 }; @@ -143,245 +143,252 @@ static bool init[MAX_GPUS] = { 0 }; __host__ int scanhash_heavy(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done, uint32_t maxvote, int blocklen) { - uint32_t *pdata = work->data; - uint32_t *ptarget = work->target; - const uint32_t first_nonce = pdata[19]; - // CUDA will process thousands of threads. - uint32_t throughput = cuda_default_throughput(thr_id, (1U << 19) - 256); - if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); - - int rc = 0; - uint32_t *hash = NULL; - uint32_t *cpu_nonceVector = NULL; - CUDA_SAFE_CALL(cudaMallocHost(&hash, throughput*8*sizeof(uint32_t))); - CUDA_SAFE_CALL(cudaMallocHost(&cpu_nonceVector, throughput*sizeof(uint32_t))); - - int nrmCalls[6]; - memset(nrmCalls, 0, sizeof(int) * 6); - - if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x00ff; - - // für jeden Hash ein individuelles Target erstellen basierend - // auf dem höchsten Bit, das in ptarget gesetzt ist. 
- int highbit = findhighbit(ptarget, 8); - uint32_t target2[2], target3[2], target4[2], target5[2]; - genmask(target2, 2, highbit/4+(((highbit%4)>3)?1:0) ); // SHA256 - genmask(target3, 2, highbit/4+(((highbit%4)>2)?1:0) ); // keccak512 - genmask(target4, 2, highbit/4+(((highbit%4)>1)?1:0) ); // groestl512 - genmask(target5, 2, highbit/4+(((highbit%4)>0)?1:0) ); // blake512 - - if (!init[thr_id]) - { - hefty_cpu_init(thr_id, throughput); - sha256_cpu_init(thr_id, throughput); - keccak512_cpu_init(thr_id, throughput); - groestl512_cpu_init(thr_id, throughput); - blake512_cpu_init(thr_id, throughput); - combine_cpu_init(thr_id, throughput); - - CUDA_SAFE_CALL(cudaMalloc(&heavy_nonceVector[thr_id], sizeof(uint32_t) * throughput)); - - init[thr_id] = true; - } - - if (blocklen == HEAVYCOIN_BLKHDR_SZ) - { - uint16_t *ext = (uint16_t *)&pdata[20]; - - if (opt_vote > maxvote && !opt_benchmark) { - applog(LOG_WARNING, "Your block reward vote (%hu) exceeds " - "the maxvote reported by the pool (%hu).", - opt_vote, maxvote); - } - - if (opt_trust_pool && opt_vote > maxvote) { - applog(LOG_WARNING, "Capping block reward vote to maxvote reported by pool."); - ext[0] = maxvote; - } - else - ext[0] = opt_vote; - } - - // Setze die Blockdaten - hefty_cpu_setBlock(thr_id, throughput, pdata, blocklen); - sha256_cpu_setBlock(pdata, blocklen); - keccak512_cpu_setBlock(pdata, blocklen); - groestl512_cpu_setBlock(pdata, blocklen); - blake512_cpu_setBlock(pdata, blocklen); - - do { - uint32_t actualNumberOfValuesInNonceVectorGPU = throughput; - - ////// Compaction init - - hefty_cpu_hash(thr_id, throughput, pdata[19]); - sha256_cpu_hash(thr_id, throughput, pdata[19]); - - // Hier ist die längste CPU Wartephase. Deshalb ein strategisches MyStreamSynchronize() hier. - MyStreamSynchronize(NULL, 1, thr_id); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + // CUDA will process thousands of threads. + uint32_t throughput = cuda_default_throughput(thr_id, (1U << 19) - 256); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + int rc = 0; + uint32_t *hash = NULL; + uint32_t *cpu_nonceVector = NULL; + + int nrmCalls[6]; + memset(nrmCalls, 0, sizeof(int) * 6); + + if (opt_benchmark) + ptarget[7] = 0x000f; + + // für jeden Hash ein individuelles Target erstellen basierend + // auf dem höchsten Bit, das in ptarget gesetzt ist. 
+ int highbit = findhighbit(ptarget, 8); + uint32_t target2[2], target3[2], target4[2], target5[2]; + genmask(target2, 2, highbit/4+(((highbit%4)>3)?1:0) ); // SHA256 + genmask(target3, 2, highbit/4+(((highbit%4)>2)?1:0) ); // keccak512 + genmask(target4, 2, highbit/4+(((highbit%4)>1)?1:0) ); // groestl512 + genmask(target5, 2, highbit/4+(((highbit%4)>0)?1:0) ); // blake512 + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + + hefty_cpu_init(thr_id, throughput); + sha256_cpu_init(thr_id, throughput); + keccak512_cpu_init(thr_id, throughput); + groestl512_cpu_init(thr_id, throughput); + blake512_cpu_init(thr_id, throughput); + combine_cpu_init(thr_id, throughput); + + CUDA_SAFE_CALL(cudaMalloc(&heavy_nonceVector[thr_id], sizeof(uint32_t) * throughput)); + + init[thr_id] = true; + } + + // weird but require at least one cudaSetDevice first + CUDA_SAFE_CALL(cudaMallocHost(&hash, (size_t) 32 * throughput)); + CUDA_SAFE_CALL(cudaMallocHost(&cpu_nonceVector, sizeof(uint32_t) * throughput)); + + if (blocklen == HEAVYCOIN_BLKHDR_SZ) + { + uint16_t *ext = (uint16_t*) &pdata[20]; + + if (opt_vote > maxvote && !opt_benchmark) { + applog(LOG_WARNING, "Your block reward vote (%hu) exceeds the maxvote reported by the pool (%hu).", + opt_vote, maxvote); + } + + if (opt_trust_pool && opt_vote > maxvote) { + applog(LOG_WARNING, "Capping block reward vote to maxvote reported by pool."); + ext[0] = maxvote; + } + else + ext[0] = opt_vote; + } + + // Setze die Blockdaten + hefty_cpu_setBlock(thr_id, throughput, pdata, blocklen); + sha256_cpu_setBlock(pdata, blocklen); + keccak512_cpu_setBlock(pdata, blocklen); + groestl512_cpu_setBlock(pdata, blocklen); + blake512_cpu_setBlock(pdata, blocklen); + + do { + uint32_t actualNumberOfValuesInNonceVectorGPU = throughput; + + ////// Compaction init + + hefty_cpu_hash(thr_id, throughput, pdata[19]); + sha256_cpu_hash(thr_id, throughput, pdata[19]); + + // Hier ist die längste CPU Wartephase. Deshalb ein strategisches MyStreamSynchronize() hier. 
+ MyStreamSynchronize(NULL, 1, thr_id); #if USE_THRUST - thrust::device_ptr devNoncePtr(heavy_nonceVector[thr_id]); - thrust::device_ptr devNoncePtrEnd((heavy_nonceVector[thr_id]) + throughput); - - ////// Compaction - uint64_t *t = (uint64_t*) target2; - devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash2output[thr_id], 8, pdata[19])); - actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); - if(actualNumberOfValuesInNonceVectorGPU == 0) - goto emptyNonceVector; - - keccak512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); - - ////// Compaction - t = (uint64_t*) target3; - devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash3output[thr_id], 16, pdata[19])); - actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); - if(actualNumberOfValuesInNonceVectorGPU == 0) - goto emptyNonceVector; - - blake512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); - - ////// Compaction - t = (uint64_t*) target5; - devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash5output[thr_id], 16, pdata[19])); - actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); - if(actualNumberOfValuesInNonceVectorGPU == 0) - goto emptyNonceVector; - - groestl512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); - - ////// Compaction - t = (uint64_t*) target4; - devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash4output[thr_id], 16, pdata[19])); - actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); + thrust::device_ptr devNoncePtr(heavy_nonceVector[thr_id]); + thrust::device_ptr devNoncePtrEnd((heavy_nonceVector[thr_id]) + throughput); + + ////// Compaction + uint64_t *t = (uint64_t*) target2; + devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash2output[thr_id], 8, pdata[19])); + actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); + if(actualNumberOfValuesInNonceVectorGPU == 0) + goto emptyNonceVector; + + keccak512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); + + ////// Compaction + t = (uint64_t*) target3; + devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash3output[thr_id], 16, pdata[19])); + actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); + if(actualNumberOfValuesInNonceVectorGPU == 0) + goto emptyNonceVector; + + blake512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); + + ////// Compaction + t = (uint64_t*) target5; + devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash5output[thr_id], 16, pdata[19])); + actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); + if(actualNumberOfValuesInNonceVectorGPU == 0) + goto emptyNonceVector; + + groestl512_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19]); + + ////// Compaction + t = (uint64_t*) target4; + devNoncePtrEnd = thrust::remove_if(devNoncePtr, devNoncePtrEnd, check_nonce_for_remove(*t, d_hash4output[thr_id], 16, pdata[19])); + actualNumberOfValuesInNonceVectorGPU = (uint32_t)(devNoncePtrEnd - devNoncePtr); #else - // todo - actualNumberOfValuesInNonceVectorGPU = 0; + // todo (nvlabs cub ?) 
+ actualNumberOfValuesInNonceVectorGPU = 0; #endif - if(actualNumberOfValuesInNonceVectorGPU == 0) - goto emptyNonceVector; - - // combine - combine_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19], hash); - - if (opt_tracegpu) { - applog(LOG_BLUE, "heavy GPU hash:"); - applog_hash((uchar*)hash); - } - - // Ergebnisse kopieren - if(actualNumberOfValuesInNonceVectorGPU > 0) - { - size_t size = sizeof(uint32_t) * actualNumberOfValuesInNonceVectorGPU; - CUDA_SAFE_CALL(cudaMemcpy(cpu_nonceVector, heavy_nonceVector[thr_id], size, cudaMemcpyDeviceToHost)); - cudaThreadSynchronize(); - - for (uint32_t i=0; i < actualNumberOfValuesInNonceVectorGPU; i++) - { - uint32_t nonce = cpu_nonceVector[i]; - uint32_t *foundhash = &hash[8*i]; - if (foundhash[7] <= ptarget[7] && fulltest(foundhash, ptarget)) { - uint32_t vhash[8]; - pdata[19] += nonce - pdata[19]; - heavycoin_hash((uchar*)vhash, (uchar*)pdata, blocklen); - if (memcmp(vhash, foundhash, 32)) { - gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", nonce); - } else { - *hashes_done = pdata[19] - first_nonce; - work_set_target_ratio(work, vhash); - rc = 1; - goto exit; - } - } - } - } + if(actualNumberOfValuesInNonceVectorGPU == 0) + goto emptyNonceVector; + + // combine + combine_cpu_hash(thr_id, actualNumberOfValuesInNonceVectorGPU, pdata[19], hash); + + if (opt_tracegpu) { + applog(LOG_BLUE, "heavy GPU hash:"); + applog_hash((uchar*)hash); + } + + // Ergebnisse kopieren + if(actualNumberOfValuesInNonceVectorGPU > 0) + { + size_t size = sizeof(uint32_t) * actualNumberOfValuesInNonceVectorGPU; + cudaMemcpy(cpu_nonceVector, heavy_nonceVector[thr_id], size, cudaMemcpyDeviceToHost); + + for (uint32_t i=0; i < actualNumberOfValuesInNonceVectorGPU; i++) + { + uint32_t nonce = cpu_nonceVector[i]; + uint32_t *foundhash = &hash[8*i]; + if (foundhash[7] <= ptarget[7] && fulltest(foundhash, ptarget)) { + uint32_t vhash[8]; + pdata[19] += nonce - pdata[19]; + heavycoin_hash((uchar*)vhash, (uchar*)pdata, blocklen); + if (memcmp(vhash, foundhash, 32)) { + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", nonce); + } else { + work_set_target_ratio(work, vhash); + rc = 1; + goto exit; + } + } + } + } emptyNonceVector: + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; - pdata[19] += throughput; - - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); - *hashes_done = pdata[19] - first_nonce; + } while (!work_restart[thr_id].restart); exit: - cudaFreeHost(cpu_nonceVector); - cudaFreeHost(hash); - return rc; + *hashes_done = pdata[19] - first_nonce; + + cudaFreeHost(cpu_nonceVector); + cudaFreeHost(hash); + CUDA_LOG_ERROR(); + + return rc; } // cleanup extern "C" void free_heavy(int thr_id) { - if (!init[thr_id]) - return; + if (!init[thr_id]) + return; - cudaThreadSynchronize(); + cudaThreadSynchronize(); - cudaFree(heavy_nonceVector[thr_id]); + cudaFree(heavy_nonceVector[thr_id]); - blake512_cpu_free(thr_id); - groestl512_cpu_free(thr_id); - hefty_cpu_free(thr_id); - keccak512_cpu_free(thr_id); - sha256_cpu_free(thr_id); - combine_cpu_free(thr_id); + blake512_cpu_free(thr_id); + groestl512_cpu_free(thr_id); + hefty_cpu_free(thr_id); + keccak512_cpu_free(thr_id); + sha256_cpu_free(thr_id); + combine_cpu_free(thr_id); - init[thr_id] = false; + init[thr_id] = false; - cudaDeviceSynchronize(); + cudaDeviceSynchronize(); } __host__ void heavycoin_hash(uchar* output, const uchar* input, int len) { - unsigned char hash1[32]; - 
unsigned char hash2[32]; - uint32_t hash3[16]; - uint32_t hash4[16]; - uint32_t hash5[16]; - uint32_t *final; - SHA256_CTX ctx; - sph_keccak512_context keccakCtx; - sph_groestl512_context groestlCtx; - sph_blake512_context blakeCtx; - - HEFTY1(input, len, hash1); - - /* HEFTY1 is new, so take an extra security measure to eliminate - * the possiblity of collisions: - * - * Hash(x) = SHA256(x + HEFTY1(x)) - * - * N.B. '+' is concatenation. - */ - SHA256_Init(&ctx); - SHA256_Update(&ctx, input, len); - SHA256_Update(&ctx, hash1, sizeof(hash1)); - SHA256_Final(hash2, &ctx); - - /* Additional security: Do not rely on a single cryptographic hash - * function. Instead, combine the outputs of 4 of the most secure - * cryptographic hash functions-- SHA256, KECCAK512, GROESTL512 - * and BLAKE512. - */ - - sph_keccak512_init(&keccakCtx); - sph_keccak512(&keccakCtx, input, len); - sph_keccak512(&keccakCtx, hash1, sizeof(hash1)); - sph_keccak512_close(&keccakCtx, (void *)&hash3); - - sph_groestl512_init(&groestlCtx); - sph_groestl512(&groestlCtx, input, len); - sph_groestl512(&groestlCtx, hash1, sizeof(hash1)); - sph_groestl512_close(&groestlCtx, (void *)&hash4); - - sph_blake512_init(&blakeCtx); - sph_blake512(&blakeCtx, input, len); - sph_blake512(&blakeCtx, (unsigned char *)&hash1, sizeof(hash1)); - sph_blake512_close(&blakeCtx, (void *)&hash5); - - final = (uint32_t *)output; - combine_hashes(final, (uint32_t *)hash2, hash3, hash4, hash5); + unsigned char hash1[32]; + unsigned char hash2[32]; + uint32_t hash3[16]; + uint32_t hash4[16]; + uint32_t hash5[16]; + uint32_t *final; + SHA256_CTX ctx; + sph_keccak512_context keccakCtx; + sph_groestl512_context groestlCtx; + sph_blake512_context blakeCtx; + + HEFTY1(input, len, hash1); + + /* HEFTY1 is new, so take an extra security measure to eliminate + * the possiblity of collisions: + * + * Hash(x) = SHA256(x + HEFTY1(x)) + * + * N.B. '+' is concatenation. + */ + SHA256_Init(&ctx); + SHA256_Update(&ctx, input, len); + SHA256_Update(&ctx, hash1, sizeof(hash1)); + SHA256_Final(hash2, &ctx); + + /* Additional security: Do not rely on a single cryptographic hash + * function. Instead, combine the outputs of 4 of the most secure + * cryptographic hash functions-- SHA256, KECCAK512, GROESTL512 + * and BLAKE512. 
+ */ + + sph_keccak512_init(&keccakCtx); + sph_keccak512(&keccakCtx, input, len); + sph_keccak512(&keccakCtx, hash1, sizeof(hash1)); + sph_keccak512_close(&keccakCtx, (void *)&hash3); + + sph_groestl512_init(&groestlCtx); + sph_groestl512(&groestlCtx, input, len); + sph_groestl512(&groestlCtx, hash1, sizeof(hash1)); + sph_groestl512_close(&groestlCtx, (void *)&hash4); + + sph_blake512_init(&blakeCtx); + sph_blake512(&blakeCtx, input, len); + sph_blake512(&blakeCtx, (unsigned char *)&hash1, sizeof(hash1)); + sph_blake512_close(&blakeCtx, (void *)&hash5); + + final = (uint32_t *)output; + combine_hashes(final, (uint32_t *)hash2, hash3, hash4, hash5); } diff --git a/lyra2/lyra2RE.cu b/lyra2/lyra2RE.cu index 05ffb52..d74bb16 100644 --- a/lyra2/lyra2RE.cu +++ b/lyra2/lyra2RE.cu @@ -122,14 +122,14 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, int order = 0; uint32_t foundNonce; - *hashes_done = pdata[19] - first_nonce + throughput; - blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); TRACE("S") + *hashes_done = pdata[19] - first_nonce + throughput; + foundNonce = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); if (foundNonce != UINT32_MAX) { @@ -162,10 +162,15 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, } } + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } pdata[19] += throughput; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + } while (!work_restart[thr_id].restart); + *hashes_done = pdata[19] - first_nonce; return 0; } diff --git a/lyra2/lyra2REv2.cu b/lyra2/lyra2REv2.cu index 67e2dc9..bcc39d6 100644 --- a/lyra2/lyra2REv2.cu +++ b/lyra2/lyra2REv2.cu @@ -96,14 +96,13 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc cudaDeviceReset(); // reduce cpu usage cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); } - CUDA_LOG_ERROR(); blake256_cpu_init(thr_id, throughput); keccak256_cpu_init(thr_id,throughput); skein256_cpu_init(thr_id, throughput); bmw256_cpu_init(thr_id, throughput); - CUDA_LOG_ERROR(); // SM 3 implentation requires a bit more memory if (device_sm[dev_id] < 500 || cuda_arch[dev_id] < 500) @@ -175,11 +174,15 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc } } + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } pdata[19] += throughput; - } while (!work_restart[thr_id].restart && (max_nonce > ((uint64_t)(pdata[19]) + throughput))); + } while (!work_restart[thr_id].restart); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce; return 0; } diff --git a/myriadgroestl.cpp b/myriadgroestl.cpp index f27d060..4311120 100644 --- a/myriadgroestl.cpp +++ b/myriadgroestl.cpp @@ -43,7 +43,7 @@ int scanhash_myriad(int thr_id, struct work *work, uint32_t max_nonce, unsigned uint32_t *outputHash = (uint32_t*)malloc(throughput * 64); if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; + ptarget[7] = 0x0000ff; // init if(!init[thr_id]) @@ -63,11 +63,11 @@ int scanhash_myriad(int thr_id, struct work *work, uint32_t max_nonce, unsigned // GPU uint32_t foundNounce = UINT32_MAX; - *hashes_done = pdata[19] - 
start_nonce + throughput; - myriadgroestl_cpu_hash(thr_id, throughput, pdata[19], outputHash, &foundNounce); - if (foundNounce < UINT32_MAX) + *hashes_done = pdata[19] - start_nonce + throughput; + + if (foundNounce < UINT32_MAX && bench_algo < 0) { uint32_t _ALIGN(64) vhash[8]; endiandata[19] = swab32(foundNounce); @@ -82,8 +82,7 @@ int scanhash_myriad(int thr_id, struct work *work, uint32_t max_nonce, unsigned } } - if ((uint64_t) pdata[19] + throughput > max_nonce) { - *hashes_done = pdata[19] - start_nonce; + if ((uint64_t) throughput + pdata[19] >= max_nonce) { pdata[19] = max_nonce; break; } @@ -91,6 +90,8 @@ int scanhash_myriad(int thr_id, struct work *work, uint32_t max_nonce, unsigned } while (!work_restart[thr_id].restart); + *hashes_done = max_nonce - start_nonce; + free(outputHash); return 0; } diff --git a/neoscrypt/neoscrypt.cpp b/neoscrypt/neoscrypt.cpp index 0e07845..b0cb1cb 100644 --- a/neoscrypt/neoscrypt.cpp +++ b/neoscrypt/neoscrypt.cpp @@ -79,11 +79,16 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign } } + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; - } while (!work_restart[thr_id].restart && (max_nonce > ((uint64_t)(pdata[19]) + throughput))); + } while (!work_restart[thr_id].restart); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce; return 0; } diff --git a/pentablake.cu b/pentablake.cu index 96eb2a1..2abdeb7 100644 --- a/pentablake.cu +++ b/pentablake.cu @@ -110,9 +110,14 @@ extern "C" int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_n } } + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + } while (!work_restart[thr_id].restart); return rc; } diff --git a/qubit/deep.cu b/qubit/deep.cu index afbe9fb..a5d4c63 100644 --- a/qubit/deep.cu +++ b/qubit/deep.cu @@ -117,9 +117,14 @@ extern "C" int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce, } } + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + } while (!work_restart[thr_id].restart); *hashes_done = pdata[19] - first_nonce + 1; return 0; diff --git a/qubit/luffa.cu b/qubit/luffa.cu index afbf545..db283cb 100644 --- a/qubit/luffa.cu +++ b/qubit/luffa.cu @@ -36,19 +36,21 @@ extern "C" int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce, uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; - uint32_t throughput = cuda_default_throughput(thr_id, 1U << 22); // 256*256*8*8 + uint32_t throughput = cuda_default_throughput(thr_id, 1U << 21); if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000f; + ptarget[7] = 0x0000f; if (!init[thr_id]) { cudaSetDevice(device_map[thr_id]); - CUDA_LOG_ERROR(); - //if (opt_cudaschedule == -1) // to reduce cpu usage... 
- // cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); - //CUDA_LOG_ERROR(); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput)); @@ -65,10 +67,9 @@ extern "C" int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce, cuda_check_cpu_setTarget(ptarget); do { - int order = 0; - *hashes_done = pdata[19] - first_nonce + throughput; + qubit_luffa512_cpu_hash_80(thr_id, (int) throughput, pdata[19], d_hash[thr_id], 0); - qubit_luffa512_cpu_hash_80(thr_id, (int) throughput, pdata[19], d_hash[thr_id], order++); + *hashes_done = pdata[19] - first_nonce + throughput; uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); if (foundNonce != UINT32_MAX) @@ -86,8 +87,8 @@ extern "C" int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce, } } - if ((uint64_t) throughput + pdata[19] > max_nonce) { - // pdata[19] = max_nonce; + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; break; } @@ -95,7 +96,7 @@ extern "C" int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce, } while (!work_restart[thr_id].restart); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce; return 0; } diff --git a/skein.cu b/skein.cu index f178a78..97a7c0b 100644 --- a/skein.cu +++ b/skein.cu @@ -453,9 +453,7 @@ extern "C" int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_no } } - if ((uint64_t) throughput + pdata[19] > max_nonce) { - //applog(LOG_DEBUG, "done... max=%u", max_nonce); - *hashes_done = pdata[19] - first_nonce; + if ((uint64_t) throughput + pdata[19] >= max_nonce) { pdata[19] = max_nonce; break; } @@ -464,6 +462,8 @@ extern "C" int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_no } while (!work_restart[thr_id].restart); + *hashes_done = pdata[19] - first_nonce; + return 0; } diff --git a/skein2.cpp b/skein2.cpp index b05ccb3..38c0a5b 100644 --- a/skein2.cpp +++ b/skein2.cpp @@ -110,8 +110,7 @@ int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned } } - if ((uint64_t) throughput + pdata[19] > max_nonce) { - *hashes_done = pdata[19] - first_nonce; + if ((uint64_t) throughput + pdata[19] >= max_nonce) { pdata[19] = max_nonce; break; } @@ -120,6 +119,8 @@ int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned } while (!work_restart[thr_id].restart); + *hashes_done = pdata[19] - first_nonce; + return 0; } diff --git a/x11/c11.cu b/x11/c11.cu index 0c7e25d..b48e85e 100644 --- a/x11/c11.cu +++ b/x11/c11.cu @@ -200,14 +200,20 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u } else { gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce); pdata[19] = foundNonce + 1; + continue; } } + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + } while (!work_restart[thr_id].restart); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce; return 0; } @@ -228,4 +234,4 @@ extern "C" void free_c11(int thr_id) init[thr_id] = false; cudaDeviceSynchronize(); -} \ No newline at end of file +} diff --git a/x11/fresh.cu b/x11/fresh.cu index 8e2b42a..5e3bf38 100644 --- a/x11/fresh.cu +++ b/x11/fresh.cu @@ -78,18 
+78,19 @@ extern "C" int scanhash_fresh(int thr_id, struct work* work, uint32_t max_nonce, if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x00ff; + ptarget[7] = 0x00ff; if (!init[thr_id]) { cudaSetDevice(device_map[thr_id]); + CUDA_LOG_ERROR(); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t)64 * throughput + 4), -1); x11_shavite512_cpu_init(thr_id, throughput); x11_simd512_cpu_init(thr_id, throughput); x11_echo512_cpu_init(thr_id, throughput); - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput + 4), 0); - cuda_check_cpu_init(thr_id, throughput); init[thr_id] = true; @@ -101,8 +102,6 @@ extern "C" int scanhash_fresh(int thr_id, struct work* work, uint32_t max_nonce, x11_shavite512_setBlock_80((void*)endiandata); cuda_check_cpu_setTarget(ptarget); do { - uint32_t Htarg = ptarget[7]; - uint32_t foundNonce; int order = 0; @@ -128,7 +127,7 @@ extern "C" int scanhash_fresh(int thr_id, struct work* work, uint32_t max_nonce, be32enc(&endiandata[19], foundNonce); fresh_hash(vhash64, endiandata); - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { + if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) { int res = 1; uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); work_set_target_ratio(work, vhash64); diff --git a/x11/s3.cu b/x11/s3.cu index 45609c8..97e6068 100644 --- a/x11/s3.cu +++ b/x11/s3.cu @@ -147,11 +147,16 @@ extern "C" int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, un } } + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + } while (!work_restart[thr_id].restart); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce; return 0; } diff --git a/x11/x11.cu b/x11/x11.cu index d2e93ca..367aa00 100644 --- a/x11/x11.cu +++ b/x11/x11.cu @@ -138,7 +138,7 @@ extern "C" int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, u if (x11_simd512_cpu_init(thr_id, throughput) != 0) { return 0; } - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 64 * throughput), 0); // why 64 ? 
+ CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); cuda_check_cpu_init(thr_id, throughput); @@ -205,14 +205,19 @@ extern "C" int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, u } else { gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce); pdata[19] = foundNonce + 1; + continue; } } + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } pdata[19] += throughput; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + } while (!work_restart[thr_id].restart); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce; return 0; } diff --git a/x13/x13.cu b/x13/x13.cu index faa73ab..39e0fe2 100644 --- a/x13/x13.cu +++ b/x13/x13.cu @@ -206,7 +206,7 @@ extern "C" int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, u } } - if ((uint64_t)pdata[19] + throughput > (uint64_t)max_nonce) { + if ((uint64_t)throughput + pdata[19] >= max_nonce) { pdata[19] = max_nonce; break; } @@ -214,7 +214,7 @@ extern "C" int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, u } while (!work_restart[thr_id].restart); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce; CUDA_LOG_ERROR(); diff --git a/x15/whirlpool.cu b/x15/whirlpool.cu index 31f6466..4fac950 100644 --- a/x15/whirlpool.cu +++ b/x15/whirlpool.cu @@ -83,12 +83,14 @@ extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, uint32_t foundNonce; int order = 0; + *hashes_done = pdata[19] - first_nonce + throughput; + whirlpool512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); foundNonce = whirlpool512_cpu_finalhash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - if (foundNonce != UINT32_MAX) + if (foundNonce != UINT32_MAX && bench_algo < 0) { const uint32_t Htarg = ptarget[7]; uint32_t vhash[8]; @@ -97,7 +99,6 @@ extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { int res = 1; - *hashes_done = pdata[19] - first_nonce + throughput; work_set_target_ratio(work, vhash); #if 0 uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); @@ -112,9 +113,13 @@ extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce); } } + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } pdata[19] += throughput; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + } while (!work_restart[thr_id].restart); *hashes_done = pdata[19] - first_nonce; return 0; diff --git a/x15/whirlpoolx.cu b/x15/whirlpoolx.cu index 4416512..517d5ca 100644 --- a/x15/whirlpoolx.cu +++ b/x15/whirlpoolx.cu @@ -49,12 +49,12 @@ extern "C" int scanhash_whirlx(int thr_id, struct work* work, uint32_t max_nonc if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x0000ff; + ptarget[7] = 0x000f; if (!init[thr_id]) { cudaSetDevice(device_map[thr_id]); - CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * 
throughput), -1); whirlpoolx_cpu_init(thr_id, throughput); @@ -70,15 +70,16 @@ extern "C" int scanhash_whirlx(int thr_id, struct work* work, uint32_t max_nonc do { uint32_t foundNonce = whirlpoolx_cpu_hash(thr_id, throughput, pdata[19]); - if (foundNonce != UINT32_MAX) + + *(hashes_done) = pdata[19] - first_nonce + throughput; + + if (foundNonce != UINT32_MAX && bench_algo < 0) { const uint32_t Htarg = ptarget[7]; uint32_t vhash64[8]; be32enc(&endiandata[19], foundNonce); whirlxHash(vhash64, endiandata); - *hashes_done = pdata[19] - first_nonce + throughput; - if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) { work_set_target_ratio(work, vhash64); pdata[19] = foundNonce; @@ -88,15 +89,16 @@ extern "C" int scanhash_whirlx(int thr_id, struct work* work, uint32_t max_nonc } } - pdata[19] += throughput; - - if (((uint64_t)pdata[19]+throughput) >= max_nonce) { + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; break; } + pdata[19] += throughput; + } while (!work_restart[thr_id].restart); - *(hashes_done) = pdata[19] - first_nonce + 1; + *(hashes_done) = pdata[19] - first_nonce; return 0; } diff --git a/x15/x14.cu b/x15/x14.cu index feaf6d4..d020f5c 100644 --- a/x15/x14.cu +++ b/x15/x14.cu @@ -219,13 +219,19 @@ extern "C" int scanhash_x14(int thr_id, struct work* work, uint32_t max_nonce, gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce); } } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + } while (!work_restart[thr_id].restart); CUDA_LOG_ERROR(); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce; return 0; } diff --git a/x15/x15.cu b/x15/x15.cu index d74dfa6..6d4799d 100644 --- a/x15/x15.cu +++ b/x15/x15.cu @@ -226,11 +226,16 @@ extern "C" int scanhash_x15(int thr_id, struct work* work, uint32_t max_nonce, } } + if ((uint64_t) throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; - } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + } while (!work_restart[thr_id].restart); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce; return 0; } diff --git a/x17/x17.cu b/x17/x17.cu index ebf2b4f..3be51d1 100644 --- a/x17/x17.cu +++ b/x17/x17.cu @@ -251,11 +251,16 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u } } + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + pdata[19] += throughput; } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce; return 0; }
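Reviewer note (not part of the patch): the change repeated across these scanhash loops replaces the old "(uint64_t) pdata[19] + throughput > max_nonce" exit test with an overflow-safe "(uint64_t) throughput + pdata[19] >= max_nonce" check that clamps pdata[19] to max_nonce before breaking, drops the "+ 1" off-by-one from *hashes_done, and reports *hashes_done once after the loop so restarts and range exhaustion are accounted for the same way. The added "bench_algo < 0" guards (and the early returns in cuda_checkhash.cu) simply skip CPU validation of found nonces while the built-in multi-algo benchmark is running. Below is a minimal, self-contained C sketch of the loop shape the patch converges on; scan_range_stub() and work_restarted_stub() are hypothetical stand-ins for the per-algo GPU kernel call and the work_restart flag, not ccminer APIs.

#include <stdint.h>
#include <stdbool.h>

/* Hypothetical stand-ins so the sketch compiles; the real code calls the
 * per-algo CUDA kernels and checks work_restart[thr_id].restart. */
static uint32_t scan_range_stub(uint32_t start_nonce, uint32_t throughput)
{
	(void) start_nonce; (void) throughput;
	return UINT32_MAX;               /* pretend nothing was found */
}
static bool work_restarted_stub(void) { return false; }

/* Sketch of the loop shape this patch applies in every scanhash_*(). */
int scanhash_sketch(uint32_t *pnonce, uint32_t max_nonce,
                    uint32_t throughput, int bench_algo,
                    unsigned long *hashes_done)
{
	const uint32_t first_nonce = *pnonce;

	do {
		uint32_t found = scan_range_stub(*pnonce, throughput);

		/* progress is counted per scanned chunk, before validation */
		*hashes_done = *pnonce - first_nonce + throughput;

		if (found != UINT32_MAX && bench_algo < 0) {
			/* CPU re-hash and fulltest() would go here; skipped while
			 * the benchmark is cycling algos (bench_algo >= 0) */
		}

		/* 64-bit sum avoids uint32 wrap-around; ">=" clamps the last chunk */
		if ((uint64_t) throughput + *pnonce >= max_nonce) {
			*pnonce = max_nonce;
			break;
		}
		*pnonce += throughput;
	} while (!work_restarted_stub());

	/* single final report, without the old "+ 1" */
	*hashes_done = *pnonce - first_nonce;
	return 0;
}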