diff --git a/Algo256/blake256.cu b/Algo256/blake256.cu
index ad426e3..4ecec43 100644
--- a/Algo256/blake256.cu
+++ b/Algo256/blake256.cu
@@ -452,14 +452,16 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 			if (vhashcpu[6] <= Htarg && fulltest(vhashcpu, ptarget))
 			{
 				rc = 1;
-				*hashes_done = pdata[19] - first_nonce + throughput;
 				pdata[19] = foundNonce;
+				*hashes_done = pdata[19] - first_nonce + 1;
 #if NBN > 1
 				if (extra_results[0] != UINT32_MAX) {
 					be32enc(&endiandata[19], extra_results[0]);
 					blake256hash(vhashcpu, endiandata, blakerounds);
 					if (vhashcpu[6] <= Htarg /* && fulltest(vhashcpu, ptarget) */) {
 						pdata[21] = extra_results[0];
+						applog(LOG_BLUE, "1:%x 2:%x", foundNonce, extra_results[0]);
+						*hashes_done = max(*hashes_done, extra_results[0] - first_nonce + 1);
 						rc = 2;
 					}
 					extra_results[0] = UINT32_MAX;
@@ -477,7 +479,6 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 		}
 
 		if ((uint64_t) pdata[19] + throughput > (uint64_t) max_nonce) {
-			pdata[19] = max_nonce;
 			break;
 		}
 
@@ -485,6 +486,7 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 
 	} while (!work_restart[thr_id].restart);
 
-	*hashes_done = pdata[19] - first_nonce + 1; // (+1 to prevent locks)
+	*hashes_done = pdata[19] - first_nonce;
+
 	return rc;
 }
diff --git a/Algo256/keccak256.cu b/Algo256/keccak256.cu
index ac6f4d0..59a1d9b 100644
--- a/Algo256/keccak256.cu
+++ b/Algo256/keccak256.cu
@@ -41,8 +41,8 @@ extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata,
 	unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];
-	int throughput = opt_work_size ? opt_work_size : (1 << 21); // 256*256*8*4
-	throughput = min(throughput, (int)(max_nonce - first_nonce));
+	uint32_t throughput = opt_work_size ? opt_work_size : (1 << 21); // 256*256*8*4
+	throughput = min(throughput, (max_nonce - first_nonce));
 
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0005;
@@ -51,7 +51,7 @@ extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata,
 		cudaSetDevice(device_map[thr_id]);
 
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput));
-		keccak256_cpu_init(thr_id, throughput);
+		keccak256_cpu_init(thr_id, (int) throughput);
 
 		init[thr_id] = true;
 	}
@@ -65,7 +65,7 @@ extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata,
 	do {
 		int order = 0;
 
-		uint32_t foundNonce = keccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		uint32_t foundNonce = keccak256_cpu_hash_80(thr_id, (int) throughput, pdata[19], d_hash[thr_id], order++);
 		if (foundNonce != UINT32_MAX)
 		{
 			uint32_t Htarg = ptarget[7];
@@ -83,8 +83,7 @@ extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata,
 			}
 		}
 
-		if ((uint64_t) pdata[19] + throughput > (uint64_t) max_nonce) {
-			pdata[19] = max_nonce;
+		if ((uint64_t) pdata[19] + throughput > max_nonce) {
 			break;
 		}
 
@@ -92,6 +91,6 @@ extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata,
 
 	} while (!work_restart[thr_id].restart);
 
-	*hashes_done = pdata[19] - first_nonce + 1;
+	*hashes_done = pdata[19] - first_nonce;
 	return 0;
 }
diff --git a/ccminer.cpp b/ccminer.cpp
index 7392886..b0f2b9a 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -484,6 +484,7 @@ static bool work_decode(const json_t *val, struct work *work)
 
 /**
  * Calculate the work difficulty as double
+ * Not sure it works with pools
  */
 static void calc_diff(struct work *work, int known)
 {
@@ -548,26 +549,24 @@ static int share_result(int result, const char *reason)
 static bool submit_upstream_work(CURL *curl, struct work *work)
 {
 	json_t *val, *res, *reason;
+	bool stale_work = false;
 	char s[384];
-	bool stale_work;
 
-	pthread_mutex_lock(&g_work_lock);
-	if (strlen(work->job_id + 8)) {
-		/* stale if not the current job id */
-		stale_work = strcmp(work->job_id + 8, g_work.job_id + 8);
-	} else {
-		/* fallback when no job id (compare hash) */
-		stale_work = memcmp(&work->data[1], &g_work.data[1], 32);
+	/* discard if a new block was sent */
+	stale_work = work->height != g_work.height;
+	if (have_stratum && !stale_work) {
+		pthread_mutex_lock(&g_work_lock);
+		if (strlen(work->job_id + 8))
+			stale_work = strcmp(work->job_id + 8, g_work.job_id + 8);
+		pthread_mutex_unlock(&g_work_lock);
 	}
 
 	if (stale_work) {
-		pthread_mutex_unlock(&g_work_lock);
 		if (opt_debug)
 			applog(LOG_WARNING, "stale work detected, discarding");
 		return true;
 	}
 	calc_diff(work, 0);
-	pthread_mutex_unlock(&g_work_lock);
 
 	if (have_stratum) {
 		uint32_t sent;
@@ -1049,14 +1048,15 @@ static void *miner_thread(void *userdata)
 				stratum_gen_work(&stratum, &g_work);
 			}
 		} else {
-			int min_scantime = scan_time;
-			/* obtain new work from internal workio thread */
 			pthread_mutex_lock(&g_work_lock);
-			if (time(NULL) - g_work_time >= min_scantime || nonceptr[0] >= end_nonce) {
+			if ((time(NULL) - g_work_time) >= scan_time || nonceptr[0] >= (end_nonce - 0x100)) {
+				if (opt_debug && g_work_time && !opt_quiet)
+					applog(LOG_DEBUG, "work time %u/%us nonce %x/%x", time(NULL) - g_work_time,
+						scan_time, nonceptr[0], end_nonce);
+				/* obtain new work from internal workio thread */
 				if (unlikely(!get_work(mythr, &g_work))) {
-					applog(LOG_ERR, "work retrieval failed, exiting "
-						"mining thread %d", mythr->id);
 					pthread_mutex_unlock(&g_work_lock);
+					applog(LOG_ERR, "work retrieval failed, exiting mining thread %d", mythr->id);
 					goto out;
 				}
 				g_work_time = time(NULL);
@@ -1065,6 +1065,8 @@ static void *miner_thread(void *userdata)
 
 		if (!opt_benchmark && memcmp(work.target, g_work.target, sizeof(work.target))) {
 			calc_diff(&g_work, 0);
+			if (!have_stratum)
+				global_diff = g_work.difficulty;
 			if (opt_debug) {
 				uint64_t target64 = g_work.target[7] * 0x100000000ULL + g_work.target[6];
 				applog(LOG_DEBUG, "job %s target change: %llx (%.1f)", g_work.job_id, target64, g_work.difficulty);
@@ -1175,9 +1177,6 @@ static void *miner_thread(void *userdata)
 			}
 		}
 #endif
-		if (opt_algo == ALGO_KECCAK && max64 == UINT32_MAX) {
-			max64 = 0x7FFFFFFFUL;
-		}
 		/* never let small ranges at end */
 		if (end_nonce >= UINT32_MAX - 256)
 			end_nonce = UINT32_MAX;
@@ -1345,6 +1344,7 @@ static void *miner_thread(void *userdata)
 		timeval_subtract(&diff, &tv_end, &tv_start);
 
 		if (diff.tv_usec || diff.tv_sec) {
+			double dtime = (double) diff.tv_sec + 1e-6 * diff.tv_usec;
 
 			/* hashrate factors for some algos */
 			double rate_factor = 1.0;
@@ -1357,13 +1357,13 @@ static void *miner_thread(void *userdata)
 			}
 
 			/* store thread hashrate */
-			pthread_mutex_lock(&stats_lock);
-			if (diff.tv_sec + 1e-6 * diff.tv_usec > 0.0) {
-				thr_hashrates[thr_id] = hashes_done / (diff.tv_sec + 1e-6 * diff.tv_usec);
+			if (dtime > 0.0) {
+				pthread_mutex_lock(&stats_lock);
+				thr_hashrates[thr_id] = hashes_done / dtime;
 				thr_hashrates[thr_id] *= rate_factor;
 				stats_remember_speed(thr_id, hashes_done, thr_hashrates[thr_id], (uint8_t) rc, work.height);
+				pthread_mutex_unlock(&stats_lock);
 			}
-			pthread_mutex_unlock(&stats_lock);
 		}
 
 		if (rc > 1)
@@ -1392,8 +1392,10 @@ static void *miner_thread(void *userdata)
 		/* loopcnt: ignore first loop hashrate */
 		if (loopcnt && thr_id == (opt_n_threads - 1)) {
 			double hashrate = 0.;
+			pthread_mutex_lock(&stats_lock);
 			for (int i = 0; i < opt_n_threads && thr_hashrates[i]; i++)
 				hashrate += stats_get_speed(i, thr_hashrates[i]);
+			pthread_mutex_unlock(&stats_lock);
 			if (opt_benchmark) {
 				sprintf(s, hashrate >= 1e6 ? "%.0f" : "%.2f", hashrate / 1000.);
 				applog(LOG_NOTICE, "Total: %s kH/s", s);
diff --git a/miner.h b/miner.h
index 0a4219e..81833cc 100644
--- a/miner.h
+++ b/miner.h
@@ -505,7 +505,11 @@ extern long  device_sm[8];
 #define CL_CY2  "\x1B[22;36m" /* cyan */
 #define CL_SIL  "\x1B[22;37m" /* gray */
 
+#ifdef WIN32
 #define CL_GRY  "\x1B[01;30m" /* dark gray */
+#else
+#define CL_GRY  "\x1B[90m"    /* dark gray selectable in putty */
+#endif
 #define CL_LRD  "\x1B[01;31m" /* light red */
 #define CL_LGR  "\x1B[01;32m" /* light green */
 #define CL_LYL  "\x1B[01;33m" /* tooltips */
diff --git a/qubit/doom.cu b/qubit/doom.cu
index e390a7f..df30b04 100644
--- a/qubit/doom.cu
+++ b/qubit/doom.cu
@@ -40,8 +40,8 @@ extern "C" int scanhash_doom(int thr_id, uint32_t *pdata,
 {
 	const uint32_t first_nonce = pdata[19];
 	uint32_t endiandata[20];
-	int throughput = opt_work_size ? opt_work_size : (1 << 22); // 256*256*8*8
-	throughput = min(throughput, (int)(max_nonce - first_nonce));
+	uint32_t throughput = opt_work_size ? opt_work_size : (1 << 22); // 256*256*8*8
+	throughput = min(throughput, (max_nonce - first_nonce));
 
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000f;
@@ -52,7 +52,7 @@ extern "C" int scanhash_doom(int thr_id, uint32_t *pdata,
 
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput));
 
-		qubit_luffa512_cpu_init(thr_id, throughput);
+		qubit_luffa512_cpu_init(thr_id, (int) throughput);
 
 		init[thr_id] = true;
 	}
@@ -65,7 +65,7 @@ extern "C" int scanhash_doom(int thr_id, uint32_t *pdata,
 	do {
 		int order = 0;
 
-		uint32_t foundNonce = qubit_luffa512_cpu_finalhash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		uint32_t foundNonce = qubit_luffa512_cpu_finalhash_80(thr_id, (int) throughput, pdata[19], d_hash[thr_id], order++);
 		if (foundNonce != UINT32_MAX)
 		{
 			const uint32_t Htarg = ptarget[7];
@@ -74,23 +74,22 @@ extern "C" int scanhash_doom(int thr_id, uint32_t *pdata,
 			doomhash(vhash64, endiandata);
 
 			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
-				int res = 1;
-				*hashes_done = pdata[19] - first_nonce + throughput;
+				*hashes_done = min(max_nonce - first_nonce, (uint64_t) pdata[19] - first_nonce + throughput);
 				pdata[19] = foundNonce;
-				return res;
+				return 1;
 			}
 			else {
 				applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce);
 			}
 		}
 
-		pdata[19] += throughput;
-
 		if ((uint64_t) pdata[19] + throughput > max_nonce) {
-			pdata[19] = max_nonce;
+			// pdata[19] = max_nonce;
 			break;
 		}
 
+		pdata[19] += throughput;
+
 	} while (!work_restart[thr_id].restart);
 
 	*hashes_done = pdata[19] - first_nonce + 1;