diff --git a/blake32.cu b/blake32.cu
index 2dc7235..e7aae76 100644
--- a/blake32.cu
+++ b/blake32.cu
@@ -17,7 +17,7 @@ extern "C" {
 /* threads per block and throughput (intensity) */
 #define TPB 128
 
-extern int opt_n_threads;
+extern int num_processors;
 
 /* added in sph_blake.c */
 extern "C" int blake256_rounds = 14;
@@ -416,8 +416,10 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 	}
 #endif
 
-	if (opt_benchmark)
+	if (opt_benchmark) {
 		targetHigh = 0x1ULL << 32;
+		((uint32_t*)ptarget)[6] = swab32(0xff);
+	}
 
 	if (opt_tracegpu) {
 		/* test call from util.c */
@@ -427,9 +429,8 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 	}
 
 	if (!init[thr_id]) {
-		if (opt_n_threads > 1) {
+		if (num_processors > 1)
 			CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
-		}
 		CUDA_SAFE_CALL(cudaMallocHost(&h_resNonce[thr_id], NBN * sizeof(uint32_t)));
 		CUDA_SAFE_CALL(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t)));
 		init[thr_id] = true;
@@ -489,8 +490,8 @@ extern "C" int scanhash_blake256(int thr_id, uint32_t *pdata, const uint32_t *pt
 				goto exit_scan;
 			}
 			else if (opt_debug) {
-				applog_hash((uint8_t*)ptarget);
-				applog_compare_hash((uint8_t*)vhashcpu,(uint8_t*)ptarget);
+				applog_hash((uchar*)ptarget);
+				applog_compare_hash((uchar*)vhashcpu, (uchar*)ptarget);
 				applog(LOG_DEBUG, "GPU #%d: result for nonce %08x does not validate on CPU!", thr_id, foundNonce);
 			}
 		}
diff --git a/ccminer.cpp b/ccminer.cpp
index cb2bd4a..68c6973 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -816,7 +816,7 @@ static bool get_work(struct thr_info *thr, struct work *work)
 
 	if (opt_benchmark) {
 		memset(work->data, 0x55, 76);
-		work->data[17] = swab32((uint32_t)time(NULL));
+		//work->data[17] = swab32((uint32_t)time(NULL));
 		memset(work->data + 19, 0x00, 52);
 		work->data[20] = 0x80000000;
 		work->data[31] = 0x00000280;
@@ -1004,11 +1004,11 @@ static void *miner_thread(void *userdata)
 	}
 
 	while (1) {
+		struct timeval tv_start, tv_end, diff;
 		unsigned long hashes_done;
 		uint32_t start_nonce;
-		struct timeval tv_start, tv_end, diff;
-		int64_t max64;
-		uint64_t umax64;
+		uint32_t scan_time = have_longpoll ? LP_SCANTIME : opt_scantime;
+		uint64_t max64, minmax = 0x100000;
 
 		// &work.data[19]
 		int wcmplen = 76;
@@ -1035,7 +1035,7 @@ static void *miner_thread(void *userdata)
 				stratum_gen_work(&stratum, &g_work);
 			}
 		} else {
-			int min_scantime = have_longpoll ? LP_SCANTIME : opt_scantime;
+			int min_scantime = scan_time;
 			/* obtain new work from internal workio thread */
 			pthread_mutex_lock(&g_work_lock);
 			if (time(NULL) - g_work_time >= min_scantime ||
@@ -1065,7 +1065,7 @@ static void *miner_thread(void *userdata)
 			goto continue_scan;
 		}
 
-		if (memcmp(work.target, g_work.target, sizeof(work.target))) {
+		if (!opt_benchmark && memcmp(work.target, g_work.target, sizeof(work.target))) {
 			calc_diff(&g_work, 0);
 			if (opt_debug) {
 				uint64_t target64 = g_work.target[7] * 0x100000000ULL + g_work.target[6];
@@ -1080,56 +1080,59 @@ static void *miner_thread(void *userdata)
 			}
 		}
 		if (memcmp(work.data, g_work.data, wcmplen)) {
+			#if 0
 			if (opt_debug) {
-#if 0
 				for (int n=0; n <= (wcmplen-8); n+=8) {
 					if (memcmp(work.data + n, g_work.data + n, 8)) {
 						applog(LOG_DEBUG, "job %s work updated at offset %d:", g_work.job_id, n);
-						applog_hash((uint8_t*) work.data + n);
-						applog_compare_hash((uint8_t*) g_work.data + n, (uint8_t*) work.data + n);
+						applog_hash((uchar*) &work.data[n]);
+						applog_compare_hash((uchar*) &g_work.data[n], (uchar*) &work.data[n]);
 					}
 				}
-#endif
 			}
+			#endif
 			memcpy(&work, &g_work, sizeof(struct work));
 			(*nonceptr) = (0xffffffffUL / opt_n_threads) * thr_id; // 0 if single thr
 		} else
 			(*nonceptr)++; //??
-		work_restart[thr_id].restart = 0;
 
-		if (opt_debug)
-			applog(LOG_DEBUG, "job %s %08x", g_work.job_id, (*nonceptr));
+		work_restart[thr_id].restart = 0;
 		pthread_mutex_unlock(&g_work_lock);
 
 		/* adjust max_nonce to meet target scan time */
 		if (have_stratum)
 			max64 = LP_SCANTIME;
 		else
-			max64 = g_work_time + (have_longpoll ? LP_SCANTIME : opt_scantime)
-			      - time(NULL);
+			max64 = max(1, scan_time + g_work_time - time(NULL));
 
-		max64 *= (int64_t)thr_hashrates[thr_id];
+		max64 *= (uint64_t)thr_hashrates[thr_id]; /* 64-bit cast: a rate >= 2^32 H/s must not be truncated */
 
-		if (max64 <= 0) {
-			/* should not be set too high,
-			   else you can miss multiple nounces */
+		/* at startup the hashrate is not measured yet, so max64 would be 0:
+		 * enforce a per-algo minimum scan range instead */
+		if (max64 < minmax) {
 			switch (opt_algo) {
 			case ALGO_BLAKECOIN:
-				max64 = 0x3ffffffLL;
+				minmax = 0x4000000;
 				break;
 			case ALGO_BLAKE:
 			case ALGO_DOOM:
 			case ALGO_JACKPOT:
 			case ALGO_KECCAK:
 			case ALGO_LUFFA_DOOM:
-				max64 = 0x1ffffffLL;
+				minmax = 0x2000000;
 				break;
-			default:
-				max64 = 0xfffffLL;
+			case ALGO_S3:
+			case ALGO_X11:
+			case ALGO_X13:
+				minmax = 0x400000;
 				break;
 			}
+			max64 = max(minmax-1, max64);
 		}
 
+		if (opt_debug)
+			applog(LOG_DEBUG, "GPU #%d: start=%08x range=%llx", device_map[thr_id], *nonceptr, max64);
+
 		start_nonce = *nonceptr;
 
 		/* do not recompute something already scanned */
@@ -1140,7 +1143,7 @@ static void *miner_thread(void *userdata)
 			} range;
 
 			range.data = hashlog_get_scan_range(work.job_id);
-			if (range.data) {
+			if (range.data && !opt_benchmark) {
 				bool stall = false;
 				if (range.scanned[0] == 1 && range.scanned[1] == 0xFFFFFFFFUL) {
 					applog(LOG_WARNING, "detected a rescan of fully scanned job!");
@@ -1168,11 +1171,14 @@ static void *miner_thread(void *userdata)
 			}
 		}
 
-		umax64 = (uint64_t) max64;
-		if ((umax64 + start_nonce) >= end_nonce)
+		if ((max64 + start_nonce) >= end_nonce)
 			max_nonce = end_nonce;
 		else
-			max_nonce = (uint32_t) umax64 + start_nonce;
+			max_nonce = (uint32_t) (max64 + start_nonce);
+
+		/* never leave a tiny unscanned range at the very end of the nonce space */
+		if (max_nonce >= UINT32_MAX - 256)
+			max_nonce = UINT32_MAX;
 
 		work.scanned_from = start_nonce;
 		(*nonceptr) = start_nonce;
@@ -1343,6 +1349,19 @@ continue_scan:
 			pthread_mutex_unlock(&stats_lock);
 		}
 
+		if (rc)
+			work.scanned_to = *nonceptr;
+		else {
+			work.scanned_to = max_nonce;
+			if (opt_debug && opt_benchmark) {
+				// nonce range debug; the 32-bit difference is widened to match %llx
+				applog(LOG_DEBUG, "GPU #%d:  ends=%08x range=%llx", device_map[thr_id],
+					*nonceptr, (unsigned long long) ((*nonceptr) - start_nonce));
+			}
+		}
+
+		hashlog_remember_scan_range(work.job_id, work.scanned_from, work.scanned_to);
+
 		/* output */
 		if (!opt_quiet && loopcnt) {
 			sprintf(s, thr_hashrates[thr_id] >= 1e6 ? "%.0f" : "%.2f",
@@ -1365,18 +1384,9 @@ continue_scan:
 			global_hashrate = llround(hashrate);
 		}
 
-		if (rc) {
-			work.scanned_to = *nonceptr;
-		} else {
-			work.scanned_to = max_nonce;
-		}
-
-		// could be used to store speeds too..
-		hashlog_remember_scan_range(work.job_id, work.scanned_from, work.scanned_to);
-
 		/* if nonce found, submit work */
-		if (rc) {
-			if (!opt_benchmark && !submit_work(mythr, &work))
+		if (rc && !opt_benchmark) {
+			if (!submit_work(mythr, &work))
 				break;
 		}
 
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index 4844f38..3a8ae58 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -172,7 +172,7 @@
       <MaxRegCount>80</MaxRegCount>
       <PtxAsOptionV>true</PtxAsOptionV>
       <Keep>false</Keep>
-      <CodeGeneration>compute_50,sm_50</CodeGeneration>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_50,sm_50;compute_52,sm_52</CodeGeneration>
       <AdditionalOptions>--ptxas-options="-O2" %(AdditionalOptions)</AdditionalOptions>
       <Defines>
       </Defines>
@@ -473,4 +473,4 @@
   <Target Name="AfterClean">
     <Delete Files="@(FilesToCopy->'$(OutDir)%(Filename)%(Extension)')" TreatErrorsAsWarnings="true" />
   </Target>
-</Project>
+</Project>
\ No newline at end of file
diff --git a/stats.cpp b/stats.cpp
index 771f73b..5ae0135 100644
--- a/stats.cpp
+++ b/stats.cpp
@@ -27,8 +27,8 @@ extern int device_map[8];
  */
 extern "C" void stats_remember_speed(int thr_id, uint32_t hashcount, double hashrate, uint8_t found)
 {
-	uint64_t thr = (0xff & thr_id);
-	uint64_t key = (thr << 56) + (uid++ % UINT_MAX);
+	uint64_t gpu = device_map[thr_id];
+	uint64_t key = (gpu << 56) + (uid++ % UINT_MAX);
 	stats_data data;
 	// to enough hashes to give right stats
 	if (hashcount < 1000 || hashrate < 0.01)
@@ -39,8 +39,8 @@ extern "C" void stats_remember_speed(int thr_id, uint32_t hashcount, double hash
 		return;
 
 	memset(&data, 0, sizeof(data));
-	data.gpu_id = device_map[thr_id];
-	data.thr_id = (uint8_t)thr;
+	data.gpu_id = (uint8_t)gpu;
+	data.thr_id = (uint8_t)thr_id;
 	data.tm_stat = (uint32_t) time(NULL);
 	data.hashcount = hashcount;
 	data.hashfound = found;
@@ -61,8 +61,8 @@ extern "C" void stats_remember_speed(int thr_id, uint32_t hashcount, double hash
  */
 extern "C" double stats_get_speed(int thr_id, double def_speed)
 {
-	uint64_t thr = (0xff & thr_id);
-	uint64_t keypfx = (thr << 56);
+	uint64_t gpu = device_map[thr_id];
+	uint64_t keypfx = (gpu << 56);
 	uint64_t keymsk = (0xffULL << 56);
 	double speed = 0.0;
 	int records = 0;
@@ -93,8 +93,8 @@ extern "C" double stats_get_speed(int thr_id, double def_speed)
 
 extern "C" int stats_get_history(int thr_id, struct stats_data *data, int max_records)
 {
-	uint64_t thr = (0xff & thr_id);
-	uint64_t keypfx = (thr << 56);
+	uint64_t gpu = device_map[thr_id];
+	uint64_t keypfx = (gpu << 56);
 	uint64_t keymsk = (0xffULL << 56);
 	double speed = 0.0;
 	int records = 0;
diff --git a/x11/x11.cu b/x11/x11.cu
index b48dde6..9baed10 100644
--- a/x11/x11.cu
+++ b/x11/x11.cu
@@ -135,13 +135,13 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata,
     unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];
-	static bool init[8] = {0,0,0,0,0,0,0,0};
+	static bool init[8] = { 0 };
 	int intensity = (device_sm[device_map[thr_id]] >= 500) ? 20 : 19;
 	int throughput = opt_work_size ? opt_work_size : (1 << intensity); // 20=256*256*16;
 	throughput = min(throughput, max_nonce - first_nonce);
 
 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x0000f;
+		((uint32_t*)ptarget)[7] = 0x000f;
 
 	if (!init[thr_id])
 	{
diff --git a/x13/x13.cu b/x13/x13.cu
index 4918e3a..31a55a5 100644
--- a/x13/x13.cu
+++ b/x13/x13.cu
@@ -154,20 +154,17 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
     unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];
-
-	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x0000ff;
-
-	const uint32_t Htarg = ptarget[7];
-
+	static bool init[8] = { 0 };
 	int throughput = opt_work_size ? opt_work_size : (1 << 19); // 256*256*8;
 	throughput = min(throughput, max_nonce - first_nonce);
 
-	static bool init[8] = {0,0,0,0,0,0,0,0};
+	if (opt_benchmark)
+		((uint32_t*)ptarget)[7] = 0x000f;
+
 	if (!init[thr_id])
 	{
 		CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput));
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput)); // 2 * 32 bytes per slot; size_t math avoids int overflow
 
 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_groestl512_cpu_init(thr_id, throughput);
@@ -216,11 +213,12 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
 		foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		if  (foundNonce != 0xffffffff)
 		{
+			const uint32_t Htarg = ptarget[7];
 			uint32_t vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
 			x13hash(vhash64, endiandata);
 
-			if( (vhash64[7]<=Htarg) && fulltest(vhash64, ptarget) ) {
+			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget) ) {
 				pdata[19] = foundNonce;
 				*hashes_done = foundNonce - first_nonce + 1;
 				return 1;
@@ -233,9 +231,13 @@ extern "C" int scanhash_x13(int thr_id, uint32_t *pdata,
 			}
 		}
 
+		if ((uint64_t)pdata[19] + throughput > (uint64_t)max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
 		pdata[19] += throughput;
 
-	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+	} while (!work_restart[thr_id].restart);
 
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
diff --git a/x15/x14.cu b/x15/x14.cu
index a0adc2f..99c62fd 100644
--- a/x15/x14.cu
+++ b/x15/x14.cu
@@ -165,14 +165,13 @@ extern "C" int scanhash_x14(int thr_id, uint32_t *pdata,
 	unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];
-	static bool init[8] = {0,0,0,0,0,0,0,0};
+	static bool init[8] = { 0 };
 	uint32_t endiandata[20];
-
 	int throughput = opt_work_size ? opt_work_size : (1 << 19); // 256*256*8;
 	throughput = min(throughput, max_nonce - first_nonce);
 
 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0xff;
+		((uint32_t*)ptarget)[7] = 0x000f;
 
 	if (!init[thr_id])
 	{
diff --git a/x15/x15.cu b/x15/x15.cu
index a424834..23f6aa8 100644
--- a/x15/x15.cu
+++ b/x15/x15.cu
@@ -174,7 +174,7 @@ extern "C" int scanhash_x15(int thr_id, uint32_t *pdata,
 	unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];
-	static bool init[8] = {0,0,0,0,0,0,0,0};
+	static bool init[8] = { 0 };
 	uint32_t endiandata[20];
 
 	int throughput = opt_work_size ? opt_work_size : (1 << 19); // 256*256*8;