From 34e97bf3e6b3bab935243d18997d6538f6a397bc Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Tue, 27 Sep 2016 00:05:17 +0200
Subject: [PATCH] Show intensity on init for all algos

---
 Algo256/blake256.cu     |  1 +
 Algo256/bmw.cu          |  6 ++++++
 Algo256/decred.cu       |  1 +
 Algo256/keccak256.cu    | 10 +++++++++-
 Algo256/vanilla.cu      | 12 +++++++-----
 JHA/jackpotcoin.cu      |  1 +
 ccminer.cpp             |  2 --
 configure.ac            |  2 +-
 fuguecoin.cpp           |  7 +++++++
 groestlcoin.cpp         |  2 ++
 heavy/heavy.cu          |  1 +
 lyra2/lyra2RE.cu        | 43 +++++++++++++++++++----------------------
 lyra2/lyra2REv2.cu      |  1 +
 myriadgroestl.cpp       |  2 ++
 neoscrypt/neoscrypt.cpp |  1 +
 pentablake.cu           |  1 +
 quark/quarkcoin.cu      |  1 +
 qubit/deep.cu           |  3 ++-
 qubit/luffa.cu          |  1 +
 qubit/qubit.cu          |  1 +
 scrypt-jane.cpp         |  3 ++-
 scrypt.cpp              |  3 ++-
 sia.cu                  |  1 +
 skein.cu                |  1 +
 skein2.cpp              |  1 +
 x11/c11.cu              |  1 +
 x11/fresh.cu            |  8 +++++++-
 x11/s3.cu               |  1 +
 x11/sib.cu              |  1 +
 x11/x11.cu              |  1 +
 x11/x11evo.cu           |  1 +
 x13/x13.cu              |  1 +
 x15/whirlpool.cu        |  2 ++
 x15/whirlpoolx.cu       |  1 +
 x15/x14.cu              |  1 +
 x15/x15.cu              |  1 +
 x17/x17.cu              |  1 +
 zr5.cu                  |  1 +
 38 files changed, 93 insertions(+), 36 deletions(-)

diff --git a/Algo256/blake256.cu b/Algo256/blake256.cu
index 8cd9035..6366bbb 100644
--- a/Algo256/blake256.cu
+++ b/Algo256/blake256.cu
@@ -504,6 +504,7 @@ extern "C" int scanhash_blake256(int thr_id, struct work* work, uint32_t max_non
 			cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
 			CUDA_LOG_ERROR();
 		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t)), -1);
 		CUDA_CALL_OR_RET_X(cudaMallocHost(&h_resNonce[thr_id], NBN * sizeof(uint32_t)), -1);
diff --git a/Algo256/bmw.cu b/Algo256/bmw.cu
index 2bef9ca..085fa49 100644
--- a/Algo256/bmw.cu
+++ b/Algo256/bmw.cu
@@ -52,6 +52,12 @@ extern "C" int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, u
 
 	if (!init[thr_id]) {
 		cudaSetDevice(device_map[thr_id]);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		cuda_check_cpu_init(thr_id, throughput);
 		bmw256_midstate_init(thr_id, throughput);
diff --git a/Algo256/decred.cu b/Algo256/decred.cu
index e0322ca..996af17 100644
--- a/Algo256/decred.cu
+++ b/Algo256/decred.cu
@@ -376,6 +376,7 @@ extern "C" int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce
 			cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
 			CUDA_LOG_ERROR();
 		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], maxResults*sizeof(uint32_t)), -1);
 		CUDA_CALL_OR_RET_X(cudaMallocHost(&h_resNonce[thr_id], maxResults*sizeof(uint32_t)), -1);
diff --git a/Algo256/keccak256.cu b/Algo256/keccak256.cu
index 4d5f315..c8c47ea 100644
--- a/Algo256/keccak256.cu
+++ b/Algo256/keccak256.cu
@@ -48,8 +48,16 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no
 	if (opt_benchmark)
 		ptarget[7] = 0x000f;
 
-	if (!init[thr_id]) {
+	if (!init[thr_id])
+	{
 		cudaSetDevice(device_map[thr_id]);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], throughput * 64));
 		keccak256_cpu_init(thr_id, throughput);
diff --git a/Algo256/vanilla.cu b/Algo256/vanilla.cu
index ddf77ee..e3a4aff 100644
--- a/Algo256/vanilla.cu
+++ b/Algo256/vanilla.cu
@@ -378,6 +378,11 @@ extern "C" int scanhash_vanilla(int thr_id, struct work* work, uint32_t max_nonc
 	const uint32_t targetHigh   = ptarget[6];
 	int dev_id = device_map[thr_id];
 
+	int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 30 : 24;
+	if (device_sm[dev_id] < 350) intensity = 22;
+	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
+	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
+
 	if (!init[thr_id]) {
 		cudaSetDevice(dev_id);
 		if (opt_cudaschedule == -1 && gpu_threads == 1) {
@@ -387,6 +392,8 @@ extern "C" int scanhash_vanilla(int thr_id, struct work* work, uint32_t max_nonc
 			cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
 			CUDA_LOG_ERROR();
 		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
+
 		CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonce[thr_id], NBN * sizeof(uint32_t)), -1);
 		CUDA_CALL_OR_RET_X(cudaMallocHost(&h_resNonce[thr_id], NBN * sizeof(uint32_t)), -1);
 		cudaStreamCreate(&streams[thr_id]);
@@ -402,11 +409,6 @@ extern "C" int scanhash_vanilla(int thr_id, struct work* work, uint32_t max_nonc
 
 	vanilla_cpu_setBlock_16(thr_id,endiandata,&pdata[16]);
 
-	int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 30 : 24;
-	if (device_sm[dev_id] < 350) intensity = 22;
-	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
-	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
-
 	const dim3 grid((throughput + (NPT*TPB)-1)/(NPT*TPB));
 	const dim3 block(TPB);
 	int rc = 0;
diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu
index d2872c5..10370ac 100644
--- a/JHA/jackpotcoin.cu
+++ b/JHA/jackpotcoin.cu
@@ -105,6 +105,7 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc
 			gpulog(LOG_ERR, thr_id, "Sorry, This algo is not supported by this GPU arch (SM 3.0 required)");
 			proper_exit(EXIT_CODE_CUDA_ERROR);
 		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
 
diff --git a/ccminer.cpp b/ccminer.cpp
index 5a2742b..0de1cf7 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -2981,8 +2981,6 @@ void parse_arg(int key, char *arg)
 					}
 					else if (gpus_intensity[n] != (1 << v)) {
 						gpus_intensity[n] = (1 << v);
-						applog(LOG_INFO, "Intensity set to %u, %u cuda threads",
-							v, gpus_intensity[n]);
 					}
 				}
 				last = gpus_intensity[n];
diff --git a/configure.ac b/configure.ac
index 20c549b..fec7bef 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([ccminer], [1.8.2], [], [ccminer], [http://github.com/tpruvot/ccminer])
+AC_INIT([ccminer], [1.8.3], [], [ccminer], [http://github.com/tpruvot/ccminer])
 
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
diff --git a/fuguecoin.cpp b/fuguecoin.cpp
index 9166774..1e36e67 100644
--- a/fuguecoin.cpp
+++ b/fuguecoin.cpp
@@ -40,6 +40,13 @@ int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigne
 	if(!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		fugue256_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
diff --git a/groestlcoin.cpp b/groestlcoin.cpp
index 96a6d38..a1b5136 100644
--- a/groestlcoin.cpp
+++ b/groestlcoin.cpp
@@ -48,6 +48,8 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			CUDA_LOG_ERROR();
 		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
+
 		CUDA_LOG_ERROR();
 		groestlcoin_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
diff --git a/heavy/heavy.cu b/heavy/heavy.cu
index 8782616..b22401f 100644
--- a/heavy/heavy.cu
+++ b/heavy/heavy.cu
@@ -178,6 +178,7 @@ int scanhash_heavy(int thr_id, struct work *work, uint32_t max_nonce, unsigned l
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			CUDA_LOG_ERROR();
 		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		hefty_cpu_init(thr_id, throughput);
 		sha256_cpu_init(thr_id, throughput);
diff --git a/lyra2/lyra2RE.cu b/lyra2/lyra2RE.cu
index 9cd9ccb..cd3ca6a 100644
--- a/lyra2/lyra2RE.cu
+++ b/lyra2/lyra2RE.cu
@@ -79,7 +79,7 @@ extern "C" void lyra2re_hash(void *state, const void *input)
 }
 
 static bool init[MAX_GPUS] = { 0 };
-static uint32_t throughput[MAX_GPUS] = { 0 };
+static __thread uint32_t throughput = 0;
 
 extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 {
@@ -99,9 +99,8 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
 
 		int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 17 : 16;
 		if (device_sm[device_map[thr_id]] == 500) intensity = 15;
-		int temp = intensity;
-		throughput[thr_id] = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4;
-		if (init[thr_id]) throughput[thr_id] = min(throughput[thr_id], max_nonce - first_nonce);
+		throughput = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4;
+		if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
 
 		cudaDeviceProp props;
 		cudaGetDeviceProperties(&props, dev_id);
@@ -109,25 +108,23 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
 		if (strstr(props.name, "750 Ti")) gtx750ti = true;
 		else gtx750ti = false;
 
-		blake256_cpu_init(thr_id, throughput[thr_id]);
-		keccak256_cpu_init(thr_id, throughput[thr_id]);
-		skein256_cpu_init(thr_id, throughput[thr_id]);
-		groestl256_cpu_init(thr_id, throughput[thr_id]);
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
+
+		blake256_cpu_init(thr_id, throughput);
+		keccak256_cpu_init(thr_id, throughput);
+		skein256_cpu_init(thr_id, throughput);
+		groestl256_cpu_init(thr_id, throughput);
 
 		if (device_sm[dev_id] >= 500)
 		{
 			size_t matrix_sz = device_sm[dev_id] > 500 ? sizeof(uint64_t) * 4 * 4 : sizeof(uint64_t) * 8 * 8 * 3 * 4;
-			CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput[thr_id]));
-			lyra2_cpu_init(thr_id, throughput[thr_id], d_matrix[thr_id]);
+			CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput));
+			lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]);
 		}
 
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput[thr_id]));
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));
 
 		init[thr_id] = true;
-		if (temp != intensity){
-			gpulog(LOG_INFO, thr_id, "Intensity set to %u, %u cuda threads",
-				intensity, throughput[thr_id]);
-		}
 	}
 
 	uint32_t _ALIGN(128) endiandata[20];
@@ -141,15 +138,15 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
 		int order = 0;
 		uint32_t foundNonce;
 
-		blake256_cpu_hash_80(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
-		keccak256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
-		lyra2_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], gtx750ti);
-		skein256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
+		blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti);
+		skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		TRACE("S")
 
-		*hashes_done = pdata[19] - first_nonce + throughput[thr_id];
+		*hashes_done = pdata[19] - first_nonce + throughput;
 
-		foundNonce = groestl256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
+		foundNonce = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		if (foundNonce != UINT32_MAX)
 		{
 			uint32_t _ALIGN(64) vhash64[8];
@@ -181,11 +178,11 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
 			}
 		}
 
-		if ((uint64_t)throughput[thr_id] + pdata[19] >= max_nonce) {
+		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
 			pdata[19] = max_nonce;
 			break;
 		}
-		pdata[19] += throughput[thr_id];
+		pdata[19] += throughput;
 
 	} while (!work_restart[thr_id].restart);
 
diff --git a/lyra2/lyra2REv2.cu b/lyra2/lyra2REv2.cu
index f7342f2..f6699ac 100644
--- a/lyra2/lyra2REv2.cu
+++ b/lyra2/lyra2REv2.cu
@@ -113,6 +113,7 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			CUDA_LOG_ERROR();
 		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		blake256_cpu_init(thr_id, throughput);
 		keccak256_cpu_init(thr_id,throughput);
diff --git a/myriadgroestl.cpp b/myriadgroestl.cpp
index 988c586..fdc6a2f 100644
--- a/myriadgroestl.cpp
+++ b/myriadgroestl.cpp
@@ -55,6 +55,8 @@ int scanhash_myriad(int thr_id, struct work *work, uint32_t max_nonce, unsigned
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			CUDA_LOG_ERROR();
 		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
+
 		myriadgroestl_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
 	}
diff --git a/neoscrypt/neoscrypt.cpp b/neoscrypt/neoscrypt.cpp
index fd38f23..41fdfa7 100644
--- a/neoscrypt/neoscrypt.cpp
+++ b/neoscrypt/neoscrypt.cpp
@@ -40,6 +40,7 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			cudaGetLastError(); // reset errors if device is not "reset"
 		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g (+5), %u cuda threads", throughput2intensity(throughput), throughput);
 
 		if (device_sm[dev_id] <= 300) {
 			gpulog(LOG_ERR, thr_id, "Sorry neoscrypt is not supported on SM 3.0 devices");
diff --git a/pentablake.cu b/pentablake.cu
index 97dde16..5a34bc4 100644
--- a/pentablake.cu
+++ b/pentablake.cu
@@ -69,6 +69,7 @@ extern "C" int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_n
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			CUDA_LOG_ERROR();
 		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
 
diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu
index 91a0e11..beea1d5 100644
--- a/quark/quarkcoin.cu
+++ b/quark/quarkcoin.cu
@@ -145,6 +145,7 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce,
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			CUDA_LOG_ERROR();
 		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		cudaGetLastError();
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
diff --git a/qubit/deep.cu b/qubit/deep.cu
index f884602..b681a47 100644
--- a/qubit/deep.cu
+++ b/qubit/deep.cu
@@ -66,8 +66,9 @@ extern "C" int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce,
 			cudaDeviceReset();
 			// reduce cpu usage
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
 		}
-		CUDA_LOG_ERROR();
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
 
diff --git a/qubit/luffa.cu b/qubit/luffa.cu
index db283cb..f92011d 100644
--- a/qubit/luffa.cu
+++ b/qubit/luffa.cu
@@ -51,6 +51,7 @@ extern "C" int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce,
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			CUDA_LOG_ERROR();
 		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
 
diff --git a/qubit/qubit.cu b/qubit/qubit.cu
index e29f8c5..927c93b 100644
--- a/qubit/qubit.cu
+++ b/qubit/qubit.cu
@@ -79,6 +79,7 @@ extern "C" int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce,
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			CUDA_LOG_ERROR();
 		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		qubit_luffa512_cpu_init(thr_id, throughput);
 		x11_cubehash512_cpu_init(thr_id, throughput);
diff --git a/scrypt-jane.cpp b/scrypt-jane.cpp
index 070092d..89e7472 100644
--- a/scrypt-jane.cpp
+++ b/scrypt-jane.cpp
@@ -489,8 +489,9 @@ int scanhash_scrypt_jane(int thr_id, struct work *work, uint32_t max_nonce, unsi
 		cudaDeviceSynchronize();
 		cudaDeviceReset();
 		cudaSetDevice(dev_id);
+
 		throughput = cuda_throughput(thr_id);
-		applog(LOG_INFO, "GPU #%d: cuda throughput is %d", dev_id, throughput);
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		init[thr_id] = true;
 	}
diff --git a/scrypt.cpp b/scrypt.cpp
index 5dd3f27..1d53d17 100644
--- a/scrypt.cpp
+++ b/scrypt.cpp
@@ -721,8 +721,9 @@ int scanhash_scrypt(int thr_id, struct work *work, uint32_t max_nonce, unsigned
 		cudaDeviceSynchronize();
 		cudaDeviceReset();
 		cudaSetDevice(dev_id);
+
 		throughput = cuda_throughput(thr_id);
-		applog(LOG_INFO, "GPU #%d: cuda throughput is %d", dev_id, throughput);
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		init[thr_id] = true;
 	}
diff --git a/sia.cu b/sia.cu
index 1a5b681..ee08e40 100644
--- a/sia.cu
+++ b/sia.cu
@@ -214,6 +214,7 @@ int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsigned lon
 			//cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
 			CUDA_LOG_ERROR();
 		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonces[thr_id], NBN * sizeof(uint32_t)), -1);
 		init[thr_id] = true;
diff --git a/skein.cu b/skein.cu
index f6b1619..12e08d7 100644
--- a/skein.cu
+++ b/skein.cu
@@ -378,6 +378,7 @@ extern "C" int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_no
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			CUDA_LOG_ERROR();
 		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		if (sm5) {
 			skeincoin_init(thr_id);
diff --git a/skein2.cpp b/skein2.cpp
index 8875042..a27e5dc 100644
--- a/skein2.cpp
+++ b/skein2.cpp
@@ -62,6 +62,7 @@ int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			CUDA_LOG_ERROR();
 		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput);
 
diff --git a/x11/c11.cu b/x11/c11.cu
index 5536cc8..048cd69 100644
--- a/x11/c11.cu
+++ b/x11/c11.cu
@@ -125,6 +125,7 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			CUDA_LOG_ERROR();
 		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_bmw512_cpu_init(thr_id, throughput);
diff --git a/x11/fresh.cu b/x11/fresh.cu
index 5e3bf38..9c426c6 100644
--- a/x11/fresh.cu
+++ b/x11/fresh.cu
@@ -83,7 +83,13 @@ extern "C" int scanhash_fresh(int thr_id, struct work* work, uint32_t max_nonce,
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
-		CUDA_LOG_ERROR();
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t)64 * throughput + 4), -1);
 
diff --git a/x11/s3.cu b/x11/s3.cu
index 84f0998..e2887bf 100644
--- a/x11/s3.cu
+++ b/x11/s3.cu
@@ -86,6 +86,7 @@ extern "C" int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, un
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			CUDA_LOG_ERROR();
 		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
 
diff --git a/x11/sib.cu b/x11/sib.cu
index 28794fb..ca7bb77 100644
--- a/x11/sib.cu
+++ b/x11/sib.cu
@@ -120,6 +120,7 @@ extern "C" int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, u
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			CUDA_LOG_ERROR();
 		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_bmw512_cpu_init(thr_id, throughput);
diff --git a/x11/x11.cu b/x11/x11.cu
index 2c56892..df7dd84 100644
--- a/x11/x11.cu
+++ b/x11/x11.cu
@@ -114,6 +114,7 @@ extern "C" int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, u
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			CUDA_LOG_ERROR();
 		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_bmw512_cpu_init(thr_id, throughput);
diff --git a/x11/x11evo.cu b/x11/x11evo.cu
index 289fe70..da208ac 100644
--- a/x11/x11evo.cu
+++ b/x11/x11evo.cu
@@ -257,6 +257,7 @@ extern "C" int scanhash_x11evo(int thr_id, struct work* work, uint32_t max_nonce
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			CUDA_LOG_ERROR();
 		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_bmw512_cpu_init(thr_id, throughput);
diff --git a/x13/x13.cu b/x13/x13.cu
index cf04d32..ad3d02a 100644
--- a/x13/x13.cu
+++ b/x13/x13.cu
@@ -133,6 +133,7 @@ extern "C" int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, u
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			CUDA_LOG_ERROR();
 		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_groestl512_cpu_init(thr_id, throughput);
diff --git a/x15/whirlpool.cu b/x15/whirlpool.cu
index 44a0f2b..2a490de 100644
--- a/x15/whirlpool.cu
+++ b/x15/whirlpool.cu
@@ -86,6 +86,8 @@ extern "C" int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce,
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			CUDA_LOG_ERROR();
 		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
+
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
 		x15_whirlpool_cpu_init(thr_id, throughput, 1 /* old whirlpool */);
 
diff --git a/x15/whirlpoolx.cu b/x15/whirlpoolx.cu
index 8622643..e9ec79b 100644
--- a/x15/whirlpoolx.cu
+++ b/x15/whirlpoolx.cu
@@ -59,6 +59,7 @@ extern "C" int scanhash_whirlx(int thr_id,  struct work* work, uint32_t max_nonc
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			CUDA_LOG_ERROR();
 		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), -1);
 
diff --git a/x15/x14.cu b/x15/x14.cu
index c2f6912..926f1e8 100644
--- a/x15/x14.cu
+++ b/x15/x14.cu
@@ -147,6 +147,7 @@ extern "C" int scanhash_x14(int thr_id,  struct work* work, uint32_t max_nonce,
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			CUDA_LOG_ERROR();
 		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_groestl512_cpu_init(thr_id, throughput);
diff --git a/x15/x15.cu b/x15/x15.cu
index 2ad2900..45d94fc 100644
--- a/x15/x15.cu
+++ b/x15/x15.cu
@@ -157,6 +157,7 @@ extern "C" int scanhash_x15(int thr_id,  struct work* work, uint32_t max_nonce,
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			CUDA_LOG_ERROR();
 		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_groestl512_cpu_init(thr_id, throughput);
diff --git a/x17/x17.cu b/x17/x17.cu
index fa2ab31..8b804e5 100644
--- a/x17/x17.cu
+++ b/x17/x17.cu
@@ -172,6 +172,7 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u
 			// reduce cpu usage
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_groestl512_cpu_init(thr_id, throughput);
diff --git a/zr5.cu b/zr5.cu
index cf86819..e5281c3 100644
--- a/zr5.cu
+++ b/zr5.cu
@@ -357,6 +357,7 @@ extern "C" int scanhash_zr5(int thr_id, struct work *work,
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			CUDA_LOG_ERROR();
 		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 
 		// constants
 		cudaMemcpyToSymbol(c_permut, permut, 24*4, 0, cudaMemcpyHostToDevice);