From 355b835ae0daa34b6f0df4040b573a9b538ff6f2 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Fri, 16 Oct 2015 22:01:29 +0200
Subject: [PATCH] benchmark: enhance the mem leak detection

Reduce "false" warnings, and ignore unrelated or small leaks (<= 1 MB).

On Windows, GPU memory can also be allocated by other processes, which
can make the free memory drop without any leak in the benchmarked algo.

Also includes some cleanup in the algos (free functions, gpulog messages).
---
 Algo256/blake256.cu     |  2 +-
 Algo256/bmw.cu          |  2 +-
 Algo256/keccak256.cu    |  2 +-
 JHA/jackpotcoin.cu      |  2 +-
 bench.cpp               | 21 ++++++++++++++++-----
 cuda.cpp                |  4 ----
 cuda_groestlcoin.cu     |  2 +-
 cuda_nist5.cu           |  4 ++--
 fuguecoin.cpp           |  5 ++---
 groestlcoin.cpp         | 10 +++++-----
 heavy/heavy.cu          |  6 +++---
 myriadgroestl.cpp       |  5 ++---
 neoscrypt/neoscrypt.cpp |  4 ++--
 pentablake.cu           |  2 +-
 qubit/deep.cu           |  7 ++++---
 qubit/luffa.cu          |  6 +++---
 qubit/qubit.cu          |  4 ++--
 skein.cu                |  4 ++--
 skein2.cpp              |  4 ++--
 x11/c11.cu              |  4 ++--
 x11/fresh.cu            |  2 +-
 x11/s3.cu               |  6 +++---
 x11/x11.cu              |  6 +++---
 x13/x13.cu              |  4 ++--
 x15/whirlpool.cu        |  2 +-
 x15/whirlpoolx.cu       |  2 +-
 x15/x14.cu              |  2 +-
 x15/x15.cu              |  2 +-
 x17/x17.cu              |  2 +-
 zr5.cu                  |  2 +-
 30 files changed, 68 insertions(+), 62 deletions(-)

diff --git a/Algo256/blake256.cu b/Algo256/blake256.cu
index c3a3ea2..4a08410 100644
--- a/Algo256/blake256.cu
+++ b/Algo256/blake256.cu
@@ -474,7 +474,7 @@ extern "C" int scanhash_blake256(int thr_id, struct work* work, uint32_t max_non
 			else if (vhashcpu[7] > ptarget[7] && opt_debug) {
 				applog_hash((uchar*)ptarget);
 				applog_compare_hash((uchar*)vhashcpu, (uchar*)ptarget);
-				gpulog(LOG_WARNING, thr_id, "result for nonce %08x does not validate on CPU!", foundNonce);
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
 			}
 		}
 
diff --git a/Algo256/bmw.cu b/Algo256/bmw.cu
index 1cf06ea..c578d98 100644
--- a/Algo256/bmw.cu
+++ b/Algo256/bmw.cu
@@ -87,7 +87,7 @@ extern "C" int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, u
 				return 1;
 			}
 			else {
-				gpulog(LOG_WARNING, thr_id, "result for nonce %08x does not validate on CPU!", foundNonce);
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
 			}
 		}
 
diff --git a/Algo256/keccak256.cu b/Algo256/keccak256.cu
index 7922912..d2e3ec1 100644
--- a/Algo256/keccak256.cu
+++ b/Algo256/keccak256.cu
@@ -80,7 +80,7 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no
 				return 1;
 			}
 			else {
-				gpulog(LOG_WARNING, thr_id, "result for nonce %08x does not validate on CPU!", foundNonce);
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
 			}
 		}
 
diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu
index b441362..642b749 100644
--- a/JHA/jackpotcoin.cu
+++ b/JHA/jackpotcoin.cu
@@ -247,7 +247,7 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc
 				pdata[19] = foundNonce;
 				return res;
 			} else {
-				gpulog(LOG_WARNING, thr_id, "result for nonce %08x does not validate on CPU!", foundNonce);
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
 			}
 		}
 
diff --git a/bench.cpp b/bench.cpp
index 31944ae..1100d4b 100644
--- a/bench.cpp
+++ b/bench.cpp
@@ -4,11 +4,11 @@
  * 2015 - tpruvot@github
  */
 
+#include <unistd.h>
+
 #include "miner.h"
 #include "algos.h"
 
-#include <unistd.h>
-
 int bench_algo = -1;
 
 static double algo_hashrates[MAX_GPUS][ALGO_COUNT] = { 0 };
@@ -120,7 +120,13 @@ bool bench_algo_switch_next(int thr_id)
 	// free current algo memory and track mem usage
 	mused = cuda_available_memory(thr_id);
 	algo_free_all(thr_id);
+
+	// device can take some time to free
 	mfree = cuda_available_memory(thr_id);
+	if (device_mem_free[thr_id] > mfree) {
+		sleep(1);
+		mfree = cuda_available_memory(thr_id);
+	}
 
 	// we need to wait completion on all cards before the switch
 	if (opt_n_threads > 1) {
@@ -132,10 +138,15 @@ bool bench_algo_switch_next(int thr_id)
 	format_hashrate(hashrate, rate);
 	gpulog(LOG_NOTICE, thr_id, "%s hashrate = %s", algo_names[prev_algo], rate);
 
-	// check if there is memory leak
+	// ensure memory leak is still real after the barrier
 	if (device_mem_free[thr_id] > mfree) {
-		gpulog(LOG_WARNING, thr_id, "memory leak detected in %s ! %d MB free",
-			algo_names[prev_algo], mfree);
+		mfree = cuda_available_memory(thr_id);
+	}
+
+	// check if there is memory leak
+	if (device_mem_free[thr_id] - mfree > 1) {
+		gpulog(LOG_WARNING, thr_id, "possible %d MB memory leak in %s! %d MB free",
+			(device_mem_free[thr_id] - mfree), algo_names[prev_algo], mfree);
 		cuda_reset_device(thr_id, NULL); // force to free the leak
 		mfree = cuda_available_memory(thr_id);
 	}
diff --git a/cuda.cpp b/cuda.cpp
index b912cad..635d9df 100644
--- a/cuda.cpp
+++ b/cuda.cpp
@@ -4,10 +4,6 @@
 #include <unistd.h>
 #include <map>
 
-#ifndef _WIN32
-#include <unistd.h>
-#endif
-
 // include thrust
 #ifndef __cplusplus
 #include <thrust/version.h>
diff --git a/cuda_groestlcoin.cu b/cuda_groestlcoin.cu
index 4e4b728..eb4f833 100644
--- a/cuda_groestlcoin.cu
+++ b/cuda_groestlcoin.cu
@@ -99,7 +99,7 @@ void groestlcoin_cpu_init(int thr_id, uint32_t threads)
 	// to check if the binary supports SM3+
 	cuda_get_arch(thr_id);
 
-	cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
+	CUDA_SAFE_CALL(cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)));
 }
 
 __host__
diff --git a/cuda_nist5.cu b/cuda_nist5.cu
index a48171c..ef875d6 100644
--- a/cuda_nist5.cu
+++ b/cuda_nist5.cu
@@ -148,7 +148,7 @@ extern "C" int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce,
 				goto out;
 			}
 			else {
-				applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", device_map[thr_id], foundNonce);
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
 			}
 		}
 
@@ -171,7 +171,7 @@ extern "C" void free_nist5(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaSetDevice(device_map[thr_id]);
+	cudaThreadSynchronize();
 
 	cudaFree(d_hash[thr_id]);
 
diff --git a/fuguecoin.cpp b/fuguecoin.cpp
index aff0828..d7681c8 100644
--- a/fuguecoin.cpp
+++ b/fuguecoin.cpp
@@ -82,8 +82,7 @@ int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, unsigne
 				*hashes_done = foundNounce - start_nonce + 1;
 				return 1;
 			} else {
-				applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!",
-					device_map[thr_id], foundNounce);
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNounce);
 			}
 		}
 
@@ -106,7 +105,7 @@ void free_fugue256(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaSetDevice(device_map[thr_id]);
+	cudaThreadSynchronize();
 
 	fugue256_cpu_free(thr_id);
 
diff --git a/groestlcoin.cpp b/groestlcoin.cpp
index a4cb26d..c3f803a 100644
--- a/groestlcoin.cpp
+++ b/groestlcoin.cpp
@@ -36,14 +36,15 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi
 	uint32_t throughput = cuda_default_throughput(thr_id, 1 << 19); // 256*256*8
 	if (init[thr_id]) throughput = min(throughput, max_nonce - start_nonce);
 
-	uint32_t *outputHash = (uint32_t*)malloc(throughput * 64);
+	uint32_t *outputHash = (uint32_t*)malloc((size_t) 64* throughput);
 
 	if (opt_benchmark)
-		((uint32_t*)ptarget)[7] = 0x000000ff;
+		ptarget[7] = 0x000ff;
 
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
+		CUDA_LOG_ERROR();
 		groestlcoin_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
 	}
@@ -73,8 +74,7 @@ int scanhash_groestlcoin(int thr_id, struct work *work, uint32_t max_nonce, unsi
 				free(outputHash);
 				return true;
 			} else {
-				applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!",
-					device_map[thr_id], foundNounce);
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNounce);
 			}
 		}
 
@@ -97,7 +97,7 @@ void free_groestlcoin(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaSetDevice(device_map[thr_id]);
+	cudaThreadSynchronize();
 
 	groestlcoin_cpu_free(thr_id);
 	init[thr_id] = false;
diff --git a/heavy/heavy.cu b/heavy/heavy.cu
index 4454ec6..07f706f 100644
--- a/heavy/heavy.cu
+++ b/heavy/heavy.cu
@@ -275,8 +275,8 @@ int scanhash_heavy(int thr_id, struct work *work, uint32_t max_nonce, unsigned l
                     uint32_t vhash[8];
                     pdata[19] += nonce - pdata[19];
                     heavycoin_hash((uchar*)vhash, (uchar*)pdata, blocklen);
-                    if (memcmp(vhash, foundhash, 8*sizeof(uint32_t))) {
-                        applog(LOG_ERR, "hash for nonce %08x does not validate on CPU!\n", nonce);
+                    if (memcmp(vhash, foundhash, 32)) {
+                        gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", nonce);
                     } else {
                         *hashes_done = pdata[19] - first_nonce;
                         work_set_target_ratio(work, vhash);
@@ -306,7 +306,7 @@ extern "C" void free_heavy(int thr_id)
     if (!init[thr_id])
         return;
 
-    cudaSetDevice(device_map[thr_id]);
+    cudaThreadSynchronize();
 
     cudaFree(heavy_nonceVector[thr_id]);
 
diff --git a/myriadgroestl.cpp b/myriadgroestl.cpp
index 0eea601..f27d060 100644
--- a/myriadgroestl.cpp
+++ b/myriadgroestl.cpp
@@ -78,8 +78,7 @@ int scanhash_myriad(int thr_id, struct work *work, uint32_t max_nonce, unsigned
 				free(outputHash);
 				return 1;
 			} else {
-				applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!",
-					device_map[thr_id], foundNounce);
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNounce);
 			}
 		}
 
@@ -102,7 +101,7 @@ void free_myriad(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaSetDevice(device_map[thr_id]);
+	cudaThreadSynchronize();
 
 	myriadgroestl_cpu_free(thr_id);
 	init[thr_id] = false;
diff --git a/neoscrypt/neoscrypt.cpp b/neoscrypt/neoscrypt.cpp
index c111cfc..0e07845 100644
--- a/neoscrypt/neoscrypt.cpp
+++ b/neoscrypt/neoscrypt.cpp
@@ -75,7 +75,7 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign
 				pdata[19] = foundNonce;
 				return 1;
 			} else {
-				applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", dev_id, foundNonce);
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
 			}
 		}
 
@@ -93,7 +93,7 @@ void free_neoscrypt(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaSetDevice(device_map[thr_id]);
+	cudaThreadSynchronize();
 
 	neoscrypt_cpu_free(thr_id);
 	init[thr_id] = false;
diff --git a/pentablake.cu b/pentablake.cu
index b0d8132..062b270 100644
--- a/pentablake.cu
+++ b/pentablake.cu
@@ -430,7 +430,7 @@ extern "C" int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_n
 				pdata[19] = foundNonce;
 				return rc;
 			} else {
-				gpulog(LOG_WARNING, thr_id, "result for nonce %08x does not validate on CPU!", foundNonce);
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
 			}
 		}
 
diff --git a/qubit/deep.cu b/qubit/deep.cu
index 32df85c..e30bd2e 100644
--- a/qubit/deep.cu
+++ b/qubit/deep.cu
@@ -67,8 +67,9 @@ extern "C" int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce,
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
+		CUDA_LOG_ERROR();
 
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], throughput * 64));
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
 
 		qubit_luffa512_cpu_init(thr_id, throughput);
 		x11_cubehash512_cpu_init(thr_id, throughput);
@@ -117,7 +118,7 @@ extern "C" int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce,
 				return res;
 			}
 			else {
-				applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", device_map[thr_id], foundNonce);
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
 			}
 		}
 
@@ -135,7 +136,7 @@ extern "C" void free_deep(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaSetDevice(device_map[thr_id]);
+	cudaThreadSynchronize();
 
 	cudaFree(d_hash[thr_id]);
 
diff --git a/qubit/luffa.cu b/qubit/luffa.cu
index f03efaf..afbf545 100644
--- a/qubit/luffa.cu
+++ b/qubit/luffa.cu
@@ -48,7 +48,7 @@ extern "C" int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce,
 		CUDA_LOG_ERROR();
 		//if (opt_cudaschedule == -1) // to reduce cpu usage...
 		//	cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
-		CUDA_LOG_ERROR();
+		//CUDA_LOG_ERROR();
 
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
 
@@ -82,7 +82,7 @@ extern "C" int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce,
 				pdata[19] = foundNonce;
 				return 1;
 			} else {
-				applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", device_map[thr_id], foundNonce);
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
 			}
 		}
 
@@ -111,6 +111,6 @@ extern "C" void free_luffa(int thr_id)
 
 	cuda_check_cpu_free(thr_id);
 
-	cudaDeviceSynchronize();
 	init[thr_id] = false;
+	cudaDeviceSynchronize();
 }
diff --git a/qubit/qubit.cu b/qubit/qubit.cu
index 1a94a54..629adad 100644
--- a/qubit/qubit.cu
+++ b/qubit/qubit.cu
@@ -160,7 +160,7 @@ extern "C" void free_qubit(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaSetDevice(device_map[thr_id]);
+	cudaThreadSynchronize();
 
 	cudaFree(d_hash[thr_id]);
 
@@ -170,4 +170,4 @@ extern "C" void free_qubit(int thr_id)
 	init[thr_id] = false;
 
 	cudaDeviceSynchronize();
-}
\ No newline at end of file
+}
diff --git a/skein.cu b/skein.cu
index 9efcd6d..f178a78 100644
--- a/skein.cu
+++ b/skein.cu
@@ -449,7 +449,7 @@ extern "C" int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_no
 				return res;
 			}
 			else {
-				applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", device_map[thr_id], foundNonce);
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
 			}
 		}
 
@@ -473,7 +473,7 @@ extern "C" void free_skeincoin(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaSetDevice(device_map[thr_id]);
+	cudaThreadSynchronize();
 
 	if (sm5)
 		skeincoin_free(thr_id);
diff --git a/skein2.cpp b/skein2.cpp
index 86f5fc3..b05ccb3 100644
--- a/skein2.cpp
+++ b/skein2.cpp
@@ -106,7 +106,7 @@ int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned
 				pdata[19] = swab32(foundNonce);
 				return res;
 			} else {
-				applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", dev_id, foundNonce);
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
 			}
 		}
 
@@ -129,7 +129,7 @@ void free_skein2(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaSetDevice(device_map[thr_id]);
+	cudaThreadSynchronize();
 
 	cudaFree(d_hash[thr_id]);
 
diff --git a/x11/c11.cu b/x11/c11.cu
index f514cdb..5db4cb6 100644
--- a/x11/c11.cu
+++ b/x11/c11.cu
@@ -235,7 +235,7 @@ extern "C" int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, u
 				pdata[19] = foundNonce;
 				return res;
 			} else {
-				applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce);
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
 				pdata[19] = foundNonce + 1;
 			}
 		}
@@ -254,7 +254,7 @@ extern "C" void free_c11(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaSetDevice(device_map[thr_id]);
+	cudaThreadSynchronize();
 
 	cudaFree(d_hash[thr_id]);
 	quark_groestl512_cpu_free(thr_id);
diff --git a/x11/fresh.cu b/x11/fresh.cu
index 8ef4fe7..8e2b42a 100644
--- a/x11/fresh.cu
+++ b/x11/fresh.cu
@@ -143,7 +143,7 @@ extern "C" int scanhash_fresh(int thr_id, struct work* work, uint32_t max_nonce,
 				pdata[19] = foundNonce;
 				return res;
 			} else {
-				applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce);
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
 			}
 		}
 
diff --git a/x11/s3.cu b/x11/s3.cu
index ddc6382..d8d11e4 100644
--- a/x11/s3.cu
+++ b/x11/s3.cu
@@ -126,7 +126,7 @@ extern "C" int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, un
 				return res;
 
 			} else {
-				applog(LOG_WARNING, "GPU #%d: result for nonce $%08X does not validate on CPU!", device_map[thr_id], foundNonce);
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
 			}
 		}
 
@@ -144,7 +144,7 @@ extern "C" void free_s3(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaSetDevice(device_map[thr_id]);
+	cudaThreadSynchronize();
 
 	cudaFree(d_hash[thr_id]);
 	x11_simd512_cpu_free(thr_id);
@@ -153,4 +153,4 @@ extern "C" void free_s3(int thr_id)
 	init[thr_id] = false;
 
 	cudaDeviceSynchronize();
-}
\ No newline at end of file
+}
diff --git a/x11/x11.cu b/x11/x11.cu
index bffed09..ee33cc1 100644
--- a/x11/x11.cu
+++ b/x11/x11.cu
@@ -233,7 +233,7 @@ extern "C" int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, u
 				pdata[19] = foundNonce;
 				return res;
 			} else {
-				applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce);
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
 				pdata[19] = foundNonce + 1;
 			}
 		}
@@ -252,7 +252,7 @@ extern "C" void free_x11(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaSetDevice(device_map[thr_id]);
+	cudaThreadSynchronize();
 
 	cudaFree(d_hash[thr_id]);
 
@@ -263,4 +263,4 @@ extern "C" void free_x11(int thr_id)
 	init[thr_id] = false;
 
 	cudaDeviceSynchronize();
-}
\ No newline at end of file
+}
diff --git a/x13/x13.cu b/x13/x13.cu
index cfdc3fb..5859029 100644
--- a/x13/x13.cu
+++ b/x13/x13.cu
@@ -234,7 +234,7 @@ extern "C" int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, u
 				}
 				return res;
 			} else {
-				applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce);
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
 			}
 		}
 
@@ -272,4 +272,4 @@ extern "C" void free_x13(int thr_id)
 
 	cudaDeviceSynchronize();
 	init[thr_id] = false;
-}
\ No newline at end of file
+}
diff --git a/x15/whirlpool.cu b/x15/whirlpool.cu
index 99d0d33..31f6466 100644
--- a/x15/whirlpool.cu
+++ b/x15/whirlpool.cu
@@ -126,7 +126,7 @@ extern "C" void free_whirl(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaSetDevice(device_map[thr_id]);
+	cudaThreadSynchronize();
 
 	cudaFree(d_hash[thr_id]);
 
diff --git a/x15/whirlpoolx.cu b/x15/whirlpoolx.cu
index b7b356f..4416512 100644
--- a/x15/whirlpoolx.cu
+++ b/x15/whirlpoolx.cu
@@ -107,7 +107,7 @@ extern "C" void free_whirlx(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaSetDevice(device_map[thr_id]);
+	cudaThreadSynchronize();
 
 	cudaFree(d_hash[thr_id]);
 
diff --git a/x15/x14.cu b/x15/x14.cu
index 559d03f..2696c4d 100644
--- a/x15/x14.cu
+++ b/x15/x14.cu
@@ -248,7 +248,7 @@ extern "C" int scanhash_x14(int thr_id,  struct work* work, uint32_t max_nonce,
 				pdata[19] = foundNonce;
 				return res;
 			} else {
-				applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce);
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
 			}
 		}
 		pdata[19] += throughput;
diff --git a/x15/x15.cu b/x15/x15.cu
index ba72208..5f9eab5 100644
--- a/x15/x15.cu
+++ b/x15/x15.cu
@@ -254,7 +254,7 @@ extern "C" int scanhash_x15(int thr_id,  struct work* work, uint32_t max_nonce,
 				pdata[19] = foundNonce;
 				return res;
 			} else {
-				applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce);
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
 			}
 		}
 
diff --git a/x17/x17.cu b/x17/x17.cu
index 9bb40bd..a5198df 100644
--- a/x17/x17.cu
+++ b/x17/x17.cu
@@ -279,7 +279,7 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u
 				pdata[19] = foundNonce;
 				return res;
 			} else {
-				applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce);
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
 			}
 		}
 
diff --git a/zr5.cu b/zr5.cu
index d591f1d..8f6d821 100644
--- a/zr5.cu
+++ b/zr5.cu
@@ -481,7 +481,7 @@ extern "C" void free_zr5(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaSetDevice(device_map[thr_id]);
+	cudaThreadSynchronize();
 
 	cudaFree(d_hash[thr_id]);