From 9dfa757dc7c14c64554bb5e14fb9d1673a3ac823 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Mon, 12 Oct 2015 04:49:12 +0200
Subject: [PATCH] warn on cuda errors + various small changes

The full benchmark can now be launched with "ccminer --benchmark"

add a new helper function which log a warning with last cuda error
(not shown with the quiet option) : CUDA_LOG_ERROR();
it can be used where miner.h is included (.c/.cpp/.cu)

fix x14 (in ccminer.cpp), a break was missing in switch..case
---
 JHA/jackpotcoin.cu        | 132 ++++++++++++++++++++------------------
 README.txt                |   2 +-
 api.cpp                   |   4 ++
 bench.cpp                 |  56 ++++++++++++++--
 ccminer.cpp               |  59 ++++-------------
 cuda.cpp                  |  11 +++-
 lyra2/lyra2REv2.cu        |   3 +-
 miner.h                   |   5 +-
 neoscrypt/neoscrypt.cpp   |   2 +
 qubit/luffa.cu            |  10 +--
 scrypt-jane.cpp           |   3 +
 x13/x13.cu                |  10 ++-
 x15/cuda_x15_whirlpool.cu |  12 ++--
 x15/x14.cu                |  21 ++++--
 x15/x15.cu                |   8 +--
 x17/x17.cu                |   8 ++-
 16 files changed, 198 insertions(+), 148 deletions(-)

diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu
index 4e811df..6d605e4 100644
--- a/JHA/jackpotcoin.cu
+++ b/JHA/jackpotcoin.cu
@@ -10,7 +10,13 @@ extern "C"
 #include "miner.h"
 #include "cuda_helper.h"
 
-static uint32_t *d_hash[MAX_GPUS];
+static uint32_t *d_hash[MAX_GPUS] = { 0 };
+
+// Speicher zur Generierung der Noncevektoren für die bedingten Hashes
+static uint32_t *d_jackpotNonces[MAX_GPUS] = { 0 };
+static uint32_t *d_branch1Nonces[MAX_GPUS] = { 0 };
+static uint32_t *d_branch2Nonces[MAX_GPUS] = { 0 };
+static uint32_t *d_branch3Nonces[MAX_GPUS] = { 0 };
 
 extern void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads);
 extern void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen);
@@ -31,60 +37,54 @@ extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t st
 
 extern void jackpot_compactTest_cpu_init(int thr_id, uint32_t threads);
 extern void jackpot_compactTest_cpu_free(int thr_id);
-extern void jackpot_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable, 
-											uint32_t *d_nonces1, uint32_t *nrm1,
-											uint32_t *d_nonces2, uint32_t *nrm2,
-											int order);
+extern void jackpot_compactTest_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_validNonceTable,
+                                            uint32_t *d_nonces1, uint32_t *nrm1, uint32_t *d_nonces2, uint32_t *nrm2, int order);
 
 extern uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
 
-// Speicher zur Generierung der Noncevektoren für die bedingten Hashes
-static uint32_t *d_jackpotNonces[MAX_GPUS];
-static uint32_t *d_branch1Nonces[MAX_GPUS];
-static uint32_t *d_branch2Nonces[MAX_GPUS];
-static uint32_t *d_branch3Nonces[MAX_GPUS];
-
 // Original jackpothash Funktion aus einem miner Quelltext
 extern "C" unsigned int jackpothash(void *state, const void *input)
 {
-    sph_blake512_context     ctx_blake;
-    sph_groestl512_context   ctx_groestl;
-    sph_jh512_context        ctx_jh;
-    sph_keccak512_context    ctx_keccak;
-    sph_skein512_context     ctx_skein;
-
-    uint32_t hash[16];
-
-    sph_keccak512_init(&ctx_keccak);
-    sph_keccak512 (&ctx_keccak, input, 80);
-    sph_keccak512_close(&ctx_keccak, hash);
-
-    unsigned int round;
-    for (round = 0; round < 3; round++) {
-        if (hash[0] & 0x01) {
-           sph_groestl512_init(&ctx_groestl);
-           sph_groestl512 (&ctx_groestl, (&hash), 64);
-           sph_groestl512_close(&ctx_groestl, (&hash));
-        }
-        else {
-           sph_skein512_init(&ctx_skein);
-           sph_skein512 (&ctx_skein, (&hash), 64);
-           sph_skein512_close(&ctx_skein, (&hash));
-        }
-        if (hash[0] & 0x01) {
-           sph_blake512_init(&ctx_blake);
-           sph_blake512 (&ctx_blake, (&hash), 64);
-           sph_blake512_close(&ctx_blake, (&hash));
-        }
-        else {
-           sph_jh512_init(&ctx_jh);
-           sph_jh512 (&ctx_jh, (&hash), 64);
-           sph_jh512_close(&ctx_jh, (&hash));
-        }
-    }
-    memcpy(state, hash, 32);
-
-    return round;
+	uint32_t hash[16];
+	unsigned int rnd;
+
+	sph_blake512_context     ctx_blake;
+	sph_groestl512_context   ctx_groestl;
+	sph_jh512_context        ctx_jh;
+	sph_keccak512_context    ctx_keccak;
+	sph_skein512_context     ctx_skein;
+
+	sph_keccak512_init(&ctx_keccak);
+	sph_keccak512 (&ctx_keccak, input, 80);
+	sph_keccak512_close(&ctx_keccak, hash);
+
+	for (rnd = 0; rnd < 3; rnd++)
+	{
+		if (hash[0] & 0x01) {
+			sph_groestl512_init(&ctx_groestl);
+			sph_groestl512 (&ctx_groestl, (&hash), 64);
+			sph_groestl512_close(&ctx_groestl, (&hash));
+		}
+		else {
+			sph_skein512_init(&ctx_skein);
+			sph_skein512 (&ctx_skein, (&hash), 64);
+			sph_skein512_close(&ctx_skein, (&hash));
+		}
+
+		if (hash[0] & 0x01) {
+			sph_blake512_init(&ctx_blake);
+			sph_blake512 (&ctx_blake, (&hash), 64);
+			sph_blake512_close(&ctx_blake, (&hash));
+		}
+		else {
+			sph_jh512_init(&ctx_jh);
+			sph_jh512 (&ctx_jh, (&hash), 64);
+			sph_jh512_close(&ctx_jh, (&hash));
+		}
+	}
+	memcpy(state, hash, 32);
+
+	return rnd;
 }
 
 static bool init[MAX_GPUS] = { 0 };
@@ -106,7 +106,7 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc
 	{
 		cudaSetDevice(device_map[thr_id]);
 
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput));
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
 
 		jackpot_keccak512_cpu_init(thr_id, throughput);
 		jackpot_compactTest_cpu_init(thr_id, throughput);
@@ -117,11 +117,11 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc
 
 		cuda_check_cpu_init(thr_id, throughput);
 
-		cudaMalloc(&d_branch1Nonces[thr_id], sizeof(uint32_t)*throughput*2);
-		cudaMalloc(&d_branch2Nonces[thr_id], sizeof(uint32_t)*throughput*2);
-		cudaMalloc(&d_branch3Nonces[thr_id], sizeof(uint32_t)*throughput*2);
+		cudaMalloc(&d_branch1Nonces[thr_id], (size_t) sizeof(uint32_t)*throughput*2);
+		cudaMalloc(&d_branch2Nonces[thr_id], (size_t) sizeof(uint32_t)*throughput*2);
+		cudaMalloc(&d_branch3Nonces[thr_id], (size_t) sizeof(uint32_t)*throughput*2);
 
-		CUDA_SAFE_CALL(cudaMalloc(&d_jackpotNonces[thr_id], sizeof(uint32_t)*throughput*2));
+		CUDA_SAFE_CALL(cudaMalloc(&d_jackpotNonces[thr_id], (size_t) sizeof(uint32_t)*throughput*2));
 
 		init[thr_id] = true;
 	}
@@ -211,19 +211,22 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc
 
 		*hashes_done = pdata[19] - first_nonce + throughput;
 
+		CUDA_LOG_ERROR();
+
 		uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
-		if  (foundNonce != 0xffffffff)
+		if (foundNonce != UINT32_MAX)
 		{
 			uint32_t vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
 
-			// diese jackpothash Funktion gibt die Zahl der Runden zurück
+			// jackpothash function gibt die Zahl der Runden zurück
 			jackpothash(vhash64, endiandata);
 
 			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
 				int res = 1;
-				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
 				work_set_target_ratio(work, vhash64);
+#if 0
+				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
 				if (secNonce != 0) {
 					be32enc(&endiandata[19], secNonce);
 					nist5hash(vhash64, endiandata);
@@ -232,6 +235,7 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc
 					pdata[21] = secNonce;
 					res++;
 				}
+#endif
 				pdata[19] = foundNonce;
 				return res;
 			} else {
@@ -250,6 +254,8 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc
 
 	} while (!work_restart[thr_id].restart);
 
+	CUDA_LOG_ERROR();
+
 	return 0;
 }
 
@@ -259,9 +265,7 @@ extern "C" void free_jackpot(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaSetDevice(device_map[thr_id]);
-
-	cudaFree(d_hash[thr_id]);
+	cudaThreadSynchronize();
 
 	cudaFree(d_branch1Nonces[thr_id]);
 	cudaFree(d_branch2Nonces[thr_id]);
@@ -271,8 +275,12 @@ extern "C" void free_jackpot(int thr_id)
 	quark_groestl512_cpu_free(thr_id);
 	jackpot_compactTest_cpu_free(thr_id);
 
+	cudaFree(d_hash[thr_id]);
+
 	cuda_check_cpu_free(thr_id);
-	init[thr_id] = false;
+	CUDA_LOG_ERROR();
 
 	cudaDeviceSynchronize();
-}
\ No newline at end of file
+
+	init[thr_id] = false;
+}
diff --git a/README.txt b/README.txt
index a2a2a1a..8eac428 100644
--- a/README.txt
+++ b/README.txt
@@ -231,7 +231,7 @@ features.
   Under Dev...    v1.7
                   Restore whirlpool algo (and whirlcoin variant)
                   Prepare algo switch ability
-                  Add --benchmark -a auto to run a multi algo benchmark
+                  Add --benchmark -a all to run a benchmark for all algos
                   Add --cuda-schedule parameter
                   Add --show-diff parameter, which display shares diff,
                     and is able to detect real solved blocks on pools.
diff --git a/api.cpp b/api.cpp
index c9e28ba..9ad0c71 100644
--- a/api.cpp
+++ b/api.cpp
@@ -1002,5 +1002,9 @@ void api_set_throughput(int thr_id, uint32_t throughput)
 		if (i && (1U << i) < throughput) {
 			cgpu->intensity += ((float) (throughput-(1U << i)) / (1U << i));
 		}
+
+		// to display in bench results
+		if (opt_benchmark)
+			bench_set_throughput(thr_id, throughput);
 	}
 }
diff --git a/bench.cpp b/bench.cpp
index 87712bd..98330f4 100644
--- a/bench.cpp
+++ b/bench.cpp
@@ -21,7 +21,7 @@ static pthread_barrier_t algo_barr;
 static pthread_mutex_t bench_lock = PTHREAD_MUTEX_INITIALIZER;
 
 extern double thr_hashrates[MAX_GPUS];
-extern enum sha_algos opt_algo;
+extern volatile enum sha_algos opt_algo;
 
 void bench_init(int threads)
 {
@@ -41,6 +41,48 @@ void bench_free()
 	pthread_barrier_destroy(&algo_barr);
 }
 
+// required to switch algos
+void algo_free_all(int thr_id)
+{
+	// only initialized algos will be freed
+	free_blake256(thr_id);
+	free_bmw(thr_id);
+	free_c11(thr_id);
+	free_deep(thr_id);
+	free_keccak256(thr_id);
+	free_fresh(thr_id);
+	free_fugue256(thr_id);
+	free_groestlcoin(thr_id);
+	free_heavy(thr_id);
+	free_jackpot(thr_id);
+	free_luffa(thr_id);
+	free_lyra2(thr_id);
+	free_lyra2v2(thr_id);
+	free_myriad(thr_id);
+	free_neoscrypt(thr_id);
+	free_nist5(thr_id);
+	free_pentablake(thr_id);
+	free_quark(thr_id);
+	free_qubit(thr_id);
+	free_skeincoin(thr_id);
+	free_skein2(thr_id);
+	free_s3(thr_id);
+	free_whirl(thr_id);
+	free_whirlx(thr_id);
+	free_x11(thr_id);
+	free_x13(thr_id);
+	free_x14(thr_id);
+	free_x15(thr_id);
+	free_x17(thr_id);
+	free_zr5(thr_id);
+	//free_sha256d(thr_id);
+	free_scrypt(thr_id);
+	free_scrypt_jane(thr_id);
+
+	// warn on cuda error
+	CUDA_LOG_ERROR();
+}
+
 // benchmark all algos (called once per mining thread)
 bool bench_algo_switch_next(int thr_id)
 {
@@ -60,6 +102,11 @@ bool bench_algo_switch_next(int thr_id)
 	if (algo == ALGO_SCRYPT) algo++;
 	if (algo == ALGO_SCRYPT_JANE) algo++;
 
+	// free current algo memory and track mem usage
+	mused = cuda_available_memory(thr_id);
+	algo_free_all(thr_id);
+	mfree = cuda_available_memory(thr_id);
+
 	// we need to wait completion on all cards before the switch
 	if (opt_n_threads > 1) {
 		pthread_barrier_wait(&miner_barr);
@@ -70,15 +117,12 @@ bool bench_algo_switch_next(int thr_id)
 	format_hashrate(hashrate, rate);
 	gpulog(LOG_NOTICE, thr_id, "%s hashrate = %s", algo_names[prev_algo], rate);
 
-	// free current algo memory and track mem usage
-	mused = cuda_available_memory(thr_id);
-	miner_free_device(thr_id);
-	mfree = cuda_available_memory(thr_id);
-
 	// check if there is memory leak
 	if (device_mem_free[thr_id] > mfree) {
 		gpulog(LOG_WARNING, thr_id, "memory leak detected in %s ! %d MB free",
 			algo_names[prev_algo], mfree);
+		cuda_reset_device(thr_id, NULL); // force to free the leak
+		mfree = cuda_available_memory(thr_id);
 	}
 	// store used memory per algo
 	algo_mem_used[thr_id][opt_algo] = device_mem_free[thr_id] - mused;
diff --git a/ccminer.cpp b/ccminer.cpp
index 7ff4d24..b0df5f3 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -108,7 +108,7 @@ int opt_timeout = 300; // curl
 int opt_scantime = 10;
 static json_t *opt_config;
 static const bool opt_time = true;
-enum sha_algos opt_algo = ALGO_X11;
+volatile enum sha_algos opt_algo = ALGO_AUTO;
 int opt_n_threads = 0;
 int gpu_threads = 1;
 int64_t opt_affinity = -1L;
@@ -1435,49 +1435,6 @@ static bool wanna_mine(int thr_id)
 	return state;
 }
 
-// required to switch algos
-void miner_free_device(int thr_id)
-{
-	// todo: some kind of algo "registration"
-	// to call automatically if needed
-	free_blake256(thr_id);
-	free_bmw(thr_id);
-	free_c11(thr_id);
-	free_deep(thr_id);
-	free_keccak256(thr_id);
-	free_fresh(thr_id);
-	free_fugue256(thr_id);
-	free_groestlcoin(thr_id);
-	free_heavy(thr_id);
-	free_jackpot(thr_id);
-	free_luffa(thr_id);
-	free_lyra2(thr_id);
-	free_lyra2v2(thr_id);
-	free_myriad(thr_id);
-	free_neoscrypt(thr_id);
-	free_nist5(thr_id);
-	free_pentablake(thr_id);
-	free_quark(thr_id);
-	free_qubit(thr_id);
-	free_skeincoin(thr_id);
-	free_skein2(thr_id);
-	free_s3(thr_id);
-	free_whirl(thr_id);
-	free_whirlx(thr_id);
-	free_x11(thr_id);
-	free_x13(thr_id);
-	free_x14(thr_id);
-	free_x15(thr_id);
-	free_x17(thr_id);
-	free_zr5(thr_id);
-	//free_sha256d(thr_id);
-	free_scrypt(thr_id);
-	free_scrypt_jane(thr_id);
-
-	// reset remains of error..
-	cudaGetLastError();
-}
-
 static void *miner_thread(void *userdata)
 {
 	struct thr_info *mythr = (struct thr_info *)userdata;
@@ -1634,7 +1591,7 @@ static void *miner_thread(void *userdata)
 
 		pthread_mutex_unlock(&g_work_lock);
 
-		// --benchmark [-a auto]
+		// --benchmark [-a all]
 		if (opt_benchmark && bench_algo >= 0) {
 			//gpulog(LOG_DEBUG, thr_id, "loop %d", loopcnt);
 			if (loopcnt >= 3) {
@@ -1756,6 +1713,7 @@ static void *miner_thread(void *userdata)
 				break;
 			case ALGO_KECCAK:
 			case ALGO_JACKPOT:
+			case ALGO_X14:
 			case ALGO_X15:
 				minmax = 0x300000;
 				break;
@@ -1800,8 +1758,10 @@ static void *miner_thread(void *userdata)
 		hashes_done = 0;
 		gettimeofday(&tv_start, NULL);
 
-
-		cudaGetLastError(); // reset previous errors
+		// check (and reset) previous errors
+		cudaError_t err = cudaGetLastError();
+		if (err != cudaSuccess && !opt_quiet)
+			gpulog(LOG_WARNING, thr_id, "%s", cudaGetErrorString(err));
 
 		/* scan nonces for a proof-of-work hash */
 		switch (opt_algo) {
@@ -1904,6 +1864,7 @@ static void *miner_thread(void *userdata)
 			break;
 		case ALGO_X14:
 			rc = scanhash_x14(thr_id, &work, max_nonce, &hashes_done);
+			break;
 		case ALGO_X15:
 			rc = scanhash_x15(thr_id, &work, max_nonce, &hashes_done);
 			break;
@@ -2388,7 +2349,9 @@ void parse_arg(int key, char *arg)
 		}
 		if (i == ALGO_COUNT) {
 			// some aliases...
-			if (!strcasecmp("flax", arg))
+			if (!strcasecmp("all", arg))
+				i = opt_algo = ALGO_AUTO;
+			else if (!strcasecmp("flax", arg))
 				i = opt_algo = ALGO_C11;
 			else if (!strcasecmp("diamond", arg))
 				i = opt_algo = ALGO_DMD_GR;
diff --git a/cuda.cpp b/cuda.cpp
index 86f479d..b912cad 100644
--- a/cuda.cpp
+++ b/cuda.cpp
@@ -156,8 +156,7 @@ uint32_t cuda_default_throughput(int thr_id, uint32_t defcount)
 	uint32_t throughput = gpus_intensity[thr_id] ? gpus_intensity[thr_id] : defcount;
 	if (gpu_threads > 1 && throughput == defcount) throughput /= (gpu_threads-1);
 	api_set_throughput(thr_id, throughput);
-	bench_set_throughput(thr_id, throughput);
-	//if (opt_debug) applog(LOG_DEBUG, "GPU %d-%d: throughput %u", dev_id, thr_id, throughput);
+	//gpulog(LOG_INFO, thr_id, "throughput %u", throughput);
 	return throughput;
 }
 
@@ -196,6 +195,14 @@ int cuda_available_memory(int thr_id)
 	return (int) (mfree / (1024 * 1024));
 }
 
+// Check (and reset) last cuda error, and report it in logs
+void cuda_log_lasterror(int thr_id, const char* func, int line)
+{
+	cudaError_t err = cudaGetLastError();
+	if (err != cudaSuccess && !opt_quiet)
+		gpulog(LOG_WARNING, thr_id, "%s:%d %s", func, line, cudaGetErrorString(err));
+}
+
 #ifdef __cplusplus
 } /* extern "C" */
 #endif
diff --git a/lyra2/lyra2REv2.cu b/lyra2/lyra2REv2.cu
index 98a0291..ed601f5 100644
--- a/lyra2/lyra2REv2.cu
+++ b/lyra2/lyra2REv2.cu
@@ -147,6 +147,7 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
 			{
 				int res = 1;
 				work_set_target_ratio(work, vhash64);
+				pdata[19] = foundNonces[0];
 				// check if there was another one...
 				if (foundNonces[1] != 0)
 				{
@@ -155,9 +156,9 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
 					if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio)
 						work_set_target_ratio(work, vhash64);
 					pdata[21] = foundNonces[1];
+					//xchg(pdata[19], pdata[21]);
 					res++;
 				}
-				pdata[19] = foundNonces[0];
 				MyStreamSynchronize(NULL, 0, device_map[thr_id]);
 				return res;
 			}
diff --git a/miner.h b/miner.h
index f65a9c3..3096500 100644
--- a/miner.h
+++ b/miner.h
@@ -493,6 +493,9 @@ int cuda_available_memory(int thr_id);
 uint32_t cuda_default_throughput(int thr_id, uint32_t defcount);
 #define device_intensity(t,f,d) cuda_default_throughput(t,d)
 
+void cuda_log_lasterror(int thr_id, const char* func, int line);
+#define CUDA_LOG_ERROR() cuda_log_lasterror(thr_id, __func__, __LINE__)
+
 #define CL_N    "\x1B[0m"
 #define CL_RED  "\x1B[31m"
 #define CL_GRN  "\x1B[32m"
@@ -553,7 +556,7 @@ void bench_free();
 bool bench_algo_switch_next(int thr_id);
 void bench_set_throughput(int thr_id, uint32_t throughput);
 void bench_display_results();
-
+void algo_free_all(int thr_id);
 
 struct stratum_job {
 	char *job_id;
diff --git a/neoscrypt/neoscrypt.cpp b/neoscrypt/neoscrypt.cpp
index 43681d9..c111cfc 100644
--- a/neoscrypt/neoscrypt.cpp
+++ b/neoscrypt/neoscrypt.cpp
@@ -20,6 +20,8 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign
 	int intensity = is_windows() ? 18 : 19;
 	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
 	throughput = throughput / 32; /* set for max intensity ~= 20 */
+	api_set_throughput(thr_id, throughput);
+
 	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce + 1);
 
 	if (opt_benchmark)
diff --git a/qubit/luffa.cu b/qubit/luffa.cu
index 5d278d6..2ea6b20 100644
--- a/qubit/luffa.cu
+++ b/qubit/luffa.cu
@@ -48,7 +48,7 @@ extern "C" int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce,
 		if (opt_cudaschedule == -1) // to reduce cpu usage...
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], throughput * 64));
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
 
 		qubit_luffa512_cpu_init(thr_id, throughput);
 		cuda_check_cpu_init(thr_id, throughput);
@@ -103,12 +103,12 @@ extern "C" void free_luffa(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaSetDevice(device_map[thr_id]);
+	cudaDeviceSynchronize();
 
 	cudaFree(d_hash[thr_id]);
 
 	cuda_check_cpu_free(thr_id);
-	init[thr_id] = false;
 
-	cudaDeviceSynchronize();
-}
\ No newline at end of file
+	cudaThreadSynchronize();
+	init[thr_id] = false;
+}
diff --git a/scrypt-jane.cpp b/scrypt-jane.cpp
index 4620fdf..f49e950 100644
--- a/scrypt-jane.cpp
+++ b/scrypt-jane.cpp
@@ -395,6 +395,9 @@ unsigned char GetNfactor(unsigned int nTimestamp)
 		} else if (!strcmp(jane_params, "RAD") || !strcasecmp(jane_params, "RadioactiveCoin")) {
 			// InternetCoin:1389196388, minN: 4, maxN: 30
 			Ntimestamp = 1389196388; minN= 4; maxN= 30;
+		} else if (!strcmp(jane_params, "LEO") || !strcasecmp(jane_params, "LEOCoin")) {
+			// LEOCoin:1402845776, minN: 4, maxN: 30
+			Ntimestamp = 1402845776; minN= 4; maxN= 30;
 		} else {
 			if (sscanf(jane_params, "%u,%u,%u", &Ntimestamp, &minN, &maxN) != 3)
 			if (sscanf(jane_params, "%u", &Nfactor) == 1) return Nfactor; // skip bounding against minN, maxN
diff --git a/x13/x13.cu b/x13/x13.cu
index e7fc923..cfdc3fb 100644
--- a/x13/x13.cu
+++ b/x13/x13.cu
@@ -208,6 +208,8 @@ extern "C" int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, u
 
 		*hashes_done = pdata[19] - first_nonce + throughput;
 
+		CUDA_LOG_ERROR();
+
 		foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != UINT32_MAX)
 		{
@@ -245,6 +247,9 @@ extern "C" int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, u
 	} while (!work_restart[thr_id].restart);
 
 	*hashes_done = pdata[19] - first_nonce + 1;
+
+	CUDA_LOG_ERROR();
+
 	return 0;
 }
 
@@ -254,7 +259,7 @@ extern "C" void free_x13(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaSetDevice(device_map[thr_id]);
+	cudaThreadSynchronize();
 
 	cudaFree(d_hash[thr_id]);
 
@@ -263,7 +268,8 @@ extern "C" void free_x13(int thr_id)
 	x13_fugue512_cpu_free(thr_id);
 
 	cuda_check_cpu_free(thr_id);
-	init[thr_id] = false;
+	CUDA_LOG_ERROR();
 
 	cudaDeviceSynchronize();
+	init[thr_id] = false;
 }
\ No newline at end of file
diff --git a/x15/cuda_x15_whirlpool.cu b/x15/cuda_x15_whirlpool.cu
index 49f6867..810f51d 100644
--- a/x15/cuda_x15_whirlpool.cu
+++ b/x15/cuda_x15_whirlpool.cu
@@ -14,8 +14,8 @@
 __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
 __constant__ uint32_t pTarget[8];
 
-static uint32_t *d_wnounce[MAX_GPUS];
-static uint32_t *d_WNonce[MAX_GPUS];
+static uint32_t *h_wnounce[MAX_GPUS] = { 0 };
+static uint32_t *d_WNonce[MAX_GPUS] = { 0 };
 
 #define USE_ALL_TABLES 1
 
@@ -2575,14 +2575,14 @@ extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int mode)
 	}
 
 	cudaMalloc(&d_WNonce[thr_id], sizeof(uint32_t));
-	cudaMallocHost(&d_wnounce[thr_id], sizeof(uint32_t));
+	cudaMallocHost(&h_wnounce[thr_id], sizeof(uint32_t));
 }
 
 __host__
 extern void x15_whirlpool_cpu_free(int thr_id)
 {
 	cudaFree(d_WNonce[thr_id]);
-	cudaFreeHost(d_wnounce[thr_id]);
+	cudaFreeHost(h_wnounce[thr_id]);
 }
 
 __host__
@@ -2613,9 +2613,9 @@ extern uint32_t whirlpool512_cpu_finalhash_64(int thr_id, uint32_t threads, uint
 	oldwhirlpool_gpu_finalhash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector,d_WNonce[thr_id]);
 
 	MyStreamSynchronize(NULL, order, thr_id);
-	cudaMemcpy(d_wnounce[thr_id], d_WNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	cudaMemcpy(h_wnounce[thr_id], d_WNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 
-	result = *d_wnounce[thr_id];
+	result = *h_wnounce[thr_id];
 
 	return result;
 }
diff --git a/x15/x14.cu b/x15/x14.cu
index 817ec43..559d03f 100644
--- a/x15/x14.cu
+++ b/x15/x14.cu
@@ -27,7 +27,7 @@ extern "C" {
 #include "cuda_helper.h"
 
 // Memory for the hash functions
-static uint32_t *d_hash[MAX_GPUS];
+static uint32_t *d_hash[MAX_GPUS] = { 0 };
 
 extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
@@ -190,7 +190,9 @@ extern "C" int scanhash_x14(int thr_id,  struct work* work, uint32_t max_nonce,
 
 		cuda_check_cpu_init(thr_id, throughput);
 
-		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
+		cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput);
+
+		CUDA_LOG_ERROR();
 
 		init[thr_id] = true;
 	}
@@ -217,9 +219,12 @@ extern "C" int scanhash_x14(int thr_id,  struct work* work, uint32_t max_nonce,
 		x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 
+		CUDA_LOG_ERROR();
+
 		*hashes_done = pdata[19] - first_nonce + throughput;
 
 		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+
 		if (foundNonce != UINT32_MAX)
 		{
 			const uint32_t Htarg = ptarget[7];
@@ -250,6 +255,8 @@ extern "C" int scanhash_x14(int thr_id,  struct work* work, uint32_t max_nonce,
 
 	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
 
+	CUDA_LOG_ERROR();
+
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
@@ -260,17 +267,17 @@ extern "C" void free_x14(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaSetDevice(device_map[thr_id]);
-	cudaDeviceSynchronize();
-
-	cudaFree(d_hash[thr_id]);
+	cudaThreadSynchronize();
 
 	quark_groestl512_cpu_free(thr_id);
 	x11_simd512_cpu_free(thr_id);
 	x13_fugue512_cpu_free(thr_id);
 
+	cudaFree(d_hash[thr_id]);
+	d_hash[thr_id] = NULL;
+
 	cuda_check_cpu_free(thr_id);
-	init[thr_id] = false;
 
 	cudaDeviceSynchronize();
+	init[thr_id] = false;
 }
diff --git a/x15/x15.cu b/x15/x15.cu
index a7c224a..ba72208 100644
--- a/x15/x15.cu
+++ b/x15/x15.cu
@@ -28,7 +28,7 @@ extern "C" {
 #include "cuda_helper.h"
 
 // Memory for the hash functions
-static uint32_t *d_hash[MAX_GPUS];
+static uint32_t *d_hash[MAX_GPUS] = { 0 };
 
 extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
 extern void quark_blake512_cpu_setBlock_80(int thr_id, uint32_t *pdata);
@@ -264,7 +264,6 @@ extern "C" int scanhash_x15(int thr_id,  struct work* work, uint32_t max_nonce,
 
 	*hashes_done = pdata[19] - first_nonce + 1;
 
-	x15_whirlpool_cpu_free(thr_id);
 	return 0;
 }
 
@@ -274,16 +273,17 @@ extern "C" void free_x15(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaSetDevice(device_map[thr_id]);
+	cudaThreadSynchronize();
 
 	cudaFree(d_hash[thr_id]);
 
 	quark_groestl512_cpu_free(thr_id);
 	x11_simd512_cpu_free(thr_id);
 	x13_fugue512_cpu_free(thr_id);
+	x15_whirlpool_cpu_free(thr_id);
 
 	cuda_check_cpu_free(thr_id);
-	init[thr_id] = false;
 
 	cudaDeviceSynchronize();
+	init[thr_id] = false;
 }
diff --git a/x17/x17.cu b/x17/x17.cu
index d72f319..9bb40bd 100644
--- a/x17/x17.cu
+++ b/x17/x17.cu
@@ -78,6 +78,7 @@ extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t sta
 
 extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int flag);
 extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x15_whirlpool_cpu_free(int thr_id);
 
 extern void x17_sha512_cpu_init(int thr_id, uint32_t threads);
 extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -296,16 +297,17 @@ extern "C" void free_x17(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaSetDevice(device_map[thr_id]);
+	cudaThreadSynchronize();
 
 	cudaFree(d_hash[thr_id]);
 
 	quark_groestl512_cpu_free(thr_id);
 	x11_simd512_cpu_free(thr_id);
 	x13_fugue512_cpu_free(thr_id);
+	x15_whirlpool_cpu_free(thr_id);
 
 	cuda_check_cpu_free(thr_id);
-	init[thr_id] = false;
 
 	cudaDeviceSynchronize();
-}
\ No newline at end of file
+	init[thr_id] = false;
+}