From 5bf1f9820003b7d0be75af3351d0eb424deb916a Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Wed, 14 Oct 2015 02:04:13 +0000
Subject: [PATCH] various fixes for SM 2.1 and the benchmark

X11+ algos and quark are not compatible for the moment

but these ones are :

Benchmark results for Gigabyte GTX 460 (SM 2.1 / 1 GB):

   blakecoin :     159090.5 kH/s,     1 MB,  1048576 thr.
       blake :      70208.9 kH/s,     1 MB,  1048576 thr.
         bmw :     122802.6 kH/s,    65 MB,  2097152 thr.
        deep :       3533.6 kH/s,    33 MB,   524288 thr.
    fugue256 :      43177.9 kH/s,    17 MB,   524288 thr.
       heavy :       4118.2 kH/s,   147 MB,   524032 thr.
      keccak :      18673.1 kH/s,   129 MB,  2097152 thr.
       luffa :      28816.0 kH/s,   257 MB,  4194304 thr.
       lyra2 :        213.7 kH/s,   570 MB,    65536 thr.
    mjollnir :       3895.6 kH/s,   147 MB,   524032 thr.
       nist5 :       1101.4 kH/s,    67 MB,  1048576 thr.
       penta :        501.6 kH/s,    21 MB,   327680 thr.
       skein :       5432.4 kH/s,    65 MB,  1048576 thr.
      skein2 :       6788.9 kH/s,    33 MB,   524288 thr.
   whirlpool :        688.5 kH/s,    33 MB,   524288 thr.
         zr5 :        122.5 kH/s,    86 MB,   262144 thr.
---
 Algo256/blake256.cu      | 10 ++++++----
 Algo256/bmw.cu           |  9 ++++-----
 Algo256/cuda_blake256.cu |  2 +-
 Algo256/keccak256.cu     |  8 ++++----
 JHA/jackpotcoin.cu       | 15 +++++++++++----
 bench.cpp                | 15 +++++++++++++++
 cuda_groestlcoin.cu      |  2 +-
 lyra2/cuda_lyra2.cu      |  1 +
 pentablake.cu            | 11 +++++------
 quark/quarkcoin.cu       |  5 ++---
 qubit/luffa.cu           | 10 ++++++----
 util.cpp                 |  2 +-
 x15/whirlpoolx.cu        |  8 ++++----
 zr5.cu                   |  8 ++++++--
 14 files changed, 67 insertions(+), 39 deletions(-)

diff --git a/Algo256/blake256.cu b/Algo256/blake256.cu
index 0b5dd87..ad2a493 100644
--- a/Algo256/blake256.cu
+++ b/Algo256/blake256.cu
@@ -10,9 +10,11 @@
 
 extern "C" {
 #include "sph/sph_blake.h"
+//extern int blake256_rounds;
+}
+
 #include <stdint.h>
 #include <memory.h>
-}
 
 /* threads per block and throughput (intensity) */
 #define TPB 128
@@ -467,10 +469,10 @@ extern "C" int scanhash_blake256(int thr_id, struct work* work, uint32_t max_non
 #endif
 				return rc;
 			}
-			else if (opt_debug) {
+			else if (vhashcpu[7] > ptarget[7] && opt_debug) {
 				applog_hash((uchar*)ptarget);
 				applog_compare_hash((uchar*)vhashcpu, (uchar*)ptarget);
-				applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", device_map[thr_id], foundNonce);
+				gpulog(LOG_WARNING, thr_id, "result for nonce %08x does not validate on CPU!", foundNonce);
 			}
 		}
 
@@ -493,7 +495,7 @@ extern "C" void free_blake256(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaSetDevice(device_map[thr_id]);
+	cudaThreadSynchronize();
 
 	cudaFreeHost(h_resNonce[thr_id]);
 	cudaFree(d_resNonce[thr_id]);
diff --git a/Algo256/bmw.cu b/Algo256/bmw.cu
index f4bbb4c..1cf06ea 100644
--- a/Algo256/bmw.cu
+++ b/Algo256/bmw.cu
@@ -87,7 +87,7 @@ extern "C" int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, u
 				return 1;
 			}
 			else {
-				applog(LOG_DEBUG, "GPU #%d: result for nounce %08x does not validate on CPU!", thr_id, foundNonce);
+				gpulog(LOG_WARNING, thr_id, "result for nonce %08x does not validate on CPU!", foundNonce);
 			}
 		}
 
@@ -110,13 +110,12 @@ extern "C" void free_bmw(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaSetDevice(device_map[thr_id]);
+	cudaThreadSynchronize();
 
 	cudaFree(d_hash[thr_id]);
 	bmw256_midstate_free(thr_id);
 	cuda_check_cpu_free(thr_id);
 
-	init[thr_id] = false;
-
 	cudaDeviceSynchronize();
-}
\ No newline at end of file
+	init[thr_id] = false;
+}
diff --git a/Algo256/cuda_blake256.cu b/Algo256/cuda_blake256.cu
index afa9b3b..c3326e6 100644
--- a/Algo256/cuda_blake256.cu
+++ b/Algo256/cuda_blake256.cu
@@ -207,7 +207,7 @@ void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uin
 		input[3] = startNonce + thread;
 		blake256_compress2nd(h, input, 640);
 
-        #pragma unroll
+		#pragma unroll
 		for (int i = 0; i<4; i++) {
 			Hash[i*threads + thread] = cuda_swab32ll(MAKE_ULONGLONG(h[2 * i], h[2*i+1]));
 		}
diff --git a/Algo256/keccak256.cu b/Algo256/keccak256.cu
index e2234d3..7922912 100644
--- a/Algo256/keccak256.cu
+++ b/Algo256/keccak256.cu
@@ -80,7 +80,7 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no
 				return 1;
 			}
 			else {
-				applog(LOG_WARNING, "GPU #%d: result for nounce %08x does not validate on CPU!", device_map[thr_id], foundNonce);
+				gpulog(LOG_WARNING, thr_id, "result for nonce %08x does not validate on CPU!", foundNonce);
 			}
 		}
 
@@ -101,12 +101,12 @@ extern "C" void free_keccak256(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaSetDevice(device_map[thr_id]);
+	cudaThreadSynchronize();
 
 	cudaFree(d_hash[thr_id]);
 
 	keccak256_cpu_free(thr_id);
-	init[thr_id] = false;
 
 	cudaDeviceSynchronize();
-}
\ No newline at end of file
+	init[thr_id] = false;
+}
diff --git a/JHA/jackpotcoin.cu b/JHA/jackpotcoin.cu
index 6d605e4..b441362 100644
--- a/JHA/jackpotcoin.cu
+++ b/JHA/jackpotcoin.cu
@@ -95,6 +95,7 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
+	int dev_id = device_map[thr_id];
 
 	uint32_t throughput =  cuda_default_throughput(thr_id, 1U << 20);
 	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
@@ -104,7 +105,13 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc
 
 	if (!init[thr_id])
 	{
-		cudaSetDevice(device_map[thr_id]);
+		cudaSetDevice(dev_id);
+		cuda_get_arch(thr_id);
+		if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) {
+			gpulog(LOG_ERR, thr_id, "Sorry, This algo is not supported by this GPU arch (SM 3.0 required)");
+			proper_exit(EXIT_CODE_CUDA_ERROR);
+		}
+
 
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
 
@@ -214,6 +221,7 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc
 		CUDA_LOG_ERROR();
 
 		uint32_t foundNonce = cuda_check_hash_branch(thr_id, nrm3, pdata[19], d_branch3Nonces[thr_id], d_hash[thr_id], order++);
+
 		if (foundNonce != UINT32_MAX)
 		{
 			uint32_t vhash64[8];
@@ -239,15 +247,14 @@ extern "C" int scanhash_jackpot(int thr_id, struct work *work, uint32_t max_nonc
 				pdata[19] = foundNonce;
 				return res;
 			} else {
-				applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!",
-					device_map[thr_id], foundNonce);
+				gpulog(LOG_WARNING, thr_id, "result for nonce %08x does not validate on CPU!", foundNonce);
 			}
 		}
 
 		if ((uint64_t) pdata[19] + throughput > max_nonce) {
 			*hashes_done = pdata[19] - first_nonce;
 			pdata[19] = max_nonce;
-			break;
+			return 0;
 		}
 
 		pdata[19] += throughput;
diff --git a/bench.cpp b/bench.cpp
index 8c3f1a2..0a45a41 100644
--- a/bench.cpp
+++ b/bench.cpp
@@ -97,6 +97,21 @@ bool bench_algo_switch_next(int thr_id)
 	if (algo == ALGO_C11) algo++; // same as x11
 	if (algo == ALGO_DMD_GR) algo++; // same as groestl
 	if (algo == ALGO_WHIRLCOIN) algo++; // same as whirlpool
+
+	if (device_sm[dev_id] && device_sm[dev_id] < 300) {
+		// incompatible SM 2.1 kernels...
+		if (algo == ALGO_FRESH) algo++;
+		if (algo == ALGO_GROESTL) algo++;
+		if (algo == ALGO_MYR_GR) algo++;
+		if (algo == ALGO_JACKPOT) algo++;
+		if (algo == ALGO_LYRA2v2) algo++;
+		if (algo == ALGO_NEOSCRYPT) algo++;
+		if (algo == ALGO_QUARK) algo++;
+		if (algo == ALGO_QUBIT) algo++;
+		if (algo == ALGO_S3) algo++; // to check...
+		while (algo >= ALGO_X11 && algo <= ALGO_X17) algo++;
+		if (algo == ALGO_WHIRLPOOLX) algo++;
+	}
 	// and unwanted ones...
 	if (algo == ALGO_SCRYPT) algo++;
 	if (algo == ALGO_SCRYPT_JANE) algo++;
diff --git a/cuda_groestlcoin.cu b/cuda_groestlcoin.cu
index fcef970..4e4b728 100644
--- a/cuda_groestlcoin.cu
+++ b/cuda_groestlcoin.cu
@@ -153,7 +153,7 @@ void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, vo
 
 	int dev_id = device_map[thr_id];
 	if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) {
-		printf("Sorry, This algo is not supported by this GPU arch (SM 3.0 required)");
+		gpulog(LOG_ERR, thr_id, "Sorry, This algo is not supported by this GPU arch (SM 3.0 required)");
 		proper_exit(EXIT_CODE_CUDA_ERROR);
 	}
 
diff --git a/lyra2/cuda_lyra2.cu b/lyra2/cuda_lyra2.cu
index e2757ea..82c31a8 100644
--- a/lyra2/cuda_lyra2.cu
+++ b/lyra2/cuda_lyra2.cu
@@ -261,6 +261,7 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint6
 	int dev_id = device_map[thr_id % MAX_GPUS];
 	uint32_t tpb = TPB52;
 	if (device_sm[dev_id] == 500) tpb = TPB50;
+	if (device_sm[dev_id] == 350) tpb = TPB30; // to enhance (or not)
 	if (device_sm[dev_id] <= 300) tpb = TPB30;
 
 	dim3 grid((threads + tpb - 1) / tpb);
diff --git a/pentablake.cu b/pentablake.cu
index c54eac3..b0d8132 100644
--- a/pentablake.cu
+++ b/pentablake.cu
@@ -423,7 +423,6 @@ extern "C" int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_n
 					pentablakehash(vhash, endiandata);
 					if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio)
 						work_set_target_ratio(work, vhash);
-					applog(LOG_NOTICE, "GPU found more than one result yippee!");
 					pdata[21] = extra_results[0];
 					extra_results[0] = UINT32_MAX;
 					rc++;
@@ -431,7 +430,7 @@ extern "C" int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_n
 				pdata[19] = foundNonce;
 				return rc;
 			} else {
-				applog(LOG_WARNING, "GPU #%d: result for nounce %08x does not validate on CPU!", device_map[thr_id], foundNonce);
+				gpulog(LOG_WARNING, thr_id, "result for nonce %08x does not validate on CPU!", foundNonce);
 			}
 		}
 
@@ -449,13 +448,13 @@ void free_pentablake(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaSetDevice(device_map[thr_id]);
+	cudaThreadSynchronize();
 
 	cudaFree(d_hash[thr_id]);
 	cudaFreeHost(h_resNounce[thr_id]);
 	cudaFree(d_resNounce[thr_id]);
 
-	init[thr_id] = false;
-
 	cudaDeviceSynchronize();
-}
\ No newline at end of file
+
+	init[thr_id] = false;
+}
diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu
index c1a5d3f..f615444 100644
--- a/quark/quarkcoin.cu
+++ b/quark/quarkcoin.cu
@@ -262,8 +262,7 @@ extern "C" void free_quark(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaSetDevice(device_map[thr_id]);
-	cudaDeviceSynchronize();
+	cudaThreadSynchronize();
 
 	cudaFree(d_hash[thr_id]);
 
@@ -278,4 +277,4 @@ extern "C" void free_quark(int thr_id)
 	init[thr_id] = false;
 
 	cudaDeviceSynchronize();
-}
\ No newline at end of file
+}
diff --git a/qubit/luffa.cu b/qubit/luffa.cu
index 2ea6b20..f03efaf 100644
--- a/qubit/luffa.cu
+++ b/qubit/luffa.cu
@@ -45,8 +45,10 @@ extern "C" int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce,
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
-		if (opt_cudaschedule == -1) // to reduce cpu usage...
-			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+		CUDA_LOG_ERROR();
+		//if (opt_cudaschedule == -1) // to reduce cpu usage...
+		//	cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+		CUDA_LOG_ERROR();
 
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
 
@@ -103,12 +105,12 @@ extern "C" void free_luffa(int thr_id)
 	if (!init[thr_id])
 		return;
 
-	cudaDeviceSynchronize();
+	cudaThreadSynchronize();
 
 	cudaFree(d_hash[thr_id]);
 
 	cuda_check_cpu_free(thr_id);
 
-	cudaThreadSynchronize();
+	cudaDeviceSynchronize();
 	init[thr_id] = false;
 }
diff --git a/util.cpp b/util.cpp
index 8ed5505..dbee5c9 100644
--- a/util.cpp
+++ b/util.cpp
@@ -1856,7 +1856,7 @@ static uint32_t zrtest[20] = {
 
 void do_gpu_tests(void)
 {
-#ifdef _DEBUG
+#if 1 //def _DEBUG
 	unsigned long done;
 	char s[128] = { '\0' };
 	uchar buf[160];
diff --git a/x15/whirlpoolx.cu b/x15/whirlpoolx.cu
index ec8db42..b7b356f 100644
--- a/x15/whirlpoolx.cu
+++ b/x15/whirlpoolx.cu
@@ -9,7 +9,7 @@ extern "C" {
 #include "miner.h"
 #include "cuda_helper.h"
 
-static uint32_t *d_hash[MAX_GPUS];
+static uint32_t *d_hash[MAX_GPUS] = { 0 };
 
 extern void whirlpoolx_cpu_init(int thr_id, uint32_t threads);
 extern void whirlpoolx_cpu_free(int thr_id);
@@ -54,7 +54,7 @@ extern "C" int scanhash_whirlx(int thr_id,  struct work* work, uint32_t max_nonc
 	if (!init[thr_id]) {
 		cudaSetDevice(device_map[thr_id]);
 
-		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 64 * throughput), 0);
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
 
 		whirlpoolx_cpu_init(thr_id, throughput);
 
@@ -84,7 +84,7 @@ extern "C" int scanhash_whirlx(int thr_id,  struct work* work, uint32_t max_nonc
 				pdata[19] = foundNonce;
 				return 1;
 			} else {
-				applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce);
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
 			}
 		}
 
@@ -115,4 +115,4 @@ extern "C" void free_whirlx(int thr_id)
 	init[thr_id] = false;
 
 	cudaDeviceSynchronize();
-}
\ No newline at end of file
+}
diff --git a/zr5.cu b/zr5.cu
index 5d05b01..d591f1d 100644
--- a/zr5.cu
+++ b/zr5.cu
@@ -419,6 +419,10 @@ extern "C" int scanhash_zr5(int thr_id, struct work *work,
 		}
 		zr5_final_round(thr_id, throughput);
 
+		// do not scan results on interuption
+		if (work_restart[thr_id].restart)
+			return -1;
+
 		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != UINT32_MAX)
 		{
@@ -455,7 +459,7 @@ extern "C" int scanhash_zr5(int thr_id, struct work *work,
 				}
 				return res;
 			} else {
-				applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce);
+				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
 
 				pdata[19]++;
 				pdata[0] = oldp0;
@@ -497,4 +501,4 @@ extern "C" void free_zr5(int thr_id)
 	init[thr_id] = false;
 
 	cudaDeviceSynchronize();
-}
\ No newline at end of file
+}