From 38e6672d7032ddd8dff326f8d21f084b0c487811 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Sat, 28 Mar 2015 10:09:55 +0100
Subject: [PATCH] Allow test of SM 2.1/3.0 binaries on newer cards

Implementation based on klausT work.. a bit different

This code must be placed in a common .cu file,
cuda.cpp is not compiled with nvcc and doesnt allow cuda code...
---
 Algo256/cuda_skein256.cu       |  4 ++--
 cuda_checkhash.cu              | 27 ++++++++++++++++++++++++++-
 cuda_helper.h                  |  3 +++
 quark/cuda_quark_groestl512.cu | 28 +++++++++++++++-------------
 quark/cuda_quark_keccak512.cu  |  4 +++-
 quark/cuda_skein512.cu         | 12 ++++++++----
 6 files changed, 57 insertions(+), 21 deletions(-)

diff --git a/Algo256/cuda_skein256.cu b/Algo256/cuda_skein256.cu
index 71687dc..bc9acef 100644
--- a/Algo256/cuda_skein256.cu
+++ b/Algo256/cuda_skein256.cu
@@ -295,7 +295,7 @@ void skein256_gpu_hash_32_v30(uint32_t threads, uint32_t startNounce, uint64_t *
 __host__
 void skein256_cpu_init(int thr_id, uint32_t threads)
 {
-	//empty
+	cuda_get_arch(thr_id);
 }
 
 __host__
@@ -306,7 +306,7 @@ void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, ui
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 
-	if (device_sm[device_map[thr_id]] >= 320)
+	if (device_sm[device_map[thr_id]] > 300 && cuda_arch[device_map[thr_id]] > 300)
 		skein256_gpu_hash_32<<<grid, block>>>(threads, startNounce, d_outputHash);
 	else
 		skein256_gpu_hash_32_v30<<<grid, block>>>(threads, startNounce, d_outputHash);
diff --git a/cuda_checkhash.cu b/cuda_checkhash.cu
index 85c2c04..4bbef59 100644
--- a/cuda_checkhash.cu
+++ b/cuda_checkhash.cu
@@ -195,4 +195,29 @@ uint32_t cuda_check_hash_branch(int thr_id, uint32_t threads, uint32_t startNoun
 	result = *h_resNonces[thr_id];
 
 	return result;
-}
\ No newline at end of file
+}
+
+/* Function to get the compiled Shader Model version */
+int cuda_arch[MAX_GPUS] = { 0 };
+__global__
+void nvcc_get_arch(int *d_version)
+{
+#ifdef __CUDA_ARCH__
+	*d_version = __CUDA_ARCH__;
+#endif
+}
+
+__host__
+int cuda_get_arch(int thr_id)
+{
+	int *d_version;
+	int dev_id = device_map[thr_id];
+	if (cuda_arch[dev_id] == 0) {
+		// only do it once...
+		cudaMalloc(&d_version, sizeof(int));
+		nvcc_get_arch <<< 1, 1 >>> (d_version);
+		cudaMemcpy(&cuda_arch[dev_id], d_version, sizeof(int), cudaMemcpyDeviceToHost);
+		cudaFree(d_version);
+	}
+	return cuda_arch[dev_id];
+}
diff --git a/cuda_helper.h b/cuda_helper.h
index 6b1ce24..355c098 100644
--- a/cuda_helper.h
+++ b/cuda_helper.h
@@ -20,7 +20,10 @@
 extern "C" short device_map[MAX_GPUS];
 extern "C"  long device_sm[MAX_GPUS];
 
+extern int cuda_arch[MAX_GPUS];
+
 // common functions
+extern int cuda_get_arch(int thr_id);
 extern void cuda_check_cpu_init(int thr_id, uint32_t threads);
 extern void cuda_check_cpu_setTarget(const void *ptarget);
 extern uint32_t cuda_check_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_inputHash);
diff --git a/quark/cuda_quark_groestl512.cu b/quark/cuda_quark_groestl512.cu
index ba69214..3ec7f9b 100644
--- a/quark/cuda_quark_groestl512.cu
+++ b/quark/cuda_quark_groestl512.cu
@@ -133,15 +133,17 @@ __global__ void __launch_bounds__(TPB, THF)
 #endif
 }
 
-
-
-__host__ void quark_groestl512_cpu_init(int thr_id, uint32_t threads)
+__host__
+void quark_groestl512_cpu_init(int thr_id, uint32_t threads)
 {
-    if (device_sm[device_map[thr_id]] < 300)
+    int dev_id = device_map[thr_id];
+    cuda_get_arch(thr_id);
+    if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300)
         quark_groestl512_sm20_init(thr_id, threads);
 }
 
-__host__ void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
+__host__
+void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
     int threadsperblock = TPB;
 
@@ -153,11 +155,10 @@ __host__ void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_
     dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
     dim3 block(threadsperblock);
 
-    // Größe des dynamischen Shared Memory Bereichs
-    size_t shared_size = 0;
+    int dev_id = device_map[thr_id];
 
-    if (device_sm[device_map[thr_id]] >= 300)
-        quark_groestl512_gpu_hash_64_quad<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
+    if (device_sm[dev_id] >= 300 && cuda_arch[dev_id] >= 300)
+        quark_groestl512_gpu_hash_64_quad<<<grid, block>>>(threads, startNounce, d_hash, d_nonceVector);
     else
         quark_groestl512_sm20_hash_64(thr_id, threads, startNounce, d_nonceVector, d_hash, order);
 
@@ -165,7 +166,8 @@ __host__ void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_
     MyStreamSynchronize(NULL, order, thr_id);
 }
 
-__host__ void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
+__host__
+void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
     const int factor = THF;
     int threadsperblock = TPB;
@@ -173,10 +175,10 @@ __host__ void quark_doublegroestl512_cpu_hash_64(int thr_id, uint32_t threads, u
     dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
     dim3 block(threadsperblock);
 
-    size_t shared_size = 0;
+    int dev_id = device_map[thr_id];
 
-    if (device_sm[device_map[thr_id]] >= 300)
-        quark_doublegroestl512_gpu_hash_64_quad<<<grid, block, shared_size>>>(threads, startNounce, d_hash, d_nonceVector);
+    if (device_sm[dev_id] >= 300 && cuda_arch[dev_id] >= 300)
+        quark_doublegroestl512_gpu_hash_64_quad<<<grid, block>>>(threads, startNounce, d_hash, d_nonceVector);
     else
         quark_doublegroestl512_sm20_hash_64(thr_id, threads, startNounce, d_nonceVector, d_hash, order);
 
diff --git a/quark/cuda_quark_keccak512.cu b/quark/cuda_quark_keccak512.cu
index 5d7dcd7..21ee856 100644
--- a/quark/cuda_quark_keccak512.cu
+++ b/quark/cuda_quark_keccak512.cu
@@ -248,7 +248,9 @@ void quark_keccak512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNou
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
-	if (device_sm[device_map[thr_id]] >= 320)
+	int dev_id = device_map[thr_id];
+
+	if (device_sm[dev_id] >= 320)
 		quark_keccak512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
 	else
 		quark_keccak512_gpu_hash_64_v30<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
diff --git a/quark/cuda_skein512.cu b/quark/cuda_skein512.cu
index 3de4309..f472087 100644
--- a/quark/cuda_skein512.cu
+++ b/quark/cuda_skein512.cu
@@ -11,7 +11,7 @@ static __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes
 #define SHL(x, n)			((x) << (n))
 #define SHR(x, n)			((x) >> (n))
 
-#if __CUDA_ARCH__ >= 320
+#if __CUDA_ARCH__ > 300
 __device__
 uint64_t skein_rotl64(const uint64_t x, const int offset)
 {
@@ -443,11 +443,11 @@ void quark_skein512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t
 		TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18);
 
 		// fertig
-		uint64_t *outpHash = &g_hash[8 * hashPosition];
+		uint2 *outpHash = (uint2*) (&g_hash[hashPosition * 8]);
 
 		#pragma unroll 8
 		for(int i=0; i<8; i++)
-			outpHash[i] = devectorize(p[i]);
+			outpHash[i] = p[i];
 	}
 }
 
@@ -724,6 +724,8 @@ void skein512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *outp
 __host__
 void quark_skein512_cpu_init(int thr_id, uint32_t threads)
 {
+	// store the binary SM version
+	cuda_get_arch(thr_id);
 }
 
 __host__
@@ -734,8 +736,10 @@ void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNoun
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 
+	int dev_id = device_map[thr_id];
+
 	// uint2 uint64 variants for SM 3.2+
-	if (device_sm[device_map[thr_id]] >= 320)
+	if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300)
 		quark_skein512_gpu_hash_64 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
 	else
 		quark_skein512_gpu_hash_64_v30 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector);