From e7567332f467e61ec6a526caa40c5e9b19c31080 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot
Date: Wed, 18 Mar 2015 03:51:58 +0100
Subject: [PATCH] pluck: fix SM 3.0 compilation

---
 cuda_vector.h       | 11 +++++++----
 pluck/cuda_pluck.cu | 29 ++++++++++++++++-------------
 2 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/cuda_vector.h b/cuda_vector.h
index 683b893..2c18f09 100644
--- a/cuda_vector.h
+++ b/cuda_vector.h
@@ -94,6 +94,12 @@ static __forceinline__ __device__ void operator+= (uchar4 &a, uchar4 b) { a = a
 static __forceinline__ __device__ __host__ void operator+= (uint8 &a, const uint8 &b) { a = a + b; }
 static __forceinline__ __device__ __host__ void operator+= (uint16 &a, const uint16 &b) { a = a + b; }
 
+#if __CUDA_ARCH__ < 320
+
+#define rotate ROTL32
+#define rotateR ROTR32
+
+#else
 
 static __forceinline__ __device__ uint32_t rotate(uint32_t vec4, uint32_t shift)
 {
@@ -102,7 +108,6 @@ static __forceinline__ __device__ uint32_t rotate(uint32_t vec4, uint32_t shift)
 	return ret;
 }
 
-
 static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift)
 {
 	uint32_t ret;
@@ -110,7 +115,6 @@ static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift
 	return ret;
 }
 
-
 static __device__ __inline__ uint8 __ldg8(const uint8_t *ptr)
 {
 	uint8 test;
@@ -119,7 +123,6 @@ static __device__ __inline__ uint8 __ldg8(const uint8_t *ptr)
 	return (test);
 }
 
-
 static __device__ __inline__ uint32_t __ldgtoint(const uint8_t *ptr)
 {
 	uint32_t test;
@@ -204,7 +207,7 @@ static __device__ __inline__ uint32_t __ldgtoint_unaligned2(const uint8_t *ptr)
 	return (test);
 }
 
 
-
+#endif
 
 static __forceinline__ __device__ uint8 swapvec(const uint8 *buf)
diff --git a/pluck/cuda_pluck.cu b/pluck/cuda_pluck.cu
index cbb21cb..dfd9420 100644
--- a/pluck/cuda_pluck.cu
+++ b/pluck/cuda_pluck.cu
@@ -34,9 +34,10 @@
 #include <stdio.h>
 #include <memory.h>
 
-#include "cuda_helper.h"
 #include "cuda_vector.h"
 
+#include "miner.h"
+
 uint32_t *d_PlNonce[MAX_GPUS];
 
 __device__ uint8_t * hashbuffer;
@@ -297,8 +298,8 @@ void pluck_gpu_hash0_v50(uint32_t threads, uint32_t startNonce)
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
+#if __CUDA_ARCH__ >= 320
 		const uint32_t nonce = startNonce + thread;
-
 		uint32_t shift = SHIFT * thread;
 		((uint8*)(hashbuffer + shift))[0] = sha256_80(nonce);
 		((uint8*)(hashbuffer + shift))[1] = make_uint8(0, 0, 0, 0, 0, 0, 0, 0);
@@ -308,7 +309,7 @@
 			uint32_t randseed[16];
 			uint32_t randbuffer[16];
 			uint32_t joint[16];
-			uint8 Buffbuffer[2];
+//			uint8 Buffbuffer[2];
 
 			((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32 * i - 64]);
 			((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32 * i - 32]);
@@ -339,8 +340,8 @@
 				(hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]);
 				(hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]);
 			}
-
 		} // main loop
+#endif
 	}
 }
 
@@ -351,8 +352,8 @@ void pluck_gpu_hash_v50(uint32_t threads, uint32_t startNonce, uint32_t *nonceVe
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
+#if __CUDA_ARCH__ >= 320
 		const uint32_t nonce = startNonce + thread;
-
 		uint32_t shift = SHIFT * thread;
 
 		for (int i = 5; i < HASH_MEMORY - 1; i++)
@@ -405,7 +406,7 @@ void pluck_gpu_hash_v50(uint32_t threads, uint32_t startNonce, uint32_t *nonceVe
 		if (outbuf <= pTarget[7]) {
 			nonceVector[0] = nonce;
 		}
-
+#endif
 	}
 }
 
@@ -415,6 +416,7 @@ void pluck_gpu_hash0(uint32_t threads, uint32_t startNonce)
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
+#if __CUDA_ARCH__ >= 320
 		const uint32_t nonce = startNonce + thread;
 		uint32_t shift = SHIFT * thread;
 
@@ -426,7 +428,7 @@ void pluck_gpu_hash0(uint32_t threads, uint32_t startNonce)
 			uint32_t randseed[16];
 			uint32_t randbuffer[16];
 			uint32_t joint[16];
-			uint8 Buffbuffer[2];
+//			uint8 Buffbuffer[2];
 
 			((uint8*)randseed)[0] = __ldg8(&(hashbuffer + shift)[32 * i - 64]);
 			((uint8*)randseed)[1] = __ldg8(&(hashbuffer + shift)[32 * i - 32]);
@@ -457,9 +459,8 @@
 				(hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]);
 				(hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]);
 			}
-
 		} // main loop
-
+#endif
 	}
 }
 
@@ -469,6 +470,7 @@ void pluck_gpu_hash(uint32_t threads, uint32_t startNonce, uint32_t *nonceVector
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
+#if __CUDA_ARCH__ >= 320
 		const uint32_t nonce = startNonce + thread;
 		uint32_t shift = SHIFT * thread;
 
@@ -508,7 +510,6 @@
 			((uint16*)randseed)[0] ^= ((uint16*)Buffbuffer)[0];
 			((uint16*)randbuffer)[0] = xor_salsa8(((uint16*)randseed)[0]);
 
-
 			for (int j = 0; j < 32; j += 2)
 			{
 				uint32_t rand = randbuffer[j / 2] % randmax;
@@ -518,7 +519,6 @@
 				(hashbuffer + shift)[rand + 2] = __ldg(&(hashbuffer + shift)[randmax + j + 2]);
 				(hashbuffer + shift)[rand + 3] = __ldg(&(hashbuffer + shift)[randmax + j + 3]);
 			}
-
 		} // main loop
 
 		uint32_t outbuf = __ldgtoint(&(hashbuffer + shift)[28]);
@@ -526,7 +526,7 @@
 		if (outbuf <= pTarget[7]) {
 			nonceVector[0] = nonce;
 		}
-
+#endif
 	}
 }
 
@@ -549,7 +549,10 @@ uint32_t pluck_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, int
 	dim3 grid50((threads + 256 - 1) / 256);
 	dim3 block50(256);
 
-	if (device_sm[device_map[thr_id]] >= 500) {
+	if (device_sm[device_map[thr_id]] <= 300) {
+		applog(LOG_ERR,"Sorry pluck not supported on SM 3.0 devices");
+		return 0;
+	} else if (device_sm[device_map[thr_id]] >= 500) {
 		pluck_gpu_hash0_v50 <<< grid50, block50 >>>(threads, startNounce);
 		pluck_gpu_hash_v50 <<< grid50, block50 >>>(threads, startNounce, d_PlNonce[thr_id]);
 	} else {
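
The guard pattern above works because __CUDA_ARCH__ is defined only while nvcc
compiles device code, and then to the target's compute capability (320 for
SM 3.2). Below SM 3.2, cuda_vector.h maps rotate/rotateR back to the plain
ROTL32/ROTR32 shift macros and the __ldg-based kernel bodies in cuda_pluck.cu
are compiled out; since a compiled-out kernel would launch and silently do
nothing, the host-side device_sm check that logs an error and returns 0 is the
runtime counterpart of the compile-time guard. A minimal standalone sketch of
the same pattern follows (the helper and kernel names are illustrative only,
not from ccminer):

    // rotl32_demo.cu -- compile-time dispatch on __CUDA_ARCH__
    #include <stdint.h>

    __device__ __forceinline__ uint32_t rotl32_demo(uint32_t x, uint32_t n)
    {
    #if __CUDA_ARCH__ >= 320
        // SM 3.2+: a single funnel-shift instruction, the same operation
        // cuda_vector.h's rotate() emits through inline PTX
        return __funnelshift_l(x, x, n);
    #else
        // Older devices (and the host pass, where __CUDA_ARCH__ is
        // undefined): portable shift/or, equivalent to the ROTL32 macro
        return (x << n) | (x >> (32U - n));
    #endif
    }

    __global__ void rotl_kernel(uint32_t *out, uint32_t v)
    {
        // rotate by a fixed, nonzero amount so the fallback branch never
        // hits the undefined 32-bit shift that n == 0 would cause
        out[threadIdx.x] = rotl32_demo(v, 7);
    }

Compiling this with something like
nvcc -gencode arch=compute_30,code=sm_30 -gencode arch=compute_52,code=sm_52 rotl32_demo.cu
builds both variants into one fat binary and the driver selects the matching
one at load time, which is why the guarded source now compiles cleanly even
when SM 3.0 is among the targets.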