From 9d3d09103b87d2beb72181671deb3f407cd5292d Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Tue, 12 Aug 2014 18:07:50 +0200
Subject: [PATCH] Try to restore compat with 2.1 devices (GTX 460)

---
 JHA/cuda_jha_compactionTest.cu     |  8 ++++++++
 Makefile.am                        |  8 ++++----
 Makefile.in                        |  8 ++++----
 bitslice_transformations_quad.cu   |  9 +++++++++
 config.sh                          |  4 +++-
 groestl_functions_quad.cu          |  9 +++++++++
 quark/cuda_quark_compactionTest.cu |  8 ++++++++
 x11/cuda_x11_simd512.cu            | 16 ++++++++++++++++
 8 files changed, 61 insertions(+), 9 deletions(-)

diff --git a/JHA/cuda_jha_compactionTest.cu b/JHA/cuda_jha_compactionTest.cu
index e043e2c..22c0da4 100644
--- a/JHA/cuda_jha_compactionTest.cu
+++ b/JHA/cuda_jha_compactionTest.cu
@@ -60,6 +60,14 @@ __host__ void jackpot_compactTest_cpu_init(int thr_id, int threads)
 	cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
 }
 
+#if __CUDA_ARCH__ < 300
+/**
+ * __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
+ */
+#undef __shfl_up
+#define __shfl_up(var, delta, width) (0)
+#endif
+
 // Die Summenfunktion (vom NVIDIA SDK)
 __global__ void jackpot_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
 {
diff --git a/Makefile.am b/Makefile.am
index e5ba361..8b6387b 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -48,15 +48,15 @@ ccminer_CPPFLAGS	= -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -f
 
 # we're now targeting all major compute architectures within one binary.
 .cu.o:
-	$(NVCC) @CFLAGS@ -I . -Xptxas "-abi=no -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
+	$(NVCC) @CFLAGS@ -I . -Xptxas "-v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
 
 # Shavite compiles faster with 128 regs
 x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu
-	$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=128 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<        
+	$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=128 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<        
 
 # ABI requiring code modules
 quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu
-	$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
+	$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
 
 JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu
-	$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
+	$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
diff --git a/Makefile.in b/Makefile.in
index da86612..1ac3a36 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -1470,18 +1470,18 @@ uninstall-am: uninstall-binPROGRAMS
 
 # we're now targeting all major compute architectures within one binary.
 .cu.o:
-	$(NVCC) @CFLAGS@ -I . -Xptxas "-abi=no -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
+	$(NVCC) @CFLAGS@ -I . -Xptxas "-v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
 
 # Shavite compiles faster with 128 regs
 x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu
-	$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=128 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<        
+	$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=128 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<        
 
 # ABI requiring code modules
 quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu
-	$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
+	$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
 
 JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu
-	$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
+	$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
 
 # Tell versions [3.59,3.63) of GNU make to not export all variables.
 # Otherwise a system limit (for SysV at least) may be exceeded.
diff --git a/bitslice_transformations_quad.cu b/bitslice_transformations_quad.cu
index f4a5f2a..c47f73d 100644
--- a/bitslice_transformations_quad.cu
+++ b/bitslice_transformations_quad.cu
@@ -1,4 +1,13 @@
 
+#if __CUDA_ARCH__ < 300
+/**
+ * __shfl() returns the value of var held by the thread whose ID is given by srcLane.
+ * If srcLane is outside the range 0..width-1, the thread's own value of var is returned.
+ */
+#undef __shfl
+#define __shfl(var, srcLane, width) (uint32_t)(var)
+#endif
+
 __device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *output)
 {
     int n = threadIdx.x % 4;
diff --git a/config.sh b/config.sh
index 87fda3c..d3f74d0 100755
--- a/config.sh
+++ b/config.sh
@@ -3,9 +3,11 @@
 # Simple script to create the Makefile
 # then type 'make'
 
+# export PATH="$PATH:/usr/local/cuda-6.5/bin/"
+
 make clean || echo clean
 
 rm -f config.status
 ./autogen.sh || echo done
 
-CFLAGS="-O2 -D_REENTRANT" ./configure
+CC=/usr/local/bin/colorgcc.pl CFLAGS="-O2 -D_REENTRANT" ./configure
diff --git a/groestl_functions_quad.cu b/groestl_functions_quad.cu
index 745a843..b7839ab 100644
--- a/groestl_functions_quad.cu
+++ b/groestl_functions_quad.cu
@@ -240,6 +240,15 @@ __device__ __forceinline__ void G256_ShiftBytesQ_quad(uint32_t &x7, uint32_t &x6
     x7 = __byte_perm(t0, t1, 0x5410);
 }
 
+#if __CUDA_ARCH__ < 300
+/**
+ * __shfl() returns the value of var held by the thread whose ID is given by srcLane.
+ * If srcLane is outside the range 0..width-1, the thread’s own value of var is returned.
+ */
+#undef __shfl
+#define __shfl(var, srcLane, width) (uint32_t)(var)
+#endif
+
 __device__ __forceinline__ void G256_MixFunction_quad(uint32_t *r)
 {
 #define SHIFT64_16(hi, lo)    __byte_perm(lo, hi, 0x5432)
diff --git a/quark/cuda_quark_compactionTest.cu b/quark/cuda_quark_compactionTest.cu
index ed2e2c9..2eb2852 100644
--- a/quark/cuda_quark_compactionTest.cu
+++ b/quark/cuda_quark_compactionTest.cu
@@ -58,6 +58,14 @@ __host__ void quark_compactTest_cpu_init(int thr_id, int threads)
 	cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
 }
 
+#if __CUDA_ARCH__ < 300
+/**
+ * __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
+ */
+#undef __shfl_up
+#define __shfl_up(var, delta, width) (0)
+#endif
+
 // Die Summenfunktion (vom NVIDIA SDK)
 __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
 {
diff --git a/x11/cuda_x11_simd512.cu b/x11/cuda_x11_simd512.cu
index 801910f..473fa8e 100644
--- a/x11/cuda_x11_simd512.cu
+++ b/x11/cuda_x11_simd512.cu
@@ -167,8 +167,21 @@ X(j) = (u-v) << (2*n); \
 #undef BUTTERFLY
 }
 
+#if __CUDA_ARCH__ < 300
+/**
+ * __shfl() returns the value of var held by the thread whose ID is given by srcLane.
+ * If srcLane is outside the range 0..width-1, the thread's own value of var is returned.
+ */
+#undef __shfl
+#define __shfl(var, srcLane, width) (uint32_t)(var)
+#endif
+
 __device__ __forceinline__ void FFT_16(int *y) {
 
+#if __CUDA_ARCH__ < 300
+#warning FFT_16() function is not compatible with SM 2.1 devices!
+#endif
+
   /*
    * FFT_16 using w=2 as 16th root of unity
    * Unrolled decimation in frequency (DIF) radix-2 NTT.
@@ -332,6 +345,9 @@ __device__ __forceinline__ void FFT_256_halfzero(int y[256]) {
 __device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4)
 {
   int i;
+#if __CUDA_ARCH__ < 300
+#warning Expansion() function is not compatible with SM 2.1 Devices
+#endif
 
   /* Message Expansion using Number Theoretical Transform similar to FFT */
   int expanded[32];