From 9d3d09103b87d2beb72181671deb3f407cd5292d Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Tue, 12 Aug 2014 18:07:50 +0200 Subject: [PATCH] Try to restore compat with 2.1 devices (GTX 460) --- JHA/cuda_jha_compactionTest.cu | 8 ++++++++ Makefile.am | 8 ++++---- Makefile.in | 8 ++++---- bitslice_transformations_quad.cu | 9 +++++++++ config.sh | 4 +++- groestl_functions_quad.cu | 9 +++++++++ quark/cuda_quark_compactionTest.cu | 8 ++++++++ x11/cuda_x11_simd512.cu | 16 ++++++++++++++++ 8 files changed, 61 insertions(+), 9 deletions(-) diff --git a/JHA/cuda_jha_compactionTest.cu b/JHA/cuda_jha_compactionTest.cu index e043e2c..22c0da4 100644 --- a/JHA/cuda_jha_compactionTest.cu +++ b/JHA/cuda_jha_compactionTest.cu @@ -60,6 +60,14 @@ __host__ void jackpot_compactTest_cpu_init(int thr_id, int threads) cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block) } +#if __CUDA_ARCH__ < 300 +/** + * __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1 + */ +#undef __shfl_up +#define __shfl_up(var, delta, width) (0) +#endif + // Die Summenfunktion (vom NVIDIA SDK) __global__ void jackpot_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) { diff --git a/Makefile.am b/Makefile.am index e5ba361..8b6387b 100644 --- a/Makefile.am +++ b/Makefile.am @@ -48,15 +48,15 @@ ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -f # we're now targeting all major compute architectures within one binary. .cu.o: - $(NVCC) @CFLAGS@ -I . -Xptxas "-abi=no -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< + $(NVCC) @CFLAGS@ -I . -Xptxas "-v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< # Shavite compiles faster with 128 regs x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu - $(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=128 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< + $(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=128 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< # ABI requiring code modules quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu - $(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< + $(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu - $(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< + $(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< diff --git a/Makefile.in b/Makefile.in index da86612..1ac3a36 100644 --- a/Makefile.in +++ b/Makefile.in @@ -1470,18 +1470,18 @@ uninstall-am: uninstall-binPROGRAMS # we're now targeting all major compute architectures within one binary. .cu.o: - $(NVCC) @CFLAGS@ -I . -Xptxas "-abi=no -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< + $(NVCC) @CFLAGS@ -I . -Xptxas "-v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< # Shavite compiles faster with 128 regs x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu - $(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=128 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< + $(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=128 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< # ABI requiring code modules quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu - $(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< + $(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu - $(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< + $(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< # Tell versions [3.59,3.63) of GNU make to not export all variables. # Otherwise a system limit (for SysV at least) may be exceeded. diff --git a/bitslice_transformations_quad.cu b/bitslice_transformations_quad.cu index f4a5f2a..c47f73d 100644 --- a/bitslice_transformations_quad.cu +++ b/bitslice_transformations_quad.cu @@ -1,4 +1,13 @@ +#if __CUDA_ARCH__ < 300 +/** + * __shfl() returns the value of var held by the thread whose ID is given by srcLane. + * If srcLane is outside the range 0..width-1, the thread's own value of var is returned. + */ +#undef __shfl +#define __shfl(var, srcLane, width) (uint32_t)(var) +#endif + __device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *output) { int n = threadIdx.x % 4; diff --git a/config.sh b/config.sh index 87fda3c..d3f74d0 100755 --- a/config.sh +++ b/config.sh @@ -3,9 +3,11 @@ # Simple script to create the Makefile # then type 'make' +# export PATH="$PATH:/usr/local/cuda-6.5/bin/" + make clean || echo clean rm -f config.status ./autogen.sh || echo done -CFLAGS="-O2 -D_REENTRANT" ./configure +CC=/usr/local/bin/colorgcc.pl CFLAGS="-O2 -D_REENTRANT" ./configure diff --git a/groestl_functions_quad.cu b/groestl_functions_quad.cu index 745a843..b7839ab 100644 --- a/groestl_functions_quad.cu +++ b/groestl_functions_quad.cu @@ -240,6 +240,15 @@ __device__ __forceinline__ void G256_ShiftBytesQ_quad(uint32_t &x7, uint32_t &x6 x7 = __byte_perm(t0, t1, 0x5410); } +#if __CUDA_ARCH__ < 300 +/** + * __shfl() returns the value of var held by the thread whose ID is given by srcLane. + * If srcLane is outside the range 0..width-1, the thread’s own value of var is returned. + */ +#undef __shfl +#define __shfl(var, srcLane, width) (uint32_t)(var) +#endif + __device__ __forceinline__ void G256_MixFunction_quad(uint32_t *r) { #define SHIFT64_16(hi, lo) __byte_perm(lo, hi, 0x5432) diff --git a/quark/cuda_quark_compactionTest.cu b/quark/cuda_quark_compactionTest.cu index ed2e2c9..2eb2852 100644 --- a/quark/cuda_quark_compactionTest.cu +++ b/quark/cuda_quark_compactionTest.cu @@ -58,6 +58,14 @@ __host__ void quark_compactTest_cpu_init(int thr_id, int threads) cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block) } +#if __CUDA_ARCH__ < 300 +/** + * __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1 + */ +#undef __shfl_up +#define __shfl_up(var, delta, width) (0) +#endif + // Die Summenfunktion (vom NVIDIA SDK) __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) { diff --git a/x11/cuda_x11_simd512.cu b/x11/cuda_x11_simd512.cu index 801910f..473fa8e 100644 --- a/x11/cuda_x11_simd512.cu +++ b/x11/cuda_x11_simd512.cu @@ -167,8 +167,21 @@ X(j) = (u-v) << (2*n); \ #undef BUTTERFLY } +#if __CUDA_ARCH__ < 300 +/** + * __shfl() returns the value of var held by the thread whose ID is given by srcLane. + * If srcLane is outside the range 0..width-1, the thread's own value of var is returned. + */ +#undef __shfl +#define __shfl(var, srcLane, width) (uint32_t)(var) +#endif + __device__ __forceinline__ void FFT_16(int *y) { +#if __CUDA_ARCH__ < 300 +#warning FFT_16() function is not compatible with SM 2.1 devices! +#endif + /* * FFT_16 using w=2 as 16th root of unity * Unrolled decimation in frequency (DIF) radix-2 NTT. @@ -332,6 +345,9 @@ __device__ __forceinline__ void FFT_256_halfzero(int y[256]) { __device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4) { int i; +#if __CUDA_ARCH__ < 300 +#warning Expansion() function is not compatible with SM 2.1 Devices +#endif /* Message Expansion using Number Theoretical Transform similar to FFT */ int expanded[32];