Browse Source

Try to restore compat with 2.1 devices (GTX 460)

master
Tanguy Pruvot 10 years ago
parent
commit
9d3d09103b
  1. 8
      JHA/cuda_jha_compactionTest.cu
  2. 8
      Makefile.am
  3. 8
      Makefile.in
  4. 9
      bitslice_transformations_quad.cu
  5. 4
      config.sh
  6. 9
      groestl_functions_quad.cu
  7. 8
      quark/cuda_quark_compactionTest.cu
  8. 16
      x11/cuda_x11_simd512.cu

8
JHA/cuda_jha_compactionTest.cu

@ -60,6 +60,14 @@ __host__ void jackpot_compactTest_cpu_init(int thr_id, int threads)
cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block) cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
} }
#if __CUDA_ARCH__ < 300
/**
* Reference behaviour of the real intrinsic:
* __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
*
* The macro below does NOT emulate that behaviour. It replaces every
* __shfl_up(var, delta, width) call with the constant 0 and ignores all three
* arguments, so no value is exchanged between lanes on this code path.
* Presumably it exists only so the file builds for targets below compute
* capability 3.0 - TODO confirm that results are not relied upon there.
* NOTE(review): an undefined __CUDA_ARCH__ evaluates as 0 in #if, so this
* branch is also taken whenever the macro is not defined - confirm intended.
*/
#undef __shfl_up
#define __shfl_up(var, delta, width) (0)
#endif
// Die Summenfunktion (vom NVIDIA SDK) // Die Summenfunktion (vom NVIDIA SDK)
__global__ void jackpot_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) __global__ void jackpot_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
{ {

8
Makefile.am

@ -48,15 +48,15 @@ ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -f
# we're now targeting all major compute architectures within one binary. # we're now targeting all major compute architectures within one binary.
.cu.o: .cu.o:
$(NVCC) @CFLAGS@ -I . -Xptxas "-abi=no -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< $(NVCC) @CFLAGS@ -I . -Xptxas "-v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
# Shavite compiles faster with 128 regs # Shavite compiles faster with 128 regs
x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=128 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< $(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=128 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
# ABI requiring code modules # ABI requiring code modules
quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< $(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< $(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<

8
Makefile.in

@ -1470,18 +1470,18 @@ uninstall-am: uninstall-binPROGRAMS
# we're now targeting all major compute architectures within one binary. # we're now targeting all major compute architectures within one binary.
.cu.o: .cu.o:
$(NVCC) @CFLAGS@ -I . -Xptxas "-abi=no -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< $(NVCC) @CFLAGS@ -I . -Xptxas "-v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
# Shavite compiles faster with 128 regs # Shavite compiles faster with 128 regs
x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=128 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< $(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=128 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
# ABI requiring code modules # ABI requiring code modules
quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< $(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $< $(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
# Tell versions [3.59,3.63) of GNU make to not export all variables. # Tell versions [3.59,3.63) of GNU make to not export all variables.
# Otherwise a system limit (for SysV at least) may be exceeded. # Otherwise a system limit (for SysV at least) may be exceeded.

9
bitslice_transformations_quad.cu

@ -1,4 +1,13 @@
#if __CUDA_ARCH__ < 300
/**
* Reference behaviour of the real intrinsic:
* __shfl() returns the value of var held by the thread whose ID is given by srcLane.
* If srcLane is outside the range 0..width-1, the thread's own value of var is returned.
*
* The macro below does NOT read another lane. It expands to the caller's own
* var cast to uint32_t and ignores srcLane and width, i.e. it always behaves
* like the "out of range" case described above.
* Presumably it exists only so the file builds for targets below compute
* capability 3.0 - TODO confirm that results are not relied upon there.
* NOTE(review): an undefined __CUDA_ARCH__ evaluates as 0 in #if, so this
* branch is also taken whenever the macro is not defined - confirm intended.
*/
#undef __shfl
#define __shfl(var, srcLane, width) (uint32_t)(var)
#endif
__device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *output) __device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *output)
{ {
int n = threadIdx.x % 4; int n = threadIdx.x % 4;

4
config.sh

@ -3,9 +3,11 @@
# Simple script to create the Makefile # Simple script to create the Makefile
# then type 'make' # then type 'make'
# export PATH="$PATH:/usr/local/cuda-6.5/bin/"
make clean || echo clean make clean || echo clean
rm -f config.status rm -f config.status
./autogen.sh || echo done ./autogen.sh || echo done
CFLAGS="-O2 -D_REENTRANT" ./configure CC=/usr/local/bin/colorgcc.pl CFLAGS="-O2 -D_REENTRANT" ./configure

9
groestl_functions_quad.cu

@ -240,6 +240,15 @@ __device__ __forceinline__ void G256_ShiftBytesQ_quad(uint32_t &x7, uint32_t &x6
x7 = __byte_perm(t0, t1, 0x5410); x7 = __byte_perm(t0, t1, 0x5410);
} }
#if __CUDA_ARCH__ < 300
/**
* Reference behaviour of the real intrinsic:
* __shfl() returns the value of var held by the thread whose ID is given by srcLane.
* If srcLane is outside the range 0..width-1, the thread's own value of var is returned.
*
* The macro below does NOT read another lane. It expands to the caller's own
* var cast to uint32_t and ignores srcLane and width, i.e. it always behaves
* like the "out of range" case described above.
* Presumably it exists only so the file builds for targets below compute
* capability 3.0 - TODO confirm that results are not relied upon there.
* NOTE(review): an undefined __CUDA_ARCH__ evaluates as 0 in #if, so this
* branch is also taken whenever the macro is not defined - confirm intended.
*/
#undef __shfl
#define __shfl(var, srcLane, width) (uint32_t)(var)
#endif
__device__ __forceinline__ void G256_MixFunction_quad(uint32_t *r) __device__ __forceinline__ void G256_MixFunction_quad(uint32_t *r)
{ {
#define SHIFT64_16(hi, lo) __byte_perm(lo, hi, 0x5432) #define SHIFT64_16(hi, lo) __byte_perm(lo, hi, 0x5432)

8
quark/cuda_quark_compactionTest.cu

@ -58,6 +58,14 @@ __host__ void quark_compactTest_cpu_init(int thr_id, int threads)
cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block) cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
} }
#if __CUDA_ARCH__ < 300
/**
* Reference behaviour of the real intrinsic:
* __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
*
* The macro below does NOT emulate that behaviour. It replaces every
* __shfl_up(var, delta, width) call with the constant 0 and ignores all three
* arguments, so no value is exchanged between lanes on this code path.
* Presumably it exists only so the file builds for targets below compute
* capability 3.0 - TODO confirm that results are not relied upon there.
* NOTE(review): an undefined __CUDA_ARCH__ evaluates as 0 in #if, so this
* branch is also taken whenever the macro is not defined - confirm intended.
*/
#undef __shfl_up
#define __shfl_up(var, delta, width) (0)
#endif
// Die Summenfunktion (vom NVIDIA SDK) // Die Summenfunktion (vom NVIDIA SDK)
__global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL) __global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
{ {

16
x11/cuda_x11_simd512.cu

@ -167,8 +167,21 @@ X(j) = (u-v) << (2*n); \
#undef BUTTERFLY #undef BUTTERFLY
} }
#if __CUDA_ARCH__ < 300
/**
* Reference behaviour of the real intrinsic:
* __shfl() returns the value of var held by the thread whose ID is given by srcLane.
* If srcLane is outside the range 0..width-1, the thread's own value of var is returned.
*
* The macro below does NOT read another lane. It expands to the caller's own
* var cast to uint32_t and ignores srcLane and width, i.e. it always behaves
* like the "out of range" case described above. Note that the cast yields an
* unsigned 32-bit value regardless of the type of var.
* Presumably it exists only so the file builds for targets below compute
* capability 3.0 - TODO confirm that results are not relied upon there.
* NOTE(review): an undefined __CUDA_ARCH__ evaluates as 0 in #if, so this
* branch is also taken whenever the macro is not defined - confirm intended.
*/
#undef __shfl
#define __shfl(var, srcLane, width) (uint32_t)(var)
#endif
__device__ __forceinline__ void FFT_16(int *y) { __device__ __forceinline__ void FFT_16(int *y) {
#if __CUDA_ARCH__ < 300
#warning FFT_16() function is not compatible with SM 2.1 devices!
#endif
/* /*
* FFT_16 using w=2 as 16th root of unity * FFT_16 using w=2 as 16th root of unity
* Unrolled decimation in frequency (DIF) radix-2 NTT. * Unrolled decimation in frequency (DIF) radix-2 NTT.
@ -332,6 +345,9 @@ __device__ __forceinline__ void FFT_256_halfzero(int y[256]) {
__device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4) __device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4)
{ {
int i; int i;
#if __CUDA_ARCH__ < 300
#warning Expansion() function is not compatible with SM 2.1 Devices
#endif
/* Message Expansion using Number Theoretical Transform similar to FFT */ /* Message Expansion using Number Theoretical Transform similar to FFT */
int expanded[32]; int expanded[32];

Loading…
Cancel
Save