Try to restore compat with 2.1 devices (GTX 460)
This commit is contained in:
parent
2c8ccd0edc
commit
9d3d09103b
@ -60,6 +60,14 @@ __host__ void jackpot_compactTest_cpu_init(int thr_id, int threads)
|
||||
cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
|
||||
}
|
||||
|
||||
#if __CUDA_ARCH__ < 300
|
||||
/**
|
||||
* __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
|
||||
*/
|
||||
#undef __shfl_up
|
||||
#define __shfl_up(var, delta, width) (0)
|
||||
#endif
|
||||
|
||||
// Die Summenfunktion (vom NVIDIA SDK)
|
||||
__global__ void jackpot_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
|
||||
{
|
||||
|
@ -48,15 +48,15 @@ ccminer_CPPFLAGS = -msse2 @LIBCURL_CPPFLAGS@ @OPENMP_CFLAGS@ $(PTHREAD_FLAGS) -f
|
||||
|
||||
# we're now targeting all major compute architectures within one binary.
|
||||
.cu.o:
|
||||
$(NVCC) @CFLAGS@ -I . -Xptxas "-abi=no -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
||||
$(NVCC) @CFLAGS@ -I . -Xptxas "-v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
||||
|
||||
# Shavite compiles faster with 128 regs
|
||||
x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu
|
||||
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=128 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
||||
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=128 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
||||
|
||||
# ABI requiring code modules
|
||||
quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu
|
||||
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
||||
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
||||
|
||||
JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu
|
||||
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
||||
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
||||
|
@ -1470,18 +1470,18 @@ uninstall-am: uninstall-binPROGRAMS
|
||||
|
||||
# we're now targeting all major compute architectures within one binary.
|
||||
.cu.o:
|
||||
$(NVCC) @CFLAGS@ -I . -Xptxas "-abi=no -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
||||
$(NVCC) @CFLAGS@ -I . -Xptxas "-v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
||||
|
||||
# Shavite compiles faster with 128 regs
|
||||
x11/cuda_x11_shavite512.o: x11/cuda_x11_shavite512.cu
|
||||
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=no -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=128 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
||||
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=128 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
||||
|
||||
# ABI requiring code modules
|
||||
quark/cuda_quark_compactionTest.o: quark/cuda_quark_compactionTest.cu
|
||||
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
||||
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
||||
|
||||
JHA/cuda_jha_compactionTest.o: JHA/cuda_jha_compactionTest.cu
|
||||
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_30,code=\"sm_30,compute_30\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
||||
$(NVCC) -I . -I cudpp-2.1/include @CFLAGS@ -Xptxas "-abi=yes -v" -gencode=arch=compute_20,code=\"sm_21,compute_20\" -gencode=arch=compute_35,code=\"sm_35,compute_35\" --maxrregcount=80 --ptxas-options=-v $(JANSSON_INCLUDES) -o $@ -c $<
|
||||
|
||||
# Tell versions [3.59,3.63) of GNU make to not export all variables.
|
||||
# Otherwise a system limit (for SysV at least) may be exceeded.
|
||||
|
@ -1,4 +1,13 @@
|
||||
|
||||
#if __CUDA_ARCH__ < 300
|
||||
/**
|
||||
* __shfl() returns the value of var held by the thread whose ID is given by srcLane.
|
||||
* If srcLane is outside the range 0..width-1, the thread's own value of var is returned.
|
||||
*/
|
||||
#undef __shfl
|
||||
#define __shfl(var, srcLane, width) (uint32_t)(var)
|
||||
#endif
|
||||
|
||||
__device__ __forceinline__ void to_bitslice_quad(uint32_t *input, uint32_t *output)
|
||||
{
|
||||
int n = threadIdx.x % 4;
|
||||
|
@ -3,9 +3,11 @@
|
||||
# Simple script to create the Makefile
|
||||
# then type 'make'
|
||||
|
||||
# export PATH="$PATH:/usr/local/cuda-6.5/bin/"
|
||||
|
||||
make clean || echo clean
|
||||
|
||||
rm -f config.status
|
||||
./autogen.sh || echo done
|
||||
|
||||
CFLAGS="-O2 -D_REENTRANT" ./configure
|
||||
CC=/usr/local/bin/colorgcc.pl CFLAGS="-O2 -D_REENTRANT" ./configure
|
||||
|
@ -240,6 +240,15 @@ __device__ __forceinline__ void G256_ShiftBytesQ_quad(uint32_t &x7, uint32_t &x6
|
||||
x7 = __byte_perm(t0, t1, 0x5410);
|
||||
}
|
||||
|
||||
#if __CUDA_ARCH__ < 300
|
||||
/**
|
||||
* __shfl() returns the value of var held by the thread whose ID is given by srcLane.
|
||||
* If srcLane is outside the range 0..width-1, the thread’s own value of var is returned.
|
||||
*/
|
||||
#undef __shfl
|
||||
#define __shfl(var, srcLane, width) (uint32_t)(var)
|
||||
#endif
|
||||
|
||||
__device__ __forceinline__ void G256_MixFunction_quad(uint32_t *r)
|
||||
{
|
||||
#define SHIFT64_16(hi, lo) __byte_perm(lo, hi, 0x5432)
|
||||
|
@ -58,6 +58,14 @@ __host__ void quark_compactTest_cpu_init(int thr_id, int threads)
|
||||
cudaMalloc(&d_partSum[1][thr_id], sizeof(uint32_t) * s1); // BLOCKSIZE (Threads/Block)
|
||||
}
|
||||
|
||||
#if __CUDA_ARCH__ < 300
|
||||
/**
|
||||
* __shfl_up() calculates a source lane ID by subtracting delta from the caller's lane ID, and clamping to the range 0..width-1
|
||||
*/
|
||||
#undef __shfl_up
|
||||
#define __shfl_up(var, delta, width) (0)
|
||||
#endif
|
||||
|
||||
// Die Summenfunktion (vom NVIDIA SDK)
|
||||
__global__ void quark_compactTest_gpu_SCAN(uint32_t *data, int width, uint32_t *partial_sums=NULL, cuda_compactTestFunction_t testFunc=NULL, int threads=0, uint32_t startNounce=0, uint32_t *inpHashes=NULL, uint32_t *d_validNonceTable=NULL)
|
||||
{
|
||||
|
@ -167,8 +167,21 @@ X(j) = (u-v) << (2*n); \
|
||||
#undef BUTTERFLY
|
||||
}
|
||||
|
||||
#if __CUDA_ARCH__ < 300
|
||||
/**
|
||||
* __shfl() returns the value of var held by the thread whose ID is given by srcLane.
|
||||
* If srcLane is outside the range 0..width-1, the thread's own value of var is returned.
|
||||
*/
|
||||
#undef __shfl
|
||||
#define __shfl(var, srcLane, width) (uint32_t)(var)
|
||||
#endif
|
||||
|
||||
__device__ __forceinline__ void FFT_16(int *y) {
|
||||
|
||||
#if __CUDA_ARCH__ < 300
|
||||
#warning FFT_16() function is not compatible with SM 2.1 devices!
|
||||
#endif
|
||||
|
||||
/*
|
||||
* FFT_16 using w=2 as 16th root of unity
|
||||
* Unrolled decimation in frequency (DIF) radix-2 NTT.
|
||||
@ -332,6 +345,9 @@ __device__ __forceinline__ void FFT_256_halfzero(int y[256]) {
|
||||
__device__ __forceinline__ void Expansion(const uint32_t *data, uint4 *g_temp4)
|
||||
{
|
||||
int i;
|
||||
#if __CUDA_ARCH__ < 300
|
||||
#warning Expansion() function is not compatible with SM 2.1 Devices
|
||||
#endif
|
||||
|
||||
/* Message Expansion using Number Theoretical Transform similar to FFT */
|
||||
int expanded[32];
|
||||
|
Loading…
Reference in New Issue
Block a user