diff --git a/Makefile.am b/Makefile.am index 4360cdd..01fc6b1 100644 --- a/Makefile.am +++ b/Makefile.am @@ -111,9 +111,10 @@ endif #ccminer_LDADD += -lsodium ccminer_LDADD += -lcuda -nvcc_ARCH = -gencode=arch=compute_50,code=\"sm_50,compute_50\" - +nvcc_ARCH := +#nvcc_ARCH += -gencode=arch=compute_61,code=\"sm_61,compute_61\" nvcc_ARCH += -gencode=arch=compute_52,code=\"sm_52,compute_52\" +nvcc_ARCH += -gencode=arch=compute_50,code=\"sm_50,compute_50\" #nvcc_ARCH += -gencode=arch=compute_35,code=\"sm_35,compute_35\" #nvcc_ARCH += -gencode=arch=compute_30,code=\"sm_30,compute_30\" diff --git a/configure.ac b/configure.ac index 906f31d..08a340f 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [2.2.4], [], [ccminer], [http://github.com/tpruvot/ccminer]) +AC_INIT([ccminer], [2.2.5], [], [ccminer], [http://github.com/tpruvot/ccminer]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cuda_helper.h b/cuda_helper.h index 42e60d8..c51a325 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -669,4 +669,14 @@ static uint2 SHR2(uint2 a, int offset) #endif } +// CUDA 9+ deprecated functions warnings (new mask param) +#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300 +#undef __shfl +#define __shfl(var, srcLane, width) __shfl_sync(0xFFFFFFFFu, var, srcLane, width) +#undef __shfl_up +#define __shfl_up(var, delta, width) __shfl_up_sync(0xFFFFFFFF, var, delta, width) +#undef __any +#define __any(p) __any_sync(0xFFFFFFFFu, p) +#endif + #endif // #ifndef CUDA_HELPER_H diff --git a/equi/cuda_equi.cu b/equi/cuda_equi.cu index b7bcbb5..ea1e841 100644 --- a/equi/cuda_equi.cu +++ b/equi/cuda_equi.cu @@ -65,7 +65,8 @@ #define __CUDA_ARCH__ 520 uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z); uint32_t __byte_perm(uint32_t x, uint32_t y, uint32_t z); -uint32_t __shfl(uint32_t x, uint32_t y, uint32_t z); +uint32_t __shfl2(uint32_t x, uint32_t y); +uint32_t __shfl_sync(uint32_t mask, uint32_t x, uint32_t y); uint32_t atomicExch(uint32_t *x, uint32_t y); uint32_t atomicAdd(uint32_t *x, uint32_t y); void __syncthreads(void); @@ -79,6 +80,14 @@ u32 umin(const u32, const u32); u32 umax(const u32, const u32); #endif +#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300 +#define __shfl2(var, srcLane) __shfl_sync(0xFFFFFFFFu, var, srcLane) +#undef __any +#define __any(p) __any_sync(0xFFFFFFFFu, p) +#else +#define __shfl2 __shfl +#endif + typedef u32 proof[PROOFSIZE]; struct __align__(32) slot { @@ -1844,7 +1853,7 @@ __global__ void digit_last_wdc(equi* eq) } #if __CUDA_ARCH__ >= 300 // all threads get the value from lane 0 - soli = __shfl(soli, 0); + soli = __shfl2(soli, 0); #else __syncthreads(); soli = eq->edata.srealcont.nsols; diff --git a/lyra2/cuda_lyra2_vectors.h b/lyra2/cuda_lyra2_vectors.h index 63e3fcb..6bb11d3 100644 --- a/lyra2/cuda_lyra2_vectors.h +++ b/lyra2/cuda_lyra2_vectors.h @@ -16,6 +16,12 @@ #define __shfl(x, y, z) (x) #endif +#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300 +#define __shfl2(var, srcLane) __shfl_sync(0xFFFFFFFFu, var, srcLane) +#else +#define __shfl2 __shfl +#endif + #if __CUDA_ARCH__ < 320 && !defined(__ldg4) #define __ldg4(x) (*(x)) #endif @@ -89,7 +95,7 @@ typedef struct __align__(16) uint28 { typedef uint2x4 uint28; /* name deprecated */ typedef struct __builtin_align__(32) uint48 { - uint4 s0,s1; + uint4 s0,s1; } uint48; typedef struct __builtin_align__(128) uint4x16{ @@ -368,10 +374,10 @@ static __forceinline__ __device__ void operator^= (ulonglong2to8 &a, const ulong static __forceinline__ __device__ void operator+= (uint4 &a, uint4 b) { a = a + 
b; } static __forceinline__ __device__ void operator+= (uchar4 &a, uchar4 b) { a = a + b; } -static __forceinline__ __device__ __host__ void operator+= (uint8 &a, const uint8 &b) { a = a + b; } -static __forceinline__ __device__ __host__ void operator+= (uint16 &a, const uint16 &b) { a = a + b; } -static __forceinline__ __device__ void operator+= (uint2_16 &a, const uint2_16 &b) { a = a + b; } -static __forceinline__ __device__ void operator^= (uint2_16 &a, const uint2_16 &b) { a = a + b; } +static __forceinline__ __device__ __host__ void operator+= (uint8 &a, const uint8 &b) { a = a + b; } +static __forceinline__ __device__ __host__ void operator+= (uint16 &a, const uint16 &b) { a = a + b; } +static __forceinline__ __device__ void operator+= (uint2_16 &a, const uint2_16 &b) { a = a + b; } +static __forceinline__ __device__ void operator^= (uint2_16 &a, const uint2_16 &b) { a = a + b; } static __forceinline__ __device__ void operator+= (ulong8 &a, const ulong8 &b) { a = a + b; } static __forceinline__ __device__ void operator+= (ulonglong16 &a, const ulonglong16 &b) { a = a + b; } @@ -551,14 +557,14 @@ static __device__ __forceinline__ uint28 shuffle4(const uint28 &var, int lane) { #if __CUDA_ARCH__ >= 300 uint28 res; - res.x.x = __shfl(var.x.x, lane); - res.x.y = __shfl(var.x.y, lane); - res.y.x = __shfl(var.y.x, lane); - res.y.y = __shfl(var.y.y, lane); - res.z.x = __shfl(var.z.x, lane); - res.z.y = __shfl(var.z.y, lane); - res.w.x = __shfl(var.w.x, lane); - res.w.y = __shfl(var.w.y, lane); + res.x.x = __shfl2(var.x.x, lane); + res.x.y = __shfl2(var.x.y, lane); + res.y.x = __shfl2(var.y.x, lane); + res.y.y = __shfl2(var.y.y, lane); + res.z.x = __shfl2(var.z.x, lane); + res.z.y = __shfl2(var.z.y, lane); + res.w.x = __shfl2(var.w.x, lane); + res.w.y = __shfl2(var.w.y, lane); return res; #else return var; @@ -569,22 +575,22 @@ static __device__ __forceinline__ ulonglong4 shuffle4(ulonglong4 var, int lane) { #if __CUDA_ARCH__ >= 300 ulonglong4 res; - uint2 temp; + uint2 temp; temp = vectorize(var.x); - temp.x = __shfl(temp.x, lane); - temp.y = __shfl(temp.y, lane); + temp.x = __shfl2(temp.x, lane); + temp.y = __shfl2(temp.y, lane); res.x = devectorize(temp); temp = vectorize(var.y); - temp.x = __shfl(temp.x, lane); - temp.y = __shfl(temp.y, lane); + temp.x = __shfl2(temp.x, lane); + temp.y = __shfl2(temp.y, lane); res.y = devectorize(temp); temp = vectorize(var.z); - temp.x = __shfl(temp.x, lane); - temp.y = __shfl(temp.y, lane); + temp.x = __shfl2(temp.x, lane); + temp.y = __shfl2(temp.y, lane); res.z = devectorize(temp); temp = vectorize(var.w); - temp.x = __shfl(temp.x, lane); - temp.y = __shfl(temp.y, lane); + temp.x = __shfl2(temp.x, lane); + temp.y = __shfl2(temp.y, lane); res.w = devectorize(temp); return res; #else diff --git a/scrypt/kepler_kernel.cu b/scrypt/kepler_kernel.cu index c67806e..247fdb8 100644 --- a/scrypt/kepler_kernel.cu +++ b/scrypt/kepler_kernel.cu @@ -10,6 +10,8 @@ #include #include +#include + #include "miner.h" #include "salsa_kernel.h" @@ -18,6 +20,12 @@ #define TEXWIDTH 32768 #define THREADS_PER_WU 4 // four threads per hash +#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300 +#define __shfl2(var, srcLane) __shfl_sync(0xFFFFFFFFu, var, srcLane) +#else +#define __shfl2 __shfl +#endif + typedef enum { ANDERSEN, @@ -57,12 +65,12 @@ static __host__ __device__ uint4& operator += (uint4& left, const uint4& right) return left; } -static __device__ uint4 __shfl(const uint4 bx, int target_thread) { +static __device__ uint4 shfl4(const uint4 bx, int target_thread) { return 
make_uint4( - __shfl((int)bx.x, target_thread), - __shfl((int)bx.y, target_thread), - __shfl((int)bx.z, target_thread), - __shfl((int)bx.w, target_thread) + __shfl2((int)bx.x, target_thread), + __shfl2((int)bx.y, target_thread), + __shfl2((int)bx.z, target_thread), + __shfl2((int)bx.w, target_thread) ); } @@ -97,8 +105,8 @@ void write_keys_direct(const uint4 &b, const uint4 &bx, uint32_t start) if (SCHEME == ANDERSEN) { int target_thread = (threadIdx.x + 4)%32; - uint4 t=b, t2=__shfl(bx, target_thread); - int t2_start = __shfl((int)start, target_thread) + 4; + uint4 t = b, t2 = shfl4(bx, target_thread); + int t2_start = __shfl2((int)start, target_thread) + 4; bool c = (threadIdx.x & 0x4); *((uint4 *)(&scratch[c ? t2_start : start])) = (c ? t2 : t); *((uint4 *)(&scratch[c ? start : t2_start])) = (c ? t : t2); @@ -115,7 +123,7 @@ void read_keys_direct(uint4 &b, uint4 &bx, uint32_t start) if (TEX_DIM == 0) scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32]; if (SCHEME == ANDERSEN) { - int t2_start = __shfl((int)start, (threadIdx.x + 4)%32) + 4; + int t2_start = __shfl2((int)start, (threadIdx.x + 4)%32) + 4; if (TEX_DIM > 0) { start /= 4; t2_start /= 4; } bool c = (threadIdx.x & 0x4); if (TEX_DIM == 0) { @@ -129,7 +137,7 @@ void read_keys_direct(uint4 &b, uint4 &bx, uint32_t start) bx = tex2D(texRef2D_4_V, 0.5f + ((c ? start : t2_start)%TEXWIDTH), 0.5f + ((c ? start : t2_start)/TEXWIDTH)); } uint4 tmp = b; b = (c ? bx : b); bx = (c ? tmp : bx); - bx = __shfl(bx, (threadIdx.x + 28)%32); + bx = shfl4(bx, (threadIdx.x + 28)%32); } else { if (TEX_DIM == 0) b = *((uint4 *)(&scratch[start])); else if (TEX_DIM == 1) b = tex1Dfetch(texRef1D_4_V, start/4); @@ -149,14 +157,15 @@ void primary_order_shuffle(uint4 &b, uint4 &bx) int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+2)&0x3); int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 0x03)+3)&0x3); - b.w = __shfl((int)b.w, x1); - b.z = __shfl((int)b.z, x2); - b.y = __shfl((int)b.y, x3); + b.w = __shfl2((int)b.w, x1); + b.z = __shfl2((int)b.z, x2); + b.y = __shfl2((int)b.y, x3); + uint32_t tmp = b.y; b.y = b.w; b.w = tmp; - bx.w = __shfl((int)bx.w, x1); - bx.z = __shfl((int)bx.z, x2); - bx.y = __shfl((int)bx.y, x3); + bx.w = __shfl2((int)bx.w, x1); + bx.z = __shfl2((int)bx.z, x2); + bx.y = __shfl2((int)bx.y, x3); tmp = bx.y; bx.y = bx.w; bx.w = tmp; } @@ -318,9 +327,9 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x /* Unclear if this optimization is needed: These are ordered based * upon the dependencies needed in the later xors. Compiler should be * able to figure this out, but might as well give it a hand. 
*/ - x.y = __shfl((int)x.y, x3); - x.w = __shfl((int)x.w, x1); - x.z = __shfl((int)x.z, x2); + x.y = __shfl2((int)x.y, x3); + x.w = __shfl2((int)x.w, x1); + x.z = __shfl2((int)x.z, x2); /* The next XOR_ROTATE_ADDS could be written to be a copy-paste of the first, * but the register targets are rewritten here to swap x[1] and x[3] so that @@ -333,9 +342,9 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x XOR_ROTATE_ADD(x.y, x.z, x.w, 13); XOR_ROTATE_ADD(x.x, x.y, x.z, 18); - x.w = __shfl((int)x.w, x3); - x.y = __shfl((int)x.y, x1); - x.z = __shfl((int)x.z, x2); + x.w = __shfl2((int)x.w, x3); + x.y = __shfl2((int)x.y, x1); + x.z = __shfl2((int)x.z, x2); } b += x; @@ -352,18 +361,18 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x XOR_ROTATE_ADD(x.w, x.z, x.y, 13); XOR_ROTATE_ADD(x.x, x.w, x.z, 18); - x.y = __shfl((int)x.y, x3); - x.w = __shfl((int)x.w, x1); - x.z = __shfl((int)x.z, x2); + x.y = __shfl2((int)x.y, x3); + x.w = __shfl2((int)x.w, x1); + x.z = __shfl2((int)x.z, x2); XOR_ROTATE_ADD(x.w, x.x, x.y, 7); XOR_ROTATE_ADD(x.z, x.w, x.x, 9); XOR_ROTATE_ADD(x.y, x.z, x.w, 13); XOR_ROTATE_ADD(x.x, x.y, x.z, 18); - x.w = __shfl((int)x.w, x3); - x.y = __shfl((int)x.y, x1); - x.z = __shfl((int)x.z, x2); + x.w = __shfl2((int)x.w, x3); + x.y = __shfl2((int)x.y, x1); + x.z = __shfl2((int)x.z, x2); } // At the end of these iterations, the data is in primary order again. @@ -407,9 +416,9 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8) CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7) - x.y = __shfl((int)x.y, x1); - x.z = __shfl((int)x.z, x2); - x.w = __shfl((int)x.w, x3); + x.y = __shfl2((int)x.y, x1); + x.z = __shfl2((int)x.z, x2); + x.w = __shfl2((int)x.w, x3); // Diagonal Mixing phase of chacha CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16) @@ -417,9 +426,9 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8) CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7) - x.y = __shfl((int)x.y, x3); - x.z = __shfl((int)x.z, x2); - x.w = __shfl((int)x.w, x1); + x.y = __shfl2((int)x.y, x3); + x.z = __shfl2((int)x.z, x2); + x.w = __shfl2((int)x.w, x1); } b += x; @@ -436,9 +445,9 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8) CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7) - x.y = __shfl((int)x.y, x1); - x.z = __shfl((int)x.z, x2); - x.w = __shfl((int)x.w, x3); + x.y = __shfl2((int)x.y, x1); + x.z = __shfl2((int)x.z, x2); + x.w = __shfl2((int)x.w, x3); // Diagonal Mixing phase of chacha CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16) @@ -446,9 +455,9 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8) CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7) - x.y = __shfl((int)x.y, x3); - x.z = __shfl((int)x.z, x2); - x.w = __shfl((int)x.w, x1); + x.y = __shfl2((int)x.y, x3); + x.z = __shfl2((int)x.z, x2); + x.w = __shfl2((int)x.w, x1); } #undef CHACHA_PRIMITIVE @@ -572,7 +581,7 @@ void kepler_scrypt_core_kernelB(uint32_t *d_odata, int begin, int end) } else load_key(d_odata, b, bx); for (int i = begin; i < end; i++) { - int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); + int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); uint4 t, tx; read_keys_direct(t, tx, start+32*j); b ^= t; bx ^= tx; block_mixer(b, bx, x1, x2, x3); @@ -604,7 +613,7 @@ void kepler_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsign { // better divergent 
thread handling submitted by nVidia engineers, but // supposedly this does not run with the ANDERSEN memory access scheme - int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); + int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); int pos = j/LOOKUP_GAP; int loop = -1; uint4 t, tx; @@ -612,7 +621,7 @@ void kepler_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsign int i = begin; while(i < end) { if (loop==-1) { - j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); + j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); pos = j/LOOKUP_GAP; loop = j-pos*LOOKUP_GAP; read_keys_direct(t, tx, start+32*pos); @@ -634,7 +643,7 @@ void kepler_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsign // this is my original implementation, now used with the ANDERSEN // memory access scheme only. for (int i = begin; i < end; i++) { - int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); + int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP; uint4 t, tx; read_keys_direct(t, tx, start+32*pos); while(loop--) block_mixer(t, tx, x1, x2, x3); @@ -644,7 +653,7 @@ void kepler_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsign } //for (int i = begin; i < end; i++) { -// int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); +// int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); // int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP; // uint4 t, tx; read_keys_direct(t, tx, start+32*pos); // while(loop--) block_mixer(t, tx, x1, x2, x3); diff --git a/scrypt/nv_kernel.cu b/scrypt/nv_kernel.cu index 9a79eb9..90b4c8a 100644 --- a/scrypt/nv_kernel.cu +++ b/scrypt/nv_kernel.cu @@ -11,7 +11,8 @@ #include -#include "cuda_runtime.h" +#include +#include #include "miner.h" #include "salsa_kernel.h" @@ -176,13 +177,14 @@ static __device__ uint4& operator^=(uint4& left, const uint4& right) return left; } -__device__ __forceinline__ uint4 __shfl(const uint4 val, unsigned int lane, unsigned int width) +__device__ __forceinline__ uint4 shfl4(const uint4 val, unsigned int lane, unsigned int width) { return make_uint4( (unsigned int)__shfl((int)val.x, lane, width), (unsigned int)__shfl((int)val.y, lane, width), (unsigned int)__shfl((int)val.z, lane, width), - (unsigned int)__shfl((int)val.w, lane, width)); + (unsigned int)__shfl((int)val.w, lane, width) + ); } __device__ __forceinline__ void __transposed_write_BC(uint4 (&B)[4], uint4 (&C)[4], uint4 *D, int spacing) @@ -208,13 +210,13 @@ __device__ __forceinline__ void __transposed_write_BC(uint4 (&B)[4], uint4 (&C)[ // rotate rows T1[0] = B[0]; - T1[1] = __shfl(B[1], lane8 + 7, 8); - T1[2] = __shfl(B[2], lane8 + 6, 8); - T1[3] = __shfl(B[3], lane8 + 5, 8); - T1[4] = __shfl(C[0], lane8 + 4, 8); - T1[5] = __shfl(C[1], lane8 + 3, 8); - T1[6] = __shfl(C[2], lane8 + 2, 8); - T1[7] = __shfl(C[3], lane8 + 1, 8); + T1[1] = shfl4(B[1], lane8 + 7, 8); + T1[2] = shfl4(B[2], lane8 + 6, 8); + T1[3] = shfl4(B[3], lane8 + 5, 8); + T1[4] = shfl4(C[0], lane8 + 4, 8); + T1[5] = shfl4(C[1], lane8 + 3, 8); + T1[6] = shfl4(C[2], lane8 + 2, 8); + T1[7] = shfl4(C[3], lane8 + 1, 8); /* Matrix after row rotates: @@ -301,13 +303,13 @@ template __device__ __forceinline__ void __transposed_read_BC(cons // rotate rows B[0] = T2[0]; - B[1] = __shfl(T2[1], lane8 + 1, 8); - B[2] = __shfl(T2[2], lane8 + 2, 8); - B[3] = __shfl(T2[3], lane8 + 3, 8); - C[0] = __shfl(T2[4], lane8 + 4, 8); - C[1] = __shfl(T2[5], lane8 + 5, 8); - C[2] = __shfl(T2[6], lane8 + 6, 8); - 
C[3] = __shfl(T2[7], lane8 + 7, 8); + B[1] = shfl4(T2[1], lane8 + 1, 8); + B[2] = shfl4(T2[2], lane8 + 2, 8); + B[3] = shfl4(T2[3], lane8 + 3, 8); + C[0] = shfl4(T2[4], lane8 + 4, 8); + C[1] = shfl4(T2[5], lane8 + 5, 8); + C[2] = shfl4(T2[6], lane8 + 6, 8); + C[3] = shfl4(T2[7], lane8 + 7, 8); } diff --git a/scrypt/nv_kernel2.cu b/scrypt/nv_kernel2.cu index 0e94106..5f4e2c8 100644 --- a/scrypt/nv_kernel2.cu +++ b/scrypt/nv_kernel2.cu @@ -12,6 +12,7 @@ #include #include +#include #include "miner.h" #include "salsa_kernel.h" @@ -117,13 +118,14 @@ static __device__ uint4& operator^=(uint4& left, const uint4& right) return left; } -__device__ __forceinline__ uint4 __shfl(const uint4 val, unsigned int lane, unsigned int width) +__device__ __forceinline__ uint4 shfl4(const uint4 val, unsigned int lane, unsigned int width) { return make_uint4( (unsigned int)__shfl((int)val.x, lane, width), (unsigned int)__shfl((int)val.y, lane, width), (unsigned int)__shfl((int)val.z, lane, width), - (unsigned int)__shfl((int)val.w, lane, width)); + (unsigned int)__shfl((int)val.w, lane, width) + ); } __device__ __forceinline__ void __transposed_write_BC(uint4 (&B)[4], uint4 (&C)[4], uint4 *D, int spacing) @@ -149,13 +151,13 @@ __device__ __forceinline__ void __transposed_write_BC(uint4 (&B)[4], uint4 (&C)[ // rotate rows T1[0] = B[0]; - T1[1] = __shfl(B[1], lane8 + 7, 8); - T1[2] = __shfl(B[2], lane8 + 6, 8); - T1[3] = __shfl(B[3], lane8 + 5, 8); - T1[4] = __shfl(C[0], lane8 + 4, 8); - T1[5] = __shfl(C[1], lane8 + 3, 8); - T1[6] = __shfl(C[2], lane8 + 2, 8); - T1[7] = __shfl(C[3], lane8 + 1, 8); + T1[1] = shfl4(B[1], lane8 + 7, 8); + T1[2] = shfl4(B[2], lane8 + 6, 8); + T1[3] = shfl4(B[3], lane8 + 5, 8); + T1[4] = shfl4(C[0], lane8 + 4, 8); + T1[5] = shfl4(C[1], lane8 + 3, 8); + T1[6] = shfl4(C[2], lane8 + 2, 8); + T1[7] = shfl4(C[3], lane8 + 1, 8); /* Matrix after row rotates: @@ -233,13 +235,13 @@ __device__ __forceinline__ void __transposed_read_BC(const uint4 *S, uint4 (&B)[ // rotate rows B[0] = T2[0]; - B[1] = __shfl(T2[1], lane8 + 1, 8); - B[2] = __shfl(T2[2], lane8 + 2, 8); - B[3] = __shfl(T2[3], lane8 + 3, 8); - C[0] = __shfl(T2[4], lane8 + 4, 8); - C[1] = __shfl(T2[5], lane8 + 5, 8); - C[2] = __shfl(T2[6], lane8 + 6, 8); - C[3] = __shfl(T2[7], lane8 + 7, 8); + B[1] = shfl4(T2[1], lane8 + 1, 8); + B[2] = shfl4(T2[2], lane8 + 2, 8); + B[3] = shfl4(T2[3], lane8 + 3, 8); + C[0] = shfl4(T2[4], lane8 + 4, 8); + C[1] = shfl4(T2[5], lane8 + 5, 8); + C[2] = shfl4(T2[6], lane8 + 6, 8); + C[3] = shfl4(T2[7], lane8 + 7, 8); } diff --git a/scrypt/titan_kernel.cu b/scrypt/titan_kernel.cu index e27b832..1758722 100644 --- a/scrypt/titan_kernel.cu +++ b/scrypt/titan_kernel.cu @@ -10,6 +10,7 @@ #include #include +#include #include "miner.h" #include "salsa_kernel.h" @@ -28,6 +29,12 @@ typedef enum #define __ldg(x) (*(x)) #endif +#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300 +#define __shfl2(var, srcLane) __shfl_sync(0xFFFFFFFFu, var, srcLane) +#else +#define __shfl2 __shfl +#endif + #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 // scratchbuf constants (pointers to scratch buffer for each warp, i.e. 
32 hashes) @@ -59,8 +66,12 @@ static __host__ __device__ uint4& operator += (uint4& left, const uint4& right) return left; } -static __device__ uint4 __shfl(const uint4 bx, int target_thread) { - return make_uint4(__shfl((int)bx.x, target_thread), __shfl((int)bx.y, target_thread), __shfl((int)bx.z, target_thread), __shfl((int)bx.w, target_thread)); + +static __device__ uint4 shfl4(const uint4 bx, int target_thread) { + return make_uint4( + __shfl2((int)bx.x, target_thread), __shfl2((int)bx.y, target_thread), + __shfl2((int)bx.z, target_thread), __shfl2((int)bx.w, target_thread) + ); } /* write_keys writes the 8 keys being processed by a warp to the global @@ -93,8 +104,8 @@ void write_keys_direct(const uint4 &b, const uint4 &bx, uint32_t start) uint32_t *scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32]; if (SCHEME == ANDERSEN) { int target_thread = (threadIdx.x + 4)&31; - uint4 t=b, t2=__shfl(bx, target_thread); - int t2_start = __shfl((int)start, target_thread) + 4; + uint4 t = b, t2 = shfl4(bx, target_thread); + int t2_start = __shfl2((int)start, target_thread) + 4; bool c = (threadIdx.x & 0x4); *((uint4 *)(&scratch[c ? t2_start : start])) = (c ? t2 : t); *((uint4 *)(&scratch[c ? start : t2_start])) = (c ? t : t2); @@ -109,12 +120,12 @@ void read_keys_direct(uint4 &b, uint4 &bx, uint32_t start) { uint32_t *scratch = c_V[(blockIdx.x*blockDim.x + threadIdx.x)/32]; if (SCHEME == ANDERSEN) { - int t2_start = __shfl((int)start, (threadIdx.x + 4)&31) + 4; + int t2_start = __shfl2((int)start, (threadIdx.x + 4)&31) + 4; bool c = (threadIdx.x & 0x4); b = __ldg((uint4 *)(&scratch[c ? t2_start : start])); bx = __ldg((uint4 *)(&scratch[c ? start : t2_start])); uint4 tmp = b; b = (c ? bx : b); bx = (c ? tmp : bx); - bx = __shfl(bx, (threadIdx.x + 28)&31); + bx = shfl4(bx, (threadIdx.x + 28)&31); } else { b = *((uint4 *)(&scratch[start])); bx = *((uint4 *)(&scratch[start+16])); @@ -128,14 +139,14 @@ void primary_order_shuffle(uint32_t b[4], uint32_t bx[4]) { int x2 = (threadIdx.x & 0xfc) + (((threadIdx.x & 3)+2)&3); int x3 = (threadIdx.x & 0xfc) + (((threadIdx.x & 3)+3)&3); - b[3] = __shfl((int)b[3], x1); - b[2] = __shfl((int)b[2], x2); - b[1] = __shfl((int)b[1], x3); + b[3] = __shfl2((int)b[3], x1); + b[2] = __shfl2((int)b[2], x2); + b[1] = __shfl2((int)b[1], x3); uint32_t tmp = b[1]; b[1] = b[3]; b[3] = tmp; - bx[3] = __shfl((int)bx[3], x1); - bx[2] = __shfl((int)bx[2], x2); - bx[1] = __shfl((int)bx[1], x3); + bx[3] = __shfl2((int)bx[3], x1); + bx[2] = __shfl2((int)bx[2], x2); + bx[1] = __shfl2((int)bx[1], x3); tmp = bx[1]; bx[1] = bx[3]; bx[3] = tmp; } @@ -146,14 +157,14 @@ void primary_order_shuffle(uint4 &b, uint4 &bx) { int x2 = (threadIdx.x & 0x1c) + (((threadIdx.x & 3)+2)&3); int x3 = (threadIdx.x & 0x1c) + (((threadIdx.x & 3)+3)&3); - b.w = __shfl((int)b.w, x1); - b.z = __shfl((int)b.z, x2); - b.y = __shfl((int)b.y, x3); + b.w = __shfl2((int)b.w, x1); + b.z = __shfl2((int)b.z, x2); + b.y = __shfl2((int)b.y, x3); uint32_t tmp = b.y; b.y = b.w; b.w = tmp; - bx.w = __shfl((int)bx.w, x1); - bx.z = __shfl((int)bx.z, x2); - bx.y = __shfl((int)bx.y, x3); + bx.w = __shfl2((int)bx.w, x1); + bx.z = __shfl2((int)bx.z, x2); + bx.y = __shfl2((int)bx.y, x3); tmp = bx.y; bx.y = bx.w; bx.w = tmp; } @@ -327,9 +338,9 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x /* Unclear if this optimization is needed: These are ordered based * upon the dependencies needed in the later xors. Compiler should be * able to figure this out, but might as well give it a hand. 
*/ - x.y = __shfl((int)x.y, x3); - x.w = __shfl((int)x.w, x1); - x.z = __shfl((int)x.z, x2); + x.y = __shfl2((int)x.y, x3); + x.w = __shfl2((int)x.w, x1); + x.z = __shfl2((int)x.z, x2); /* The next XOR_ROTATE_ADDS could be written to be a copy-paste of the first, * but the register targets are rewritten here to swap x[1] and x[3] so that @@ -342,9 +353,9 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x XOR_ROTATE_ADD(x.y, x.z, x.w, 13); XOR_ROTATE_ADD(x.x, x.y, x.z, 18); - x.w = __shfl((int)x.w, x3); - x.y = __shfl((int)x.y, x1); - x.z = __shfl((int)x.z, x2); + x.w = __shfl2((int)x.w, x3); + x.y = __shfl2((int)x.y, x1); + x.z = __shfl2((int)x.z, x2); } b += x; @@ -362,18 +373,18 @@ void salsa_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int x XOR_ROTATE_ADD(x.w, x.z, x.y, 13); XOR_ROTATE_ADD(x.x, x.w, x.z, 18); - x.y = __shfl((int)x.y, x3); - x.w = __shfl((int)x.w, x1); - x.z = __shfl((int)x.z, x2); + x.y = __shfl2((int)x.y, x3); + x.w = __shfl2((int)x.w, x1); + x.z = __shfl2((int)x.z, x2); XOR_ROTATE_ADD(x.w, x.x, x.y, 7); XOR_ROTATE_ADD(x.z, x.w, x.x, 9); XOR_ROTATE_ADD(x.y, x.z, x.w, 13); XOR_ROTATE_ADD(x.x, x.y, x.z, 18); - x.w = __shfl((int)x.w, x3); - x.y = __shfl((int)x.y, x1); - x.z = __shfl((int)x.z, x2); + x.w = __shfl2((int)x.w, x3); + x.y = __shfl2((int)x.y, x1); + x.z = __shfl2((int)x.z, x2); } // At the end of these iterations, the data is in primary order again. @@ -424,9 +435,9 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8) CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7) - x.y = __shfl((int)x.y, x1); - x.z = __shfl((int)x.z, x2); - x.w = __shfl((int)x.w, x3); + x.y = __shfl2((int)x.y, x1); + x.z = __shfl2((int)x.z, x2); + x.w = __shfl2((int)x.w, x3); // Diagonal Mixing phase of chacha CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16) @@ -434,9 +445,9 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8) CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7) - x.y = __shfl((int)x.y, x3); - x.z = __shfl((int)x.z, x2); - x.w = __shfl((int)x.w, x1); + x.y = __shfl2((int)x.y, x3); + x.z = __shfl2((int)x.z, x2); + x.w = __shfl2((int)x.w, x1); } b += x; @@ -454,9 +465,9 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8) CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7) - x.y = __shfl((int)x.y, x1); - x.z = __shfl((int)x.z, x2); - x.w = __shfl((int)x.w, x3); + x.y = __shfl2((int)x.y, x1); + x.z = __shfl2((int)x.z, x2); + x.w = __shfl2((int)x.w, x3); // Diagonal Mixing phase of chacha CHACHA_PRIMITIVE(x.x ,x.w, x.y, 16) @@ -464,9 +475,9 @@ void chacha_xor_core(uint4 &b, uint4 &bx, const int x1, const int x2, const int CHACHA_PRIMITIVE(x.x ,x.w, x.y, 8) CHACHA_PRIMITIVE(x.z ,x.y, x.w, 7) - x.y = __shfl((int)x.y, x3); - x.z = __shfl((int)x.z, x2); - x.w = __shfl((int)x.w, x1); + x.y = __shfl2((int)x.y, x3); + x.z = __shfl2((int)x.z, x2); + x.w = __shfl2((int)x.w, x1); } #undef CHACHA_PRIMITIVE @@ -589,7 +600,7 @@ void titan_scrypt_core_kernelB(uint32_t *d_odata, int begin, int end) } else load_key(d_odata, b, bx); for (int i = begin; i < end; i++) { - int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); + int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); uint4 t, tx; read_keys_direct(t, tx, start+32*j); b ^= t; bx ^= tx; block_mixer(b, bx, x1, x2, x3); @@ -623,7 +634,7 @@ void titan_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsigne { // better divergent 
thread handling submitted by nVidia engineers, but // supposedly this does not run with the ANDERSEN memory access scheme - int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); + int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); int pos = j/LOOKUP_GAP; int loop = -1; uint4 t, tx; @@ -632,7 +643,7 @@ void titan_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsigne while(i < end) { if (loop == -1) { - j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); + j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); pos = j/LOOKUP_GAP; loop = j-pos*LOOKUP_GAP; read_keys_direct(t, tx, start+32*pos); @@ -655,7 +666,7 @@ void titan_scrypt_core_kernelB_LG(uint32_t *d_odata, int begin, int end, unsigne // this is my original implementation, now used with the ANDERSEN // memory access scheme only. for (int i = begin; i < end; i++) { - int j = (__shfl((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); + int j = (__shfl2((int)bx.x, (threadIdx.x & 0x1c)) & (c_N_1)); int pos = j/LOOKUP_GAP, loop = j-pos*LOOKUP_GAP; uint4 t, tx; read_keys_direct(t, tx, start+32*pos); while (loop--)
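
The recurring change in this patch is a small compatibility shim: CUDA 9 deprecates the implicit full-warp shuffle and vote intrinsics (__shfl, __shfl_up, __any) in favour of *_sync variants that take an explicit lane-participation mask, so the affected kernel files gain a __shfl2 macro and, where uint4 values are shuffled, a shfl4() helper built on top of it. The sketch below restates that shim in one self-contained place; it is illustrative only — the header choices are an assumption, not lines taken verbatim from the patch.

#include <cuda.h>          // CUDA_VERSION
#include <cuda_runtime.h>  // uint4, make_uint4

// CUDA 9+: route the old two-argument shuffle through __shfl_sync with a
// full-warp mask (0xFFFFFFFFu), matching the pre-CUDA-9 implicit
// "whole warp participates" behaviour. Older toolkits keep plain __shfl.
#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
#define __shfl2(var, srcLane) __shfl_sync(0xFFFFFFFFu, var, srcLane)
#else
#define __shfl2 __shfl
#endif

// Vector helper in the spirit of the shfl4() wrappers introduced above:
// broadcast all four 32-bit words of a uint4 from the same source lane.
static __device__ __forceinline__ uint4 shfl4(const uint4 bx, int target_thread)
{
	return make_uint4(
		__shfl2((int)bx.x, target_thread), __shfl2((int)bx.y, target_thread),
		__shfl2((int)bx.z, target_thread), __shfl2((int)bx.w, target_thread));
}

The full 0xFFFFFFFFu mask is only safe because these kernels shuffle while the whole warp is converged; code that diverges before a shuffle would need a narrower mask (for example one obtained from __activemask()).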