diff --git a/cuda_helper.h b/cuda_helper.h index 55db8a7..f893b77 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -64,6 +64,7 @@ extern const uint3 threadIdx; // Host and Compute 3.0 #define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) +#define __ldg(x) (*(x)) #else // Compute 3.2+ #define ROTL32(x, n) __funnelshift_l( (x), (x), (n) ) diff --git a/neoscrypt/cuda_neoscrypt.cu b/neoscrypt/cuda_neoscrypt.cu index 75f949f..cd0908f 100644 --- a/neoscrypt/cuda_neoscrypt.cu +++ b/neoscrypt/cuda_neoscrypt.cu @@ -214,7 +214,7 @@ void fastkdf256(const uint32_t* password, uint8_t* output) int bitbuf = rbuf << 3; uint32_t shifted[9]; - shift256R2(shifted, ((uint8*)input)[0], bitbuf); + shift256R(shifted, ((uint8*)input)[0], bitbuf); for (int k = 0; k < 9; ++k) { ((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k]; @@ -264,7 +264,7 @@ void fastkdf256(const uint32_t* password, uint8_t* output) } static __forceinline__ __device__ -void fastkdf32( const uint32_t * password, const uint32_t * salt, uint32_t * output) +void fastkdf32(const uint32_t * password, const uint32_t * salt, uint32_t * output) { uint8_t bufidx = 0; uchar4 bufhelper; @@ -300,7 +300,7 @@ void fastkdf32( const uint32_t * password, const uint32_t * salt, uint32_t * out int bitbuf = rbuf << 3; uint32_t shifted[9]; - shift256R2(shifted, ((uint8*)input)[0], bitbuf); + shift256R(shifted, ((uint8*)input)[0], bitbuf); for (int k = 0; k < 9; ++k) { ((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k]; @@ -455,7 +455,7 @@ void neoscrypt_gpu_hash_k01(uint32_t threads, uint32_t startNonce) // if (thread < threads) { uint16 X[4]; - ((uintx64 *)X)[0]= __ldg32(&(W + shift)[0]); + ((uintx64 *)X)[0]= ldg256(&(W + shift)[0]); //#pragma unroll for (int i = 0; i < 128; ++i) @@ -475,12 +475,12 @@ void neoscrypt_gpu_hash_k2(uint32_t threads, uint32_t startNonce) // if (thread < threads) { uint16 X[4]; - ((uintx64 *)X)[0] = __ldg32(&(W + shift)[2048]); + 
((uintx64 *)X)[0] = ldg256(&(W + shift)[2048]); for (int t = 0; t < 128; t++) { int idx = X[3].lo.s0 & 0x7F; - ((uintx64 *)X)[0] ^= __ldg32(&(W + shift)[idx << 4]); + ((uintx64 *)X)[0] ^= ldg256(&(W + shift)[idx << 4]); neoscrypt_chacha(X); } @@ -498,7 +498,7 @@ void neoscrypt_gpu_hash_k3(uint32_t threads, uint32_t startNonce) uint32_t shift = SHIFT * 16 * thread; uint16 Z[4]; - ((uintx64*)Z)[0] = __ldg32(&(W + shift)[0]); + ((uintx64*)Z)[0] = ldg256(&(W + shift)[0]); //#pragma unroll for (int i = 0; i < 128; ++i) { @@ -529,14 +529,14 @@ void neoscrypt_gpu_hash_k4(int stratum, uint32_t threads, uint32_t startNonce, u data[19] = (stratum) ? cuda_swab32(nonce) : nonce; data[39] = data[19]; data[59] = data[19]; - ((uintx64 *)Z)[0] = __ldg32(&(W + shift)[2048]); + ((uintx64 *)Z)[0] = ldg256(&(W + shift)[2048]); for (int t = 0; t < 128; t++) { int idx = Z[3].lo.s0 & 0x7F; - ((uintx64 *)Z)[0] ^= __ldg32(&(W + shift)[idx << 4]); + ((uintx64 *)Z)[0] ^= ldg256(&(W + shift)[idx << 4]); neoscrypt_salsa(Z); } - ((uintx64 *)Z)[0] ^= __ldg32(&(W + shift)[2064]); + ((uintx64 *)Z)[0] ^= ldg256(&(W + shift)[2064]); fastkdf32(data, (uint32_t*)Z, outbuf); if (outbuf[7] <= pTarget[7]) { uint32_t tmp = atomicExch(&nonceVector[0], nonce); diff --git a/neoscrypt/cuda_vectors.h b/neoscrypt/cuda_vectors.h index 0036740..67f77d9 100644 --- a/neoscrypt/cuda_vectors.h +++ b/neoscrypt/cuda_vectors.h @@ -478,24 +478,23 @@ static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift #if __CUDA_ARCH__ < 320 -// TO FINISH FOR SM 3.0 SUPPORT... 
-static __forceinline__ __device__ void shift256R2(uint32_t* ret, const uint8 &vec4, uint32_t shift) -{ - uint32_t *v = (uint32_t*) &vec4.s0; - for (int i=0; i<8; i++) { - ret[i] = ROTR32(v[i], shift); - } -} - -static __device__ __inline__ uintx64 __ldg32(const uint4 *ptr) -{ - uintx64 ret = { 0 }; - return ret; +// right shift a 64 bytes input (256-bits integer) by 0 8 16 24 bits +static __forceinline__ __device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift) +{ + uint8_t *v = (uint8_t*) &vec4.s0; + uint8_t *r = (uint8_t*) ret; + uint8_t bytes = (uint8_t) (shift >> 3); + for (uint8_t i=0; i<bytes; i++) + r[i] = 0; + for (uint8_t i=0; i<32; i++) + r[i+bytes] = v[i]; + ret[8] = vec4.s7 >> (32 - shift); // shuffled part required ? + //printf("A %02u %08x %08x > %08x %08x\n", shift, vec4.s6, vec4.s7, ret[7], ret[8]); } - #else -static __forceinline__ __device__ void shift256R2(uint32_t* ret, const uint8 &vec4, uint32_t shift) +// right shift a 32 bytes input (256-bits integer) by 0 8 16 24 bits +static __forceinline__ __device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift) { uint32_t truc = 0, truc2 = cuda_swab32(vec4.s7), truc3 = 0; asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); @@ -523,9 +522,28 @@ static __forceinline__ __device__ void shift256R2(uint32_t* ret, const uint8 &ve ret[1] = cuda_swab32(truc); asm("shr.b32 %0, %1, %2;" : "=r"(truc) : "r"(truc3), "r"(shift)); ret[0] = cuda_swab32(truc); + //printf("B %02u %08x %08x > %08x %08x\n", shift, vec4.s6, vec4.s7, ret[7], ret[8]); } +#endif -static __device__ __inline__ uintx64 __ldg32(const uint4 *ptr) +#if __CUDA_ARCH__ < 320 + +// copy 256 bytes +static __device__ __inline__ uintx64 ldg256(const uint4 *ptr) +{ + uintx64 ret; + uint32_t *dst = (uint32_t*) &ret.s0; + uint32_t *src = (uint32_t*) &ptr[0].x; + for (int i=0; i < (256 / sizeof(uint32_t)); i++) { + dst[i] = src[i]; + } + return ret; +} + +#else + +// complicated way to copy 256 bytes ;) +static __device__ __inline__ uintx64 ldg256(const uint4 *ptr) { uintx64 ret;
asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.s0.s0.s0.x), "=r"(ret.s0.s0.s0.s0.y), "=r"(ret.s0.s0.s0.s0.z), "=r"(ret.s0.s0.s0.s0.w) : __LDG_PTR(ptr)); @@ -546,79 +564,6 @@ static __device__ __inline__ uintx64 __ldg32(const uint4 *ptr) asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+240];" : "=r"(ret.s1.s1.s1.s1.x), "=r"(ret.s1.s1.s1.s1.y), "=r"(ret.s1.s1.s1.s1.z), "=r"(ret.s1.s1.s1.s1.w) : __LDG_PTR(ptr)); return ret; } - #endif -static __forceinline__ __device__ uint8 swapvec(const uint8 &buf) -{ - uint8 vec; - vec.s0 = cuda_swab32(buf.s0); - vec.s1 = cuda_swab32(buf.s1); - vec.s2 = cuda_swab32(buf.s2); - vec.s3 = cuda_swab32(buf.s3); - vec.s4 = cuda_swab32(buf.s4); - vec.s5 = cuda_swab32(buf.s5); - vec.s6 = cuda_swab32(buf.s6); - vec.s7 = cuda_swab32(buf.s7); - return vec; -} - -static __forceinline__ __device__ uint8 swapvec(const uint8 *buf) -{ - uint8 vec; - vec.s0 = cuda_swab32(buf[0].s0); - vec.s1 = cuda_swab32(buf[0].s1); - vec.s2 = cuda_swab32(buf[0].s2); - vec.s3 = cuda_swab32(buf[0].s3); - vec.s4 = cuda_swab32(buf[0].s4); - vec.s5 = cuda_swab32(buf[0].s5); - vec.s6 = cuda_swab32(buf[0].s6); - vec.s7 = cuda_swab32(buf[0].s7); - return vec; -} - -static __forceinline__ __device__ uint16 swapvec(const uint16 *buf) -{ - uint16 vec; - vec.s0 = cuda_swab32(buf[0].s0); - vec.s1 = cuda_swab32(buf[0].s1); - vec.s2 = cuda_swab32(buf[0].s2); - vec.s3 = cuda_swab32(buf[0].s3); - vec.s4 = cuda_swab32(buf[0].s4); - vec.s5 = cuda_swab32(buf[0].s5); - vec.s6 = cuda_swab32(buf[0].s6); - vec.s7 = cuda_swab32(buf[0].s7); - vec.s8 = cuda_swab32(buf[0].s8); - vec.s9 = cuda_swab32(buf[0].s9); - vec.sa = cuda_swab32(buf[0].sa); - vec.sb = cuda_swab32(buf[0].sb); - vec.sc = cuda_swab32(buf[0].sc); - vec.sd = cuda_swab32(buf[0].sd); - vec.se = cuda_swab32(buf[0].se); - vec.sf = cuda_swab32(buf[0].sf); - return vec; -} - -static __forceinline__ __device__ uint16 swapvec(const uint16 &buf) -{ - uint16 vec; - vec.s0 = cuda_swab32(buf.s0); - vec.s1 = cuda_swab32(buf.s1); 
- vec.s2 = cuda_swab32(buf.s2); - vec.s3 = cuda_swab32(buf.s3); - vec.s4 = cuda_swab32(buf.s4); - vec.s5 = cuda_swab32(buf.s5); - vec.s6 = cuda_swab32(buf.s6); - vec.s7 = cuda_swab32(buf.s7); - vec.s8 = cuda_swab32(buf.s8); - vec.s9 = cuda_swab32(buf.s9); - vec.sa = cuda_swab32(buf.sa); - vec.sb = cuda_swab32(buf.sb); - vec.sc = cuda_swab32(buf.sc); - vec.sd = cuda_swab32(buf.sd); - vec.se = cuda_swab32(buf.se); - vec.sf = cuda_swab32(buf.sf); - return vec; -} - #endif // #ifndef CUDA_VECTOR_H diff --git a/scrypt/titan_kernel.cu b/scrypt/titan_kernel.cu index 197aabd..9b64006 100644 --- a/scrypt/titan_kernel.cu +++ b/scrypt/titan_kernel.cu @@ -23,7 +23,7 @@ typedef enum SIMPLE } MemoryAccess; -#if __CUDA_ARCH__ < 350 +#if __CUDA_ARCH__ < 320 // Kepler (Compute 3.0) #define __ldg(x) (*(x)) #endif