neoscrypt: attempt to recode shift256R for SM 3.0

Branch: master
Author: Tanguy Pruvot, 10 years ago
Commit: 7c7f40a634

4 changed files:
  1. cuda_helper.h (1 changed line)
  2. neoscrypt/cuda_neoscrypt.cu (18 changed lines)
  3. neoscrypt/cuda_vectors.h (123 changed lines)
  4. scrypt/titan_kernel.cu (2 changed lines)

cuda_helper.h (1 changed line)

@@ -64,6 +64,7 @@ extern const uint3 threadIdx;
 // Host and Compute 3.0
 #define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
 #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
+#define __ldg(x) (*(x))
 #else
 // Compute 3.2+
 #define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
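
The added #define __ldg(x) (*(x)) lets the "Host and Compute 3.0" branch compile code that calls __ldg() unconditionally: on Compute 3.2+ the read-only-cache intrinsic is used, while older devices (and host code) fall back to a plain dereference. A minimal sketch of that pattern, not taken from the repo (the kernel name and the exact guard are illustrative only):

	#include <stdint.h>

	#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 320
	#define __ldg(x) (*(x))   /* plain load where the intrinsic is unavailable */
	#endif

	__global__ void sum_kernel(const uint32_t *in, uint32_t *out, int n)
	{
		uint32_t acc = 0;
		for (int i = threadIdx.x; i < n; i += blockDim.x)
			acc += __ldg(&in[i]);   /* intrinsic on sm_32+, ordinary load otherwise */
		atomicAdd(out, acc);        /* assumes *out was zeroed before launch */
	}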

neoscrypt/cuda_neoscrypt.cu (18 changed lines)

@@ -214,7 +214,7 @@ void fastkdf256(const uint32_t* password, uint8_t* output)
 	int bitbuf = rbuf << 3;
 	uint32_t shifted[9];
-	shift256R2(shifted, ((uint8*)input)[0], bitbuf);
+	shift256R(shifted, ((uint8*)input)[0], bitbuf);
 	for (int k = 0; k < 9; ++k) {
 		((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k];
@@ -300,7 +300,7 @@ void fastkdf32( const uint32_t * password, const uint32_t * salt, uint32_t * out
 	int bitbuf = rbuf << 3;
 	uint32_t shifted[9];
-	shift256R2(shifted, ((uint8*)input)[0], bitbuf);
+	shift256R(shifted, ((uint8*)input)[0], bitbuf);
 	for (int k = 0; k < 9; ++k) {
 		((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k];
@@ -455,7 +455,7 @@ void neoscrypt_gpu_hash_k01(uint32_t threads, uint32_t startNonce)
 	// if (thread < threads)
 	{
 		uint16 X[4];
-		((uintx64 *)X)[0]= __ldg32(&(W + shift)[0]);
+		((uintx64 *)X)[0]= ldg256(&(W + shift)[0]);
 		//#pragma unroll
 		for (int i = 0; i < 128; ++i)
@@ -475,12 +475,12 @@ void neoscrypt_gpu_hash_k2(uint32_t threads, uint32_t startNonce)
 	// if (thread < threads)
 	{
 		uint16 X[4];
-		((uintx64 *)X)[0] = __ldg32(&(W + shift)[2048]);
+		((uintx64 *)X)[0] = ldg256(&(W + shift)[2048]);
 		for (int t = 0; t < 128; t++)
 		{
 			int idx = X[3].lo.s0 & 0x7F;
-			((uintx64 *)X)[0] ^= __ldg32(&(W + shift)[idx << 4]);
+			((uintx64 *)X)[0] ^= ldg256(&(W + shift)[idx << 4]);
 			neoscrypt_chacha(X);
 		}
@@ -498,7 +498,7 @@ void neoscrypt_gpu_hash_k3(uint32_t threads, uint32_t startNonce)
 	uint32_t shift = SHIFT * 16 * thread;
 	uint16 Z[4];
-	((uintx64*)Z)[0] = __ldg32(&(W + shift)[0]);
+	((uintx64*)Z)[0] = ldg256(&(W + shift)[0]);
 	//#pragma unroll
 	for (int i = 0; i < 128; ++i) {
@@ -529,14 +529,14 @@ void neoscrypt_gpu_hash_k4(int stratum, uint32_t threads, uint32_t startNonce, u
 	data[19] = (stratum) ? cuda_swab32(nonce) : nonce;
 	data[39] = data[19];
 	data[59] = data[19];
-	((uintx64 *)Z)[0] = __ldg32(&(W + shift)[2048]);
+	((uintx64 *)Z)[0] = ldg256(&(W + shift)[2048]);
 	for (int t = 0; t < 128; t++)
 	{
 		int idx = Z[3].lo.s0 & 0x7F;
-		((uintx64 *)Z)[0] ^= __ldg32(&(W + shift)[idx << 4]);
+		((uintx64 *)Z)[0] ^= ldg256(&(W + shift)[idx << 4]);
 		neoscrypt_salsa(Z);
 	}
-	((uintx64 *)Z)[0] ^= __ldg32(&(W + shift)[2064]);
+	((uintx64 *)Z)[0] ^= ldg256(&(W + shift)[2064]);
 	fastkdf32(data, (uint32_t*)Z, outbuf);
 	if (outbuf[7] <= pTarget[7]) {
 		uint32_t tmp = atomicExch(&nonceVector[0], nonce);
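
In the k2 and k4 loops above only the loader changes (__ldg32 becomes ldg256); the addressing stays the same: idx & 0x7F picks one of 128 scratchpad entries, and idx << 4 turns that entry number into a uint4 offset, since each entry is 16 uint4 values, i.e. 256 bytes. A hedged restatement of one loop iteration as a standalone helper (the helper name is made up; it relies on the repo's uint16/uintx64 vector types and on ldg256 from cuda_vectors.h):

	// one iteration of the random-read loop in k2/k4, minus the chacha/salsa call:
	// pick an entry from the 128-slot scratchpad and XOR all 256 bytes of it into X
	static __device__ __forceinline__ void xor_scratchpad_entry(const uint4 *W, uint32_t shift, uint16 *X)
	{
		int idx = X[3].lo.s0 & 0x7F;                  // 128 entries in the table
		const uint4 *entry = &(W + shift)[idx << 4];  // 16 uint4 = 256 bytes per entry
		((uintx64 *)X)[0] ^= ldg256(entry);           // load and XOR the whole entry
	}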

neoscrypt/cuda_vectors.h (123 changed lines)

@@ -478,24 +478,23 @@ static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift
 #if __CUDA_ARCH__ < 320
-// TO FINISH FOR SM 3.0 SUPPORT...
-static __forceinline__ __device__ void shift256R2(uint32_t* ret, const uint8 &vec4, uint32_t shift)
+// right shift a 64 bytes input (256-bits integer) by 0 8 16 24 bits
+static __forceinline__ __device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
 {
-	uint32_t *v = (uint32_t*) &vec4.s0;
-	for (int i=0; i<8; i++) {
-		ret[i] = ROTR32(v[i], shift);
-	}
-}
-
-static __device__ __inline__ uintx64 __ldg32(const uint4 *ptr)
-{
-	uintx64 ret = { 0 };
-	return ret;
+	uint8_t *v = (uint8_t*) &vec4.s0;
+	uint8_t *r = (uint8_t*) ret;
+	uint8_t bytes = (uint8_t) (shift >> 3);
+	for (uint8_t i=0; i<bytes; i++)
+		r[i] = 0;
+	for (uint8_t i=bytes; i<32; i++)
+		r[i] = v[i-bytes];
+	ret[8] = vec4.s7 >> (32 - shift); // shuffled part required ?
+	//printf("A %02u %08x %08x > %08x %08x\n", shift, vec4.s6, vec4.s7, ret[7], ret[8]);
 }
 #else
-static __forceinline__ __device__ void shift256R2(uint32_t* ret, const uint8 &vec4, uint32_t shift)
+// right shift a 32 bytes input (256-bits integer) by 0 8 16 24 bits
+static __forceinline__ __device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
 {
 	uint32_t truc = 0, truc2 = cuda_swab32(vec4.s7), truc3 = 0;
 	asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift));
@@ -523,9 +522,28 @@ static __forceinline__ __device__ void shift256R2(uint32_t* ret, const uint8 &ve
 	ret[1] = cuda_swab32(truc);
 	asm("shr.b32 %0, %1, %2;" : "=r"(truc) : "r"(truc3), "r"(shift));
 	ret[0] = cuda_swab32(truc);
+	//printf("B %02u %08x %08x > %08x %08x\n", shift, vec4.s6, vec4.s7, ret[7], ret[8]);
 }
+#endif
 
-static __device__ __inline__ uintx64 __ldg32(const uint4 *ptr)
+#if __CUDA_ARCH__ < 320
+// copy 256 bytes
+static __device__ __inline__ uintx64 ldg256(const uint4 *ptr)
+{
+	uintx64 ret;
+	uint32_t *dst = (uint32_t*) &ret.s0;
+	uint32_t *src = (uint32_t*) &ptr[0].x;
+	for (int i=0; i < (256 / sizeof(uint32_t)); i++) {
+		dst[i] = src[i];
+	}
+	return ret;
+}
+#else
+// complicated way to copy 256 bytes ;)
+static __device__ __inline__ uintx64 ldg256(const uint4 *ptr)
 {
 	uintx64 ret;
 	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.s0.s0.s0.x), "=r"(ret.s0.s0.s0.s0.y), "=r"(ret.s0.s0.s0.s0.z), "=r"(ret.s0.s0.s0.s0.w) : __LDG_PTR(ptr));
@@ -546,79 +564,6 @@ static __device__ __inline__ uintx64 __ldg32(const uint4 *ptr)
 	asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+240];" : "=r"(ret.s1.s1.s1.s1.x), "=r"(ret.s1.s1.s1.s1.y), "=r"(ret.s1.s1.s1.s1.z), "=r"(ret.s1.s1.s1.s1.w) : __LDG_PTR(ptr));
 	return ret;
 }
 #endif
-
-static __forceinline__ __device__ uint8 swapvec(const uint8 &buf)
-{
-	uint8 vec;
-	vec.s0 = cuda_swab32(buf.s0);
-	vec.s1 = cuda_swab32(buf.s1);
-	vec.s2 = cuda_swab32(buf.s2);
-	vec.s3 = cuda_swab32(buf.s3);
-	vec.s4 = cuda_swab32(buf.s4);
-	vec.s5 = cuda_swab32(buf.s5);
-	vec.s6 = cuda_swab32(buf.s6);
-	vec.s7 = cuda_swab32(buf.s7);
-	return vec;
-}
-
-static __forceinline__ __device__ uint8 swapvec(const uint8 *buf)
-{
-	uint8 vec;
-	vec.s0 = cuda_swab32(buf[0].s0);
-	vec.s1 = cuda_swab32(buf[0].s1);
-	vec.s2 = cuda_swab32(buf[0].s2);
-	vec.s3 = cuda_swab32(buf[0].s3);
-	vec.s4 = cuda_swab32(buf[0].s4);
-	vec.s5 = cuda_swab32(buf[0].s5);
-	vec.s6 = cuda_swab32(buf[0].s6);
-	vec.s7 = cuda_swab32(buf[0].s7);
-	return vec;
-}
-
-static __forceinline__ __device__ uint16 swapvec(const uint16 *buf)
-{
-	uint16 vec;
-	vec.s0 = cuda_swab32(buf[0].s0);
-	vec.s1 = cuda_swab32(buf[0].s1);
-	vec.s2 = cuda_swab32(buf[0].s2);
-	vec.s3 = cuda_swab32(buf[0].s3);
-	vec.s4 = cuda_swab32(buf[0].s4);
-	vec.s5 = cuda_swab32(buf[0].s5);
-	vec.s6 = cuda_swab32(buf[0].s6);
-	vec.s7 = cuda_swab32(buf[0].s7);
-	vec.s8 = cuda_swab32(buf[0].s8);
-	vec.s9 = cuda_swab32(buf[0].s9);
-	vec.sa = cuda_swab32(buf[0].sa);
-	vec.sb = cuda_swab32(buf[0].sb);
-	vec.sc = cuda_swab32(buf[0].sc);
-	vec.sd = cuda_swab32(buf[0].sd);
-	vec.se = cuda_swab32(buf[0].se);
-	vec.sf = cuda_swab32(buf[0].sf);
-	return vec;
-}
-
-static __forceinline__ __device__ uint16 swapvec(const uint16 &buf)
-{
-	uint16 vec;
-	vec.s0 = cuda_swab32(buf.s0);
-	vec.s1 = cuda_swab32(buf.s1);
-	vec.s2 = cuda_swab32(buf.s2);
-	vec.s3 = cuda_swab32(buf.s3);
-	vec.s4 = cuda_swab32(buf.s4);
-	vec.s5 = cuda_swab32(buf.s5);
-	vec.s6 = cuda_swab32(buf.s6);
-	vec.s7 = cuda_swab32(buf.s7);
-	vec.s8 = cuda_swab32(buf.s8);
-	vec.s9 = cuda_swab32(buf.s9);
-	vec.sa = cuda_swab32(buf.sa);
-	vec.sb = cuda_swab32(buf.sb);
-	vec.sc = cuda_swab32(buf.sc);
-	vec.sd = cuda_swab32(buf.sd);
-	vec.se = cuda_swab32(buf.se);
-	vec.sf = cuda_swab32(buf.sf);
-	return vec;
-}
 #endif // #ifndef CUDA_VECTOR_H
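
For the Compute 3.0 path, shift256R now moves whole bytes instead of rotating each 32-bit word in place, which matches how it is called: the fastkdf loops pass bitbuf = rbuf << 3, and the new comments document the supported shifts as 0, 8, 16 or 24 bits. A small host-side sketch of the same byte-wise idea (not part of the commit; the function and variable names here are made up) that can be compiled standalone to see where the bytes land and what spills into the ninth output word:

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	// Host port of the < 320 branch: move the 32-byte input up by shift/8 byte
	// positions, zero-fill the freed low bytes, and keep the bits that fall out
	// of the top 32-bit word in a ninth output word.
	static void shift256R_host(uint32_t *ret, const uint32_t *vec8, uint32_t shift)
	{
		const uint8_t *v = (const uint8_t *)vec8;
		uint8_t *r = (uint8_t *)ret;
		uint32_t bytes = shift >> 3;

		memset(ret, 0, 9 * sizeof(uint32_t));
		for (uint32_t i = bytes; i < 32; i++)
			r[i] = v[i - bytes];
		// the device code computes vec4.s7 >> (32 - shift); shift == 0 is guarded
		// here because a 32-bit shift by 32 is undefined in C/C++
		ret[8] = shift ? (vec8[7] >> (32 - shift)) : 0;
	}

	int main()
	{
		uint32_t in[8] = { 0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c,
		                   0x13121110, 0x17161514, 0x1b1a1918, 0x1f1e1d1c };
		uint32_t out[9];

		for (uint32_t shift = 0; shift <= 24; shift += 8) {
			shift256R_host(out, in, shift);
			printf("shift %2u:", shift);
			for (int i = 0; i < 9; i++)
				printf(" %08x", out[i]);
			printf("\n");
		}
		return 0;
	}

The nine resulting words are what the fastkdf256/fastkdf32 loops in cuda_neoscrypt.cu then XOR into B starting at qbuf.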

scrypt/titan_kernel.cu (2 changed lines)

@@ -23,7 +23,7 @@ typedef enum
 	SIMPLE
 } MemoryAccess;
 
-#if __CUDA_ARCH__ < 350
+#if __CUDA_ARCH__ < 320
 // Kepler (Compute 3.0)
 #define __ldg(x) (*(x))
 #endif
