Browse Source

neoscrypt: attempt to recode shift256R for SM 3.0

master
Tanguy Pruvot 9 years ago
parent
commit
7c7f40a634
  1. 1
      cuda_helper.h
  2. 20
      neoscrypt/cuda_neoscrypt.cu
  3. 123
      neoscrypt/cuda_vectors.h
  4. 2
      scrypt/titan_kernel.cu

1
cuda_helper.h

@ -64,6 +64,7 @@ extern const uint3 threadIdx; @@ -64,6 +64,7 @@ extern const uint3 threadIdx;
// Host and Compute 3.0
#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
#define __ldg(x) (*(x))
#else
// Compute 3.2+
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )

20
neoscrypt/cuda_neoscrypt.cu

@ -214,7 +214,7 @@ void fastkdf256(const uint32_t* password, uint8_t* output) @@ -214,7 +214,7 @@ void fastkdf256(const uint32_t* password, uint8_t* output)
int bitbuf = rbuf << 3;
uint32_t shifted[9];
shift256R2(shifted, ((uint8*)input)[0], bitbuf);
shift256R(shifted, ((uint8*)input)[0], bitbuf);
for (int k = 0; k < 9; ++k) {
((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k];
@ -264,7 +264,7 @@ void fastkdf256(const uint32_t* password, uint8_t* output) @@ -264,7 +264,7 @@ void fastkdf256(const uint32_t* password, uint8_t* output)
}
static __forceinline__ __device__
void fastkdf32( const uint32_t * password, const uint32_t * salt, uint32_t * output)
void fastkdf32(const uint32_t * password, const uint32_t * salt, uint32_t * output)
{
uint8_t bufidx = 0;
uchar4 bufhelper;
@ -300,7 +300,7 @@ void fastkdf32( const uint32_t * password, const uint32_t * salt, uint32_t * out @@ -300,7 +300,7 @@ void fastkdf32( const uint32_t * password, const uint32_t * salt, uint32_t * out
int bitbuf = rbuf << 3;
uint32_t shifted[9];
shift256R2(shifted, ((uint8*)input)[0], bitbuf);
shift256R(shifted, ((uint8*)input)[0], bitbuf);
for (int k = 0; k < 9; ++k) {
((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k];
@ -455,7 +455,7 @@ void neoscrypt_gpu_hash_k01(uint32_t threads, uint32_t startNonce) @@ -455,7 +455,7 @@ void neoscrypt_gpu_hash_k01(uint32_t threads, uint32_t startNonce)
// if (thread < threads)
{
uint16 X[4];
((uintx64 *)X)[0]= __ldg32(&(W + shift)[0]);
((uintx64 *)X)[0]= ldg256(&(W + shift)[0]);
//#pragma unroll
for (int i = 0; i < 128; ++i)
@ -475,12 +475,12 @@ void neoscrypt_gpu_hash_k2(uint32_t threads, uint32_t startNonce) @@ -475,12 +475,12 @@ void neoscrypt_gpu_hash_k2(uint32_t threads, uint32_t startNonce)
// if (thread < threads)
{
uint16 X[4];
((uintx64 *)X)[0] = __ldg32(&(W + shift)[2048]);
((uintx64 *)X)[0] = ldg256(&(W + shift)[2048]);
for (int t = 0; t < 128; t++)
{
int idx = X[3].lo.s0 & 0x7F;
((uintx64 *)X)[0] ^= __ldg32(&(W + shift)[idx << 4]);
((uintx64 *)X)[0] ^= ldg256(&(W + shift)[idx << 4]);
neoscrypt_chacha(X);
}
@ -498,7 +498,7 @@ void neoscrypt_gpu_hash_k3(uint32_t threads, uint32_t startNonce) @@ -498,7 +498,7 @@ void neoscrypt_gpu_hash_k3(uint32_t threads, uint32_t startNonce)
uint32_t shift = SHIFT * 16 * thread;
uint16 Z[4];
((uintx64*)Z)[0] = __ldg32(&(W + shift)[0]);
((uintx64*)Z)[0] = ldg256(&(W + shift)[0]);
//#pragma unroll
for (int i = 0; i < 128; ++i) {
@ -529,14 +529,14 @@ void neoscrypt_gpu_hash_k4(int stratum, uint32_t threads, uint32_t startNonce, u @@ -529,14 +529,14 @@ void neoscrypt_gpu_hash_k4(int stratum, uint32_t threads, uint32_t startNonce, u
data[19] = (stratum) ? cuda_swab32(nonce) : nonce;
data[39] = data[19];
data[59] = data[19];
((uintx64 *)Z)[0] = __ldg32(&(W + shift)[2048]);
((uintx64 *)Z)[0] = ldg256(&(W + shift)[2048]);
for (int t = 0; t < 128; t++)
{
int idx = Z[3].lo.s0 & 0x7F;
((uintx64 *)Z)[0] ^= __ldg32(&(W + shift)[idx << 4]);
((uintx64 *)Z)[0] ^= ldg256(&(W + shift)[idx << 4]);
neoscrypt_salsa(Z);
}
((uintx64 *)Z)[0] ^= __ldg32(&(W + shift)[2064]);
((uintx64 *)Z)[0] ^= ldg256(&(W + shift)[2064]);
fastkdf32(data, (uint32_t*)Z, outbuf);
if (outbuf[7] <= pTarget[7]) {
uint32_t tmp = atomicExch(&nonceVector[0], nonce);

123
neoscrypt/cuda_vectors.h

@ -478,24 +478,23 @@ static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift @@ -478,24 +478,23 @@ static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift
#if __CUDA_ARCH__ < 320
// TO FINISH FOR SM 3.0 SUPPORT...
static __forceinline__ __device__ void shift256R2(uint32_t* ret, const uint8 &vec4, uint32_t shift)
{
uint32_t *v = (uint32_t*) &vec4.s0;
for (int i=0; i<8; i++) {
ret[i] = ROTR32(v[i], shift);
}
}
static __device__ __inline__ uintx64 __ldg32(const uint4 *ptr)
{
uintx64 ret = { 0 };
return ret;
// right shift a 64 bytes input (256-bits integer) by 0 8 16 24 bits
static __forceinline__ __device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
{
uint8_t *v = (uint8_t*) &vec4.s0;
uint8_t *r = (uint8_t*) ret;
uint8_t bytes = (uint8_t) (shift >> 3);
for (uint8_t i=0; i<bytes; i++)
r[i] = 0;
for (uint8_t i=bytes; i<32; i++)
r[i] = v[i-bytes];
ret[8] = vec4.s7 >> (32 - shift); // shuffled part required ?
//printf("A %02u %08x %08x > %08x %08x\n", shift, vec4.s6, vec4.s7, ret[7], ret[8]);
}
#else
static __forceinline__ __device__ void shift256R2(uint32_t* ret, const uint8 &vec4, uint32_t shift)
// right shift a 32 bytes input (256-bits integer) by 0 8 16 24 bits
static __forceinline__ __device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
{
uint32_t truc = 0, truc2 = cuda_swab32(vec4.s7), truc3 = 0;
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift));
@ -523,9 +522,28 @@ static __forceinline__ __device__ void shift256R2(uint32_t* ret, const uint8 &ve @@ -523,9 +522,28 @@ static __forceinline__ __device__ void shift256R2(uint32_t* ret, const uint8 &ve
ret[1] = cuda_swab32(truc);
asm("shr.b32 %0, %1, %2;" : "=r"(truc) : "r"(truc3), "r"(shift));
ret[0] = cuda_swab32(truc);
//printf("B %02u %08x %08x > %08x %08x\n", shift, vec4.s6, vec4.s7, ret[7], ret[8]);
}
#endif
static __device__ __inline__ uintx64 __ldg32(const uint4 *ptr)
#if __CUDA_ARCH__ < 320
// copy 256 bytes
static __device__ __inline__ uintx64 ldg256(const uint4 *ptr)
{
uintx64 ret;
uint32_t *dst = (uint32_t*) &ret.s0;
uint32_t *src = (uint32_t*) &ptr[0].x;
for (int i=0; i < (256 / sizeof(uint32_t)); i++) {
dst[i] = src[i];
}
return ret;
}
#else
// complicated way to copy 256 bytes ;)
static __device__ __inline__ uintx64 ldg256(const uint4 *ptr)
{
uintx64 ret;
asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.s0.s0.s0.x), "=r"(ret.s0.s0.s0.s0.y), "=r"(ret.s0.s0.s0.s0.z), "=r"(ret.s0.s0.s0.s0.w) : __LDG_PTR(ptr));
@ -546,79 +564,6 @@ static __device__ __inline__ uintx64 __ldg32(const uint4 *ptr) @@ -546,79 +564,6 @@ static __device__ __inline__ uintx64 __ldg32(const uint4 *ptr)
asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+240];" : "=r"(ret.s1.s1.s1.s1.x), "=r"(ret.s1.s1.s1.s1.y), "=r"(ret.s1.s1.s1.s1.z), "=r"(ret.s1.s1.s1.s1.w) : __LDG_PTR(ptr));
return ret;
}
#endif
static __forceinline__ __device__ uint8 swapvec(const uint8 &buf)
{
uint8 vec;
vec.s0 = cuda_swab32(buf.s0);
vec.s1 = cuda_swab32(buf.s1);
vec.s2 = cuda_swab32(buf.s2);
vec.s3 = cuda_swab32(buf.s3);
vec.s4 = cuda_swab32(buf.s4);
vec.s5 = cuda_swab32(buf.s5);
vec.s6 = cuda_swab32(buf.s6);
vec.s7 = cuda_swab32(buf.s7);
return vec;
}
static __forceinline__ __device__ uint8 swapvec(const uint8 *buf)
{
uint8 vec;
vec.s0 = cuda_swab32(buf[0].s0);
vec.s1 = cuda_swab32(buf[0].s1);
vec.s2 = cuda_swab32(buf[0].s2);
vec.s3 = cuda_swab32(buf[0].s3);
vec.s4 = cuda_swab32(buf[0].s4);
vec.s5 = cuda_swab32(buf[0].s5);
vec.s6 = cuda_swab32(buf[0].s6);
vec.s7 = cuda_swab32(buf[0].s7);
return vec;
}
static __forceinline__ __device__ uint16 swapvec(const uint16 *buf)
{
uint16 vec;
vec.s0 = cuda_swab32(buf[0].s0);
vec.s1 = cuda_swab32(buf[0].s1);
vec.s2 = cuda_swab32(buf[0].s2);
vec.s3 = cuda_swab32(buf[0].s3);
vec.s4 = cuda_swab32(buf[0].s4);
vec.s5 = cuda_swab32(buf[0].s5);
vec.s6 = cuda_swab32(buf[0].s6);
vec.s7 = cuda_swab32(buf[0].s7);
vec.s8 = cuda_swab32(buf[0].s8);
vec.s9 = cuda_swab32(buf[0].s9);
vec.sa = cuda_swab32(buf[0].sa);
vec.sb = cuda_swab32(buf[0].sb);
vec.sc = cuda_swab32(buf[0].sc);
vec.sd = cuda_swab32(buf[0].sd);
vec.se = cuda_swab32(buf[0].se);
vec.sf = cuda_swab32(buf[0].sf);
return vec;
}
static __forceinline__ __device__ uint16 swapvec(const uint16 &buf)
{
uint16 vec;
vec.s0 = cuda_swab32(buf.s0);
vec.s1 = cuda_swab32(buf.s1);
vec.s2 = cuda_swab32(buf.s2);
vec.s3 = cuda_swab32(buf.s3);
vec.s4 = cuda_swab32(buf.s4);
vec.s5 = cuda_swab32(buf.s5);
vec.s6 = cuda_swab32(buf.s6);
vec.s7 = cuda_swab32(buf.s7);
vec.s8 = cuda_swab32(buf.s8);
vec.s9 = cuda_swab32(buf.s9);
vec.sa = cuda_swab32(buf.sa);
vec.sb = cuda_swab32(buf.sb);
vec.sc = cuda_swab32(buf.sc);
vec.sd = cuda_swab32(buf.sd);
vec.se = cuda_swab32(buf.se);
vec.sf = cuda_swab32(buf.sf);
return vec;
}
#endif // #ifndef CUDA_VECTOR_H

2
scrypt/titan_kernel.cu

@ -23,7 +23,7 @@ typedef enum @@ -23,7 +23,7 @@ typedef enum
SIMPLE
} MemoryAccess;
#if __CUDA_ARCH__ < 350
#if __CUDA_ARCH__ < 320
// Kepler (Compute 3.0)
#define __ldg(x) (*(x))
#endif

Loading…
Cancel
Save