neoscrypt: attempt to recode shift256R for SM 3.0
This commit is contained in:
parent
6bb128c055
commit
7c7f40a634
@ -64,6 +64,7 @@ extern const uint3 threadIdx;
|
||||
// Host and Compute 3.0
|
||||
#define ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n))))
|
||||
#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
|
||||
#define __ldg(x) (*(x))
|
||||
#else
|
||||
// Compute 3.2+
|
||||
#define ROTL32(x, n) __funnelshift_l( (x), (x), (n) )
|
||||
|
@ -214,7 +214,7 @@ void fastkdf256(const uint32_t* password, uint8_t* output)
|
||||
int bitbuf = rbuf << 3;
|
||||
uint32_t shifted[9];
|
||||
|
||||
shift256R2(shifted, ((uint8*)input)[0], bitbuf);
|
||||
shift256R(shifted, ((uint8*)input)[0], bitbuf);
|
||||
|
||||
for (int k = 0; k < 9; ++k) {
|
||||
((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k];
|
||||
@ -264,7 +264,7 @@ void fastkdf256(const uint32_t* password, uint8_t* output)
|
||||
}
|
||||
|
||||
static __forceinline__ __device__
|
||||
void fastkdf32( const uint32_t * password, const uint32_t * salt, uint32_t * output)
|
||||
void fastkdf32(const uint32_t * password, const uint32_t * salt, uint32_t * output)
|
||||
{
|
||||
uint8_t bufidx = 0;
|
||||
uchar4 bufhelper;
|
||||
@ -300,7 +300,7 @@ void fastkdf32( const uint32_t * password, const uint32_t * salt, uint32_t * out
|
||||
int bitbuf = rbuf << 3;
|
||||
uint32_t shifted[9];
|
||||
|
||||
shift256R2(shifted, ((uint8*)input)[0], bitbuf);
|
||||
shift256R(shifted, ((uint8*)input)[0], bitbuf);
|
||||
|
||||
for (int k = 0; k < 9; ++k) {
|
||||
((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k];
|
||||
@ -455,7 +455,7 @@ void neoscrypt_gpu_hash_k01(uint32_t threads, uint32_t startNonce)
|
||||
// if (thread < threads)
|
||||
{
|
||||
uint16 X[4];
|
||||
((uintx64 *)X)[0]= __ldg32(&(W + shift)[0]);
|
||||
((uintx64 *)X)[0]= ldg256(&(W + shift)[0]);
|
||||
|
||||
//#pragma unroll
|
||||
for (int i = 0; i < 128; ++i)
|
||||
@ -475,12 +475,12 @@ void neoscrypt_gpu_hash_k2(uint32_t threads, uint32_t startNonce)
|
||||
// if (thread < threads)
|
||||
{
|
||||
uint16 X[4];
|
||||
((uintx64 *)X)[0] = __ldg32(&(W + shift)[2048]);
|
||||
((uintx64 *)X)[0] = ldg256(&(W + shift)[2048]);
|
||||
|
||||
for (int t = 0; t < 128; t++)
|
||||
{
|
||||
int idx = X[3].lo.s0 & 0x7F;
|
||||
((uintx64 *)X)[0] ^= __ldg32(&(W + shift)[idx << 4]);
|
||||
((uintx64 *)X)[0] ^= ldg256(&(W + shift)[idx << 4]);
|
||||
neoscrypt_chacha(X);
|
||||
|
||||
}
|
||||
@ -498,7 +498,7 @@ void neoscrypt_gpu_hash_k3(uint32_t threads, uint32_t startNonce)
|
||||
uint32_t shift = SHIFT * 16 * thread;
|
||||
uint16 Z[4];
|
||||
|
||||
((uintx64*)Z)[0] = __ldg32(&(W + shift)[0]);
|
||||
((uintx64*)Z)[0] = ldg256(&(W + shift)[0]);
|
||||
|
||||
//#pragma unroll
|
||||
for (int i = 0; i < 128; ++i) {
|
||||
@ -529,14 +529,14 @@ void neoscrypt_gpu_hash_k4(int stratum, uint32_t threads, uint32_t startNonce, u
|
||||
data[19] = (stratum) ? cuda_swab32(nonce) : nonce;
|
||||
data[39] = data[19];
|
||||
data[59] = data[19];
|
||||
((uintx64 *)Z)[0] = __ldg32(&(W + shift)[2048]);
|
||||
((uintx64 *)Z)[0] = ldg256(&(W + shift)[2048]);
|
||||
for (int t = 0; t < 128; t++)
|
||||
{
|
||||
int idx = Z[3].lo.s0 & 0x7F;
|
||||
((uintx64 *)Z)[0] ^= __ldg32(&(W + shift)[idx << 4]);
|
||||
((uintx64 *)Z)[0] ^= ldg256(&(W + shift)[idx << 4]);
|
||||
neoscrypt_salsa(Z);
|
||||
}
|
||||
((uintx64 *)Z)[0] ^= __ldg32(&(W + shift)[2064]);
|
||||
((uintx64 *)Z)[0] ^= ldg256(&(W + shift)[2064]);
|
||||
fastkdf32(data, (uint32_t*)Z, outbuf);
|
||||
if (outbuf[7] <= pTarget[7]) {
|
||||
uint32_t tmp = atomicExch(&nonceVector[0], nonce);
|
||||
|
@ -478,24 +478,23 @@ static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift
|
||||
|
||||
#if __CUDA_ARCH__ < 320
|
||||
|
||||
// TO FINISH FOR SM 3.0 SUPPORT...
|
||||
static __forceinline__ __device__ void shift256R2(uint32_t* ret, const uint8 &vec4, uint32_t shift)
|
||||
// right shift a 64 bytes input (256-bits integer) by 0 8 16 24 bits
|
||||
static __forceinline__ __device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
|
||||
{
|
||||
uint32_t *v = (uint32_t*) &vec4.s0;
|
||||
for (int i=0; i<8; i++) {
|
||||
ret[i] = ROTR32(v[i], shift);
|
||||
}
|
||||
uint8_t *v = (uint8_t*) &vec4.s0;
|
||||
uint8_t *r = (uint8_t*) ret;
|
||||
uint8_t bytes = (uint8_t) (shift >> 3);
|
||||
for (uint8_t i=0; i<bytes; i++)
|
||||
r[i] = 0;
|
||||
for (uint8_t i=bytes; i<32; i++)
|
||||
r[i] = v[i-bytes];
|
||||
ret[8] = vec4.s7 >> (32 - shift); // shuffled part required ?
|
||||
//printf("A %02u %08x %08x > %08x %08x\n", shift, vec4.s6, vec4.s7, ret[7], ret[8]);
|
||||
}
|
||||
|
||||
static __device__ __inline__ uintx64 __ldg32(const uint4 *ptr)
|
||||
{
|
||||
uintx64 ret = { 0 };
|
||||
return ret;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static __forceinline__ __device__ void shift256R2(uint32_t* ret, const uint8 &vec4, uint32_t shift)
|
||||
// right shift a 32 bytes input (256-bits integer) by 0 8 16 24 bits
|
||||
static __forceinline__ __device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
|
||||
{
|
||||
uint32_t truc = 0, truc2 = cuda_swab32(vec4.s7), truc3 = 0;
|
||||
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift));
|
||||
@ -523,9 +522,28 @@ static __forceinline__ __device__ void shift256R2(uint32_t* ret, const uint8 &ve
|
||||
ret[1] = cuda_swab32(truc);
|
||||
asm("shr.b32 %0, %1, %2;" : "=r"(truc) : "r"(truc3), "r"(shift));
|
||||
ret[0] = cuda_swab32(truc);
|
||||
//printf("B %02u %08x %08x > %08x %08x\n", shift, vec4.s6, vec4.s7, ret[7], ret[8]);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if __CUDA_ARCH__ < 320
|
||||
|
||||
// copy 256 bytes
|
||||
static __device__ __inline__ uintx64 ldg256(const uint4 *ptr)
|
||||
{
|
||||
uintx64 ret;
|
||||
uint32_t *dst = (uint32_t*) &ret.s0;
|
||||
uint32_t *src = (uint32_t*) &ptr[0].x;
|
||||
for (int i=0; i < (256 / sizeof(uint32_t)); i++) {
|
||||
dst[i] = src[i];
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
static __device__ __inline__ uintx64 __ldg32(const uint4 *ptr)
|
||||
#else
|
||||
|
||||
// complicated way to copy 256 bytes ;)
|
||||
static __device__ __inline__ uintx64 ldg256(const uint4 *ptr)
|
||||
{
|
||||
uintx64 ret;
|
||||
asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.s0.s0.s0.s0.x), "=r"(ret.s0.s0.s0.s0.y), "=r"(ret.s0.s0.s0.s0.z), "=r"(ret.s0.s0.s0.s0.w) : __LDG_PTR(ptr));
|
||||
@ -546,79 +564,6 @@ static __device__ __inline__ uintx64 __ldg32(const uint4 *ptr)
|
||||
asm("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4+240];" : "=r"(ret.s1.s1.s1.s1.x), "=r"(ret.s1.s1.s1.s1.y), "=r"(ret.s1.s1.s1.s1.z), "=r"(ret.s1.s1.s1.s1.w) : __LDG_PTR(ptr));
|
||||
return ret;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
static __forceinline__ __device__ uint8 swapvec(const uint8 &buf)
|
||||
{
|
||||
uint8 vec;
|
||||
vec.s0 = cuda_swab32(buf.s0);
|
||||
vec.s1 = cuda_swab32(buf.s1);
|
||||
vec.s2 = cuda_swab32(buf.s2);
|
||||
vec.s3 = cuda_swab32(buf.s3);
|
||||
vec.s4 = cuda_swab32(buf.s4);
|
||||
vec.s5 = cuda_swab32(buf.s5);
|
||||
vec.s6 = cuda_swab32(buf.s6);
|
||||
vec.s7 = cuda_swab32(buf.s7);
|
||||
return vec;
|
||||
}
|
||||
|
||||
static __forceinline__ __device__ uint8 swapvec(const uint8 *buf)
|
||||
{
|
||||
uint8 vec;
|
||||
vec.s0 = cuda_swab32(buf[0].s0);
|
||||
vec.s1 = cuda_swab32(buf[0].s1);
|
||||
vec.s2 = cuda_swab32(buf[0].s2);
|
||||
vec.s3 = cuda_swab32(buf[0].s3);
|
||||
vec.s4 = cuda_swab32(buf[0].s4);
|
||||
vec.s5 = cuda_swab32(buf[0].s5);
|
||||
vec.s6 = cuda_swab32(buf[0].s6);
|
||||
vec.s7 = cuda_swab32(buf[0].s7);
|
||||
return vec;
|
||||
}
|
||||
|
||||
static __forceinline__ __device__ uint16 swapvec(const uint16 *buf)
|
||||
{
|
||||
uint16 vec;
|
||||
vec.s0 = cuda_swab32(buf[0].s0);
|
||||
vec.s1 = cuda_swab32(buf[0].s1);
|
||||
vec.s2 = cuda_swab32(buf[0].s2);
|
||||
vec.s3 = cuda_swab32(buf[0].s3);
|
||||
vec.s4 = cuda_swab32(buf[0].s4);
|
||||
vec.s5 = cuda_swab32(buf[0].s5);
|
||||
vec.s6 = cuda_swab32(buf[0].s6);
|
||||
vec.s7 = cuda_swab32(buf[0].s7);
|
||||
vec.s8 = cuda_swab32(buf[0].s8);
|
||||
vec.s9 = cuda_swab32(buf[0].s9);
|
||||
vec.sa = cuda_swab32(buf[0].sa);
|
||||
vec.sb = cuda_swab32(buf[0].sb);
|
||||
vec.sc = cuda_swab32(buf[0].sc);
|
||||
vec.sd = cuda_swab32(buf[0].sd);
|
||||
vec.se = cuda_swab32(buf[0].se);
|
||||
vec.sf = cuda_swab32(buf[0].sf);
|
||||
return vec;
|
||||
}
|
||||
|
||||
static __forceinline__ __device__ uint16 swapvec(const uint16 &buf)
|
||||
{
|
||||
uint16 vec;
|
||||
vec.s0 = cuda_swab32(buf.s0);
|
||||
vec.s1 = cuda_swab32(buf.s1);
|
||||
vec.s2 = cuda_swab32(buf.s2);
|
||||
vec.s3 = cuda_swab32(buf.s3);
|
||||
vec.s4 = cuda_swab32(buf.s4);
|
||||
vec.s5 = cuda_swab32(buf.s5);
|
||||
vec.s6 = cuda_swab32(buf.s6);
|
||||
vec.s7 = cuda_swab32(buf.s7);
|
||||
vec.s8 = cuda_swab32(buf.s8);
|
||||
vec.s9 = cuda_swab32(buf.s9);
|
||||
vec.sa = cuda_swab32(buf.sa);
|
||||
vec.sb = cuda_swab32(buf.sb);
|
||||
vec.sc = cuda_swab32(buf.sc);
|
||||
vec.sd = cuda_swab32(buf.sd);
|
||||
vec.se = cuda_swab32(buf.se);
|
||||
vec.sf = cuda_swab32(buf.sf);
|
||||
return vec;
|
||||
}
|
||||
|
||||
#endif // #ifndef CUDA_VECTOR_H
|
||||
|
@ -23,7 +23,7 @@ typedef enum
|
||||
SIMPLE
|
||||
} MemoryAccess;
|
||||
|
||||
#if __CUDA_ARCH__ < 350
|
||||
#if __CUDA_ARCH__ < 320
|
||||
// Kepler (Compute 3.0)
|
||||
#define __ldg(x) (*(x))
|
||||
#endif
|
||||
|
Loading…
Reference in New Issue
Block a user