mirror of
https://github.com/GOSTSec/sgminer
synced 2025-01-24 21:44:38 +00:00
387bb28952
Pushed the wrong kernel previously and I changed the TC to not override the intensity. As with scrypt, intensity will calculate TC only if TC is not set.
525 lines
15 KiB
Common Lisp
525 lines
15 KiB
Common Lisp
/* NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20 */
|
|
/* Adapted and improved for 14.x drivers by Wolf9466 (Wolf`) */
|
|
|
|
// Stupid AMD compiler ignores the unroll pragma in these two
|
|
#define SALSA_SMALL_UNROLL 3
|
|
#define CHACHA_SMALL_UNROLL 3
|
|
|
|
// If SMALL_BLAKE2S is defined, BLAKE2S_UNROLL is interpreted
|
|
// as the unroll factor; must divide cleanly into ten.
|
|
// Usually a bad idea.
|
|
//#define SMALL_BLAKE2S
|
|
//#define BLAKE2S_UNROLL 5
|
|
|
|
#define BLOCK_SIZE 64U
|
|
#define FASTKDF_BUFFER_SIZE 256U
|
|
#ifndef PASSWORD_LEN
|
|
#define PASSWORD_LEN 80U
|
|
#endif
|
|
|
|
#if !defined(cl_khr_byte_addressable_store)
|
|
#error "Device does not support unaligned stores"
|
|
#endif
|
|
|
|
// Swaps 128 bytes at a time without using temp vars
|
|
void SwapBytes128(void *restrict A, void *restrict B, uint len)
|
|
{
|
|
#pragma unroll 2
|
|
for(int i = 0; i < (len >> 7); ++i)
|
|
{
|
|
((ulong16 *)A)[i] ^= ((ulong16 *)B)[i];
|
|
((ulong16 *)B)[i] ^= ((ulong16 *)A)[i];
|
|
((ulong16 *)A)[i] ^= ((ulong16 *)B)[i];
|
|
}
|
|
}
|
|
|
|
void CopyBytes128(void *restrict dst, const void *restrict src, uint len)
|
|
{
|
|
#pragma unroll 2
|
|
for(int i = 0; i < len; ++i)
|
|
((ulong16 *)dst)[i] = ((ulong16 *)src)[i];
|
|
}
|
|
|
|
void CopyBytes(void *restrict dst, const void *restrict src, uint len)
|
|
{
|
|
for(int i = 0; i < len; ++i)
|
|
((uchar *)dst)[i] = ((uchar *)src)[i];
|
|
}
|
|
|
|
void XORBytesInPlace(void *restrict dst, const void *restrict src, uint len)
|
|
{
|
|
for(int i = 0; i < len; ++i)
|
|
((uchar *)dst)[i] ^= ((uchar *)src)[i];
|
|
}
|
|
|
|
void XORBytes(void *restrict dst, const void *restrict src1, const void *restrict src2, uint len)
|
|
{
|
|
#pragma unroll 1
|
|
for(int i = 0; i < len; ++i)
|
|
((uchar *)dst)[i] = ((uchar *)src1)[i] ^ ((uchar *)src2)[i];
|
|
}
|
|
|
|
// Blake2S
|
|
|
|
#define BLAKE2S_BLOCK_SIZE 64U
|
|
#define BLAKE2S_OUT_SIZE 32U
|
|
#define BLAKE2S_KEY_SIZE 32U
|
|
|
|
static const __constant uint BLAKE2S_IV[8] =
|
|
{
|
|
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
|
|
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
|
|
};
|
|
|
|
static const __constant uchar BLAKE2S_SIGMA[10][16] =
|
|
{
|
|
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
|
|
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
|
|
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
|
|
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
|
|
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
|
|
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
|
|
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
|
|
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
|
|
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
|
|
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } ,
|
|
};
|
|
|
|
#define BLAKE_G(idx0, idx1, a, b, c, d, key) do { \
|
|
a += b + key[BLAKE2S_SIGMA[idx0][idx1]]; \
|
|
d = rotate(d ^ a, 16U); \
|
|
c += d; \
|
|
b = rotate(b ^ c, 20U); \
|
|
a += b + key[BLAKE2S_SIGMA[idx0][idx1 + 1]]; \
|
|
d = rotate(d ^ a, 24U); \
|
|
c += d; \
|
|
b = rotate(b ^ c, 25U); \
|
|
} while(0)
|
|
|
|
void Blake2S(uint *restrict inout, const uint *restrict inkey)
|
|
{
|
|
uint16 V;
|
|
uint8 tmpblock;
|
|
|
|
// Load first block (IV into V.lo) and constants (IV into V.hi)
|
|
V.lo = V.hi = vload8(0U, BLAKE2S_IV);
|
|
|
|
// XOR with initial constant
|
|
V.s0 ^= 0x01012020;
|
|
|
|
// Copy input block for later
|
|
tmpblock = V.lo;
|
|
|
|
// XOR length of message so far (including this block)
|
|
// There are two uints for this field, but high uint is zero
|
|
V.sc ^= BLAKE2S_BLOCK_SIZE;
|
|
|
|
// Compress state, using the key as the key
|
|
#ifdef SMALL_BLAKE2S
|
|
#pragma unroll BLAKE2S_UNROLL
|
|
#else
|
|
#pragma unroll
|
|
#endif
|
|
for(int x = 0; x < 10; ++x)
|
|
{
|
|
BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inkey);
|
|
BLAKE_G(x, 0x02, V.s1, V.s5, V.s9, V.sd, inkey);
|
|
BLAKE_G(x, 0x04, V.s2, V.s6, V.sa, V.se, inkey);
|
|
BLAKE_G(x, 0x06, V.s3, V.s7, V.sb, V.sf, inkey);
|
|
BLAKE_G(x, 0x08, V.s0, V.s5, V.sa, V.sf, inkey);
|
|
BLAKE_G(x, 0x0A, V.s1, V.s6, V.sb, V.sc, inkey);
|
|
BLAKE_G(x, 0x0C, V.s2, V.s7, V.s8, V.sd, inkey);
|
|
BLAKE_G(x, 0x0E, V.s3, V.s4, V.s9, V.se, inkey);
|
|
}
|
|
|
|
// XOR low part of state with the high part,
|
|
// then with the original input block.
|
|
V.lo ^= V.hi ^ tmpblock;
|
|
|
|
// Load constants (IV into V.hi)
|
|
V.hi = vload8(0U, BLAKE2S_IV);
|
|
|
|
// Copy input block for later
|
|
tmpblock = V.lo;
|
|
|
|
// XOR length of message into block again
|
|
V.sc ^= BLAKE2S_BLOCK_SIZE << 1;
|
|
|
|
// Last block compression - XOR final constant into state
|
|
V.se ^= 0xFFFFFFFFU;
|
|
|
|
// Compress block, using the input as the key
|
|
#ifdef SMALL_BLAKE2S
|
|
#pragma unroll BLAKE2S_UNROLL
|
|
#else
|
|
#pragma unroll
|
|
#endif
|
|
for(int x = 0; x < 10; ++x)
|
|
{
|
|
BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inout);
|
|
BLAKE_G(x, 0x02, V.s1, V.s5, V.s9, V.sd, inout);
|
|
BLAKE_G(x, 0x04, V.s2, V.s6, V.sa, V.se, inout);
|
|
BLAKE_G(x, 0x06, V.s3, V.s7, V.sb, V.sf, inout);
|
|
BLAKE_G(x, 0x08, V.s0, V.s5, V.sa, V.sf, inout);
|
|
BLAKE_G(x, 0x0A, V.s1, V.s6, V.sb, V.sc, inout);
|
|
BLAKE_G(x, 0x0C, V.s2, V.s7, V.s8, V.sd, inout);
|
|
BLAKE_G(x, 0x0E, V.s3, V.s4, V.s9, V.se, inout);
|
|
}
|
|
|
|
// XOR low part of state with high part, then with input block
|
|
V.lo ^= V.hi ^ tmpblock;
|
|
|
|
// Store result in input/output buffer
|
|
vstore8(V.lo, 0, inout);
|
|
}
|
|
|
|
/* FastKDF, a fast buffered key derivation function:
|
|
* FASTKDF_BUFFER_SIZE must be a power of 2;
|
|
* password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE;
|
|
* prf_output_size must be <= prf_key_size; */
|
|
void fastkdf(const uchar *restrict password, const uchar *restrict salt, const uint salt_len, uchar *restrict output, uint output_len)
|
|
{
|
|
|
|
/* WARNING!
|
|
* This algorithm uses byte-wise addressing for memory blocks.
|
|
* Or in other words, trying to copy an unaligned memory region
|
|
* will significantly slow down the algorithm, when copying uses
|
|
* words or bigger entities. It even may corrupt the data, when
|
|
* the device does not support it properly.
|
|
* Therefore use byte copying, which will not the fastest but at
|
|
* least get reliable results. */
|
|
|
|
// BLOCK_SIZE 64U
|
|
// FASTKDF_BUFFER_SIZE 256U
|
|
// BLAKE2S_BLOCK_SIZE 64U
|
|
// BLAKE2S_KEY_SIZE 32U
|
|
// BLAKE2S_OUT_SIZE 32U
|
|
uchar bufidx = 0;
|
|
uint8 Abuffer[9], Bbuffer[9] = { (uint8)(0) };
|
|
uchar *A = (uchar *)Abuffer, *B = (uchar *)Bbuffer;
|
|
|
|
// Initialize the password buffer
|
|
#pragma unroll 1
|
|
for(int i = 0; i < (FASTKDF_BUFFER_SIZE >> 3); ++i) ((ulong *)A)[i] = ((ulong *)password)[i % 10];
|
|
|
|
((uint16 *)(A + FASTKDF_BUFFER_SIZE))[0] = ((uint16 *)password)[0];
|
|
|
|
// Initialize the salt buffer
|
|
if(salt_len == FASTKDF_BUFFER_SIZE)
|
|
{
|
|
((ulong16 *)B)[0] = ((ulong16 *)B)[2] = ((ulong16 *)salt)[0];
|
|
((ulong16 *)B)[1] = ((ulong16 *)B)[3] = ((ulong16 *)salt)[1];
|
|
}
|
|
else
|
|
{
|
|
// salt_len is 80 bytes here
|
|
#pragma unroll 1
|
|
for(int i = 0; i < (FASTKDF_BUFFER_SIZE >> 3); ++i) ((ulong *)B)[i] = ((ulong *)salt)[i % 10];
|
|
|
|
// Initialized the rest to zero earlier
|
|
#pragma unroll 1
|
|
for(int i = 0; i < 10; ++i) ((ulong *)(B + FASTKDF_BUFFER_SIZE))[i] = ((ulong *)salt)[i];
|
|
}
|
|
|
|
// The primary iteration
|
|
#pragma unroll 1
|
|
for(int i = 0; i < 32; ++i)
|
|
{
|
|
// Make the key buffer twice the size of the key so it fits a Blake2S block
|
|
// This way, we don't need a temp buffer in the Blake2S function.
|
|
uchar input[BLAKE2S_BLOCK_SIZE], key[BLAKE2S_BLOCK_SIZE] = { 0 };
|
|
|
|
// Copy input and key to their buffers
|
|
CopyBytes(input, A + bufidx, BLAKE2S_BLOCK_SIZE);
|
|
CopyBytes(key, B + bufidx, BLAKE2S_KEY_SIZE);
|
|
|
|
// PRF
|
|
Blake2S((uint *)input, (uint *)key);
|
|
|
|
// Calculate the next buffer pointer
|
|
bufidx = 0;
|
|
|
|
for(int x = 0; x < BLAKE2S_OUT_SIZE; ++x)
|
|
bufidx += input[x];
|
|
|
|
// bufidx a uchar now - always mod 255
|
|
//bufidx &= (FASTKDF_BUFFER_SIZE - 1);
|
|
|
|
// Modify the salt buffer
|
|
XORBytesInPlace(B + bufidx, input, BLAKE2S_OUT_SIZE);
|
|
|
|
if(bufidx < BLAKE2S_KEY_SIZE)
|
|
{
|
|
// Head modified, tail updated
|
|
// this was made off the original code... wtf
|
|
//CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, min(BLAKE2S_OUT_SIZE, BLAKE2S_KEY_SIZE - bufidx));
|
|
CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, BLAKE2S_KEY_SIZE - bufidx);
|
|
}
|
|
else if((FASTKDF_BUFFER_SIZE - bufidx) < BLAKE2S_OUT_SIZE)
|
|
{
|
|
// Tail modified, head updated
|
|
CopyBytes(B, B + FASTKDF_BUFFER_SIZE, BLAKE2S_OUT_SIZE - (FASTKDF_BUFFER_SIZE - bufidx));
|
|
}
|
|
}
|
|
|
|
// Modify and copy into the output buffer
|
|
|
|
// Damned compiler crashes
|
|
// Fuck you, AMD
|
|
|
|
//for(uint i = 0; i < output_len; ++i, ++bufidx)
|
|
// output[i] = B[bufidx] ^ A[i];
|
|
|
|
uint left = FASTKDF_BUFFER_SIZE - bufidx;
|
|
//uint left = (~bufidx) + 1
|
|
|
|
if(left < output_len)
|
|
{
|
|
XORBytes(output, B + bufidx, A, left);
|
|
XORBytes(output + left, B, A + left, output_len - left);
|
|
}
|
|
else
|
|
{
|
|
XORBytes(output, B + bufidx, A, output_len);
|
|
}
|
|
}
|
|
|
|
#define SALSA_CORE(state) do { \
|
|
state.s4 ^= rotate(state.s0 + state.sc, 7U); state.s8 ^= rotate(state.s4 + state.s0, 9U); state.sc ^= rotate(state.s8 + state.s4, 13U); state.s0 ^= rotate(state.sc + state.s8, 18U); \
|
|
state.s9 ^= rotate(state.s5 + state.s1, 7U); state.sd ^= rotate(state.s9 + state.s5, 9U); state.s1 ^= rotate(state.sd + state.s9, 13U); state.s5 ^= rotate(state.s1 + state.sd, 18U); \
|
|
state.se ^= rotate(state.sa + state.s6, 7U); state.s2 ^= rotate(state.se + state.sa, 9U); state.s6 ^= rotate(state.s2 + state.se, 13U); state.sa ^= rotate(state.s6 + state.s2, 18U); \
|
|
state.s3 ^= rotate(state.sf + state.sb, 7U); state.s7 ^= rotate(state.s3 + state.sf, 9U); state.sb ^= rotate(state.s7 + state.s3, 13U); state.sf ^= rotate(state.sb + state.s7, 18U); \
|
|
state.s1 ^= rotate(state.s0 + state.s3, 7U); state.s2 ^= rotate(state.s1 + state.s0, 9U); state.s3 ^= rotate(state.s2 + state.s1, 13U); state.s0 ^= rotate(state.s3 + state.s2, 18U); \
|
|
state.s6 ^= rotate(state.s5 + state.s4, 7U); state.s7 ^= rotate(state.s6 + state.s5, 9U); state.s4 ^= rotate(state.s7 + state.s6, 13U); state.s5 ^= rotate(state.s4 + state.s7, 18U); \
|
|
state.sb ^= rotate(state.sa + state.s9, 7U); state.s8 ^= rotate(state.sb + state.sa, 9U); state.s9 ^= rotate(state.s8 + state.sb, 13U); state.sa ^= rotate(state.s9 + state.s8, 18U); \
|
|
state.sc ^= rotate(state.sf + state.se, 7U); state.sd ^= rotate(state.sc + state.sf, 9U); state.se ^= rotate(state.sd + state.sc, 13U); state.sf ^= rotate(state.se + state.sd, 18U); \
|
|
} while(0)
|
|
|
|
uint16 salsa_small_scalar_rnd(uint16 X)
|
|
{
|
|
uint16 st = X;
|
|
|
|
#if SALSA_SMALL_UNROLL == 1
|
|
|
|
for(int i = 0; i < 10; ++i)
|
|
{
|
|
SALSA_CORE(st);
|
|
}
|
|
|
|
#elif SALSA_SMALL_UNROLL == 2
|
|
|
|
for(int i = 0; i < 5; ++i)
|
|
{
|
|
SALSA_CORE(st);
|
|
SALSA_CORE(st);
|
|
}
|
|
|
|
#elif SALSA_SMALL_UNROLL == 3
|
|
|
|
for(int i = 0; i < 4; ++i)
|
|
{
|
|
SALSA_CORE(st);
|
|
if(i == 3) break;
|
|
SALSA_CORE(st);
|
|
SALSA_CORE(st);
|
|
}
|
|
|
|
#elif SALSA_SMALL_UNROLL == 4
|
|
|
|
for(int i = 0; i < 3; ++i)
|
|
{
|
|
SALSA_CORE(st);
|
|
SALSA_CORE(st);
|
|
if(i == 2) break;
|
|
SALSA_CORE(st);
|
|
SALSA_CORE(st);
|
|
}
|
|
|
|
#else
|
|
|
|
for(int i = 0; i < 2; ++i)
|
|
{
|
|
SALSA_CORE(st);
|
|
SALSA_CORE(st);
|
|
SALSA_CORE(st);
|
|
SALSA_CORE(st);
|
|
SALSA_CORE(st);
|
|
}
|
|
|
|
#endif
|
|
|
|
return(X + st);
|
|
}
|
|
|
|
#define CHACHA_CORE_PARALLEL(state) do { \
|
|
state[0] += state[1]; state[3] = rotate(state[3] ^ state[0], (uint4)(16U, 16U, 16U, 16U)); \
|
|
state[2] += state[3]; state[1] = rotate(state[1] ^ state[2], (uint4)(12U, 12U, 12U, 12U)); \
|
|
state[0] += state[1]; state[3] = rotate(state[3] ^ state[0], (uint4)(8U, 8U, 8U, 8U)); \
|
|
state[2] += state[3]; state[1] = rotate(state[1] ^ state[2], (uint4)(7U, 7U, 7U, 7U)); \
|
|
\
|
|
state[0] += state[1].yzwx; state[3].wxyz = rotate(state[3].wxyz ^ state[0], (uint4)(16U, 16U, 16U, 16U)); \
|
|
state[2].zwxy += state[3].wxyz; state[1].yzwx = rotate(state[1].yzwx ^ state[2].zwxy, (uint4)(12U, 12U, 12U, 12U)); \
|
|
state[0] += state[1].yzwx; state[3].wxyz = rotate(state[3].wxyz ^ state[0], (uint4)(8U, 8U, 8U, 8U)); \
|
|
state[2].zwxy += state[3].wxyz; state[1].yzwx = rotate(state[1].yzwx ^ state[2].zwxy, (uint4)(7U, 7U, 7U, 7U)); \
|
|
} while(0)
|
|
|
|
uint16 chacha_small_parallel_rnd(uint16 X)
|
|
{
|
|
uint4 t, st[4];
|
|
|
|
((uint16 *)st)[0] = X;
|
|
|
|
#if CHACHA_SMALL_UNROLL == 1
|
|
|
|
for(int i = 0; i < 10; ++i)
|
|
{
|
|
CHACHA_CORE_PARALLEL(st);
|
|
}
|
|
|
|
#elif CHACHA_SMALL_UNROLL == 2
|
|
|
|
for(int i = 0; i < 5; ++i)
|
|
{
|
|
CHACHA_CORE_PARALLEL(st);
|
|
CHACHA_CORE_PARALLEL(st);
|
|
}
|
|
|
|
#elif CHACHA_SMALL_UNROLL == 3
|
|
|
|
for(int i = 0; i < 4; ++i)
|
|
{
|
|
CHACHA_CORE_PARALLEL(st);
|
|
if(i == 3) break;
|
|
CHACHA_CORE_PARALLEL(st);
|
|
CHACHA_CORE_PARALLEL(st);
|
|
}
|
|
|
|
#elif CHACHA_SMALL_UNROLL == 4
|
|
|
|
for(int i = 0; i < 3; ++i)
|
|
{
|
|
CHACHA_CORE_PARALLEL(st);
|
|
CHACHA_CORE_PARALLEL(st);
|
|
if(i == 2) break;
|
|
CHACHA_CORE_PARALLEL(st);
|
|
CHACHA_CORE_PARALLEL(st);
|
|
}
|
|
|
|
#else
|
|
|
|
for(int i = 0; i < 2; ++i)
|
|
{
|
|
CHACHA_CORE_PARALLEL(st);
|
|
CHACHA_CORE_PARALLEL(st);
|
|
CHACHA_CORE_PARALLEL(st);
|
|
CHACHA_CORE_PARALLEL(st);
|
|
CHACHA_CORE_PARALLEL(st);
|
|
}
|
|
|
|
#endif
|
|
|
|
return(X + ((uint16 *)st)[0]);
|
|
}
|
|
|
|
void neoscrypt_blkmix(uint16 *XV, bool alg)
|
|
{
|
|
|
|
/* NeoScrypt flow: Scrypt flow:
|
|
Xa ^= Xd; M(Xa'); Ya = Xa"; Xa ^= Xb; M(Xa'); Ya = Xa";
|
|
Xb ^= Xa"; M(Xb'); Yb = Xb"; Xb ^= Xa"; M(Xb'); Yb = Xb";
|
|
Xc ^= Xb"; M(Xc'); Yc = Xc"; Xa" = Ya;
|
|
Xd ^= Xc"; M(Xd'); Yd = Xd"; Xb" = Yb;
|
|
Xa" = Ya; Xb" = Yc;
|
|
Xc" = Yb; Xd" = Yd; */
|
|
|
|
XV[0] ^= XV[3];
|
|
|
|
if(!alg)
|
|
{
|
|
XV[0] = salsa_small_scalar_rnd(XV[0]); XV[1] ^= XV[0];
|
|
XV[1] = salsa_small_scalar_rnd(XV[1]); XV[2] ^= XV[1];
|
|
XV[2] = salsa_small_scalar_rnd(XV[2]); XV[3] ^= XV[2];
|
|
XV[3] = salsa_small_scalar_rnd(XV[3]);
|
|
}
|
|
else
|
|
{
|
|
XV[0] = chacha_small_parallel_rnd(XV[0]); XV[1] ^= XV[0];
|
|
XV[1] = chacha_small_parallel_rnd(XV[1]); XV[2] ^= XV[1];
|
|
XV[2] = chacha_small_parallel_rnd(XV[2]); XV[3] ^= XV[2];
|
|
XV[3] = chacha_small_parallel_rnd(XV[3]);
|
|
}
|
|
|
|
XV[1] ^= XV[2];
|
|
XV[2] ^= XV[1];
|
|
XV[1] ^= XV[2];
|
|
}
|
|
|
|
void ScratchpadStore(__global void *V, void *X, uchar idx)
|
|
{
|
|
((__global ulong16 *)V)[idx << 1] = ((ulong16 *)X)[0];
|
|
((__global ulong16 *)V)[(idx << 1) + 1] = ((ulong16 *)X)[1];
|
|
}
|
|
|
|
void ScratchpadMix(void *X, const __global void *V, uchar idx)
|
|
{
|
|
((ulong16 *)X)[0] ^= ((__global ulong16 *)V)[idx << 1];
|
|
((ulong16 *)X)[1] ^= ((__global ulong16 *)V)[(idx << 1) + 1];
|
|
}
|
|
|
|
void SMix(uint16 *X, __global uint16 *V, bool flag)
|
|
{
|
|
#pragma unroll 1
|
|
for(int i = 0; i < 128; ++i)
|
|
{
|
|
ScratchpadStore(V, X, i);
|
|
neoscrypt_blkmix(X, flag);
|
|
}
|
|
|
|
#pragma unroll 1
|
|
for(int i = 0; i < 128; ++i)
|
|
{
|
|
const uint idx = convert_uchar(((uint *)X)[48] & 0x7F);
|
|
ScratchpadMix(X, V, idx);
|
|
neoscrypt_blkmix(X, flag);
|
|
}
|
|
}
|
|
|
|
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
|
|
__kernel void search(__global const uchar* restrict input, __global uint* restrict output, __global uchar *padcache, const uint target)
|
|
{
|
|
#define CONSTANT_N 128
|
|
#define CONSTANT_r 2
|
|
// X = CONSTANT_r * 2 * BLOCK_SIZE(64); Z is a copy of X for ChaCha
|
|
uint16 X[4], Z[4];
|
|
/* V = CONSTANT_N * CONSTANT_r * 2 * BLOCK_SIZE */
|
|
__global ulong16 *V = (__global ulong16 *)(padcache + (0x8000 * (get_global_id(0) % MAX_GLOBAL_THREADS)));
|
|
uchar outbuf[32];
|
|
uchar data[PASSWORD_LEN];
|
|
|
|
((ulong8 *)data)[0] = ((__global const ulong8 *)input)[0];
|
|
((ulong *)data)[8] = ((__global const ulong *)input)[8];
|
|
((uint *)data)[18] = ((__global const uint *)input)[18];
|
|
((uint *)data)[19] = get_global_id(0);
|
|
|
|
// X = KDF(password, salt)
|
|
fastkdf(data, data, PASSWORD_LEN, (uchar *)X, 256);
|
|
|
|
// Process ChaCha 1st, Salsa 2nd and XOR them - run that through PBKDF2
|
|
CopyBytes128(Z, X, 2);
|
|
|
|
// X = SMix(X); X & Z are swapped, repeat.
|
|
for(bool flag = false;; ++flag)
|
|
{
|
|
SMix(X, V, flag);
|
|
if(flag) break;
|
|
SwapBytes128(X, Z, 256);
|
|
}
|
|
|
|
// blkxor(X, Z)
|
|
((ulong16 *)X)[0] ^= ((ulong16 *)Z)[0];
|
|
((ulong16 *)X)[1] ^= ((ulong16 *)Z)[1];
|
|
|
|
// output = KDF(password, X)
|
|
fastkdf(data, (uchar *)X, FASTKDF_BUFFER_SIZE, outbuf, 32);
|
|
if(((uint *)outbuf)[7] <= target) output[atomic_add(output + 0xFF, 1)] = get_global_id(0);
|
|
} |