OpenCL GPU miner
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

709 lines
24 KiB

/* NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20 */
/* Adapted and improved for 14.x drivers by Wolf9466 (Wolf`) */
/* Bit-manipulation helpers.
 * Macro arguments that take part in an operator expression are parenthesised
 * so that compound arguments (e.g. ROTR32(x, n - 1) or Maj(a | b, c, d))
 * expand with the intended precedence. */
/* Rotate left; thin alias for the OpenCL rotate() built-in. */
#define rotl(x,y) rotate(x,y)
/* Bitwise choice: for each bit, take y where x is set, otherwise z. */
#define Ch(x,y,z) bitselect(z,y,x)
/* Bitwise majority of x, y and z, expressed through Ch(). */
#define Maj(x,y,z) Ch(((x)^(z)),y,z)
/* Rotate the 32-bit value a right by b bits using plain shifts. */
#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - (b))))
/* Rotate left with the count reinterpreted as uint for rotate(). */
#define ROTL32(a,b) rotate(a,as_uint(b))
/* Byte-lane masks used by EndianSwap(): even bytes and odd bytes of a word. */
__constant uint ES[2] = { 0x00FF00FF, 0xFF00FF00 };
/* Reverse the byte order of a 32-bit word: the even byte lanes are rotated
 * left by 24 and the odd byte lanes by 8, then the halves are merged.
 * The argument is parenthesised so that expressions such as
 * EndianSwap(a | b) are masked as a whole. */
#define EndianSwap(n) (rotate((n) & ES[0], 24U)|rotate((n) & ES[1], 8U))
/* Size in bytes of one mixing block (one uint16 vector). */
#define BLOCK_SIZE 64U
/* Size in bytes of the FastKDF ring buffers; must be a power of 2. */
#define FASTKDF_BUFFER_SIZE 256U
/* Length in bytes of the password passed to fastkdf(); may be overridden
 * from the build options. */
#ifndef PASSWORD_LEN
#define PASSWORD_LEN 80U
#endif
/* Fixed salt bytes for self-test builds only (compiled when TEST is
 * defined). The FASTKDF_TEST branch of search() copies the first
 * FASTKDF_BUFFER_SIZE bytes of this table into its X buffer and feeds
 * them to fastkdf() as the salt. */
#ifdef TEST
__constant uchar testsalt[]= {
135, 99, 188, 101, 252, 81, 54, 91, 243, 212, 78, 99, 46, 1, 113, 232, 9, 208, 203, 88, 25, 93, 218, 215, 53, 112, 105, 136, 238, 114, 242, 24, 194, 144, 239, 172, 37, 158, 113, 15, 116, 114, 47, 53, 51, 167, 178, 107, 192, 90, 92, 37, 59, 116, 234, 107, 80, 251, 2, 251, 145, 185, 119, 89, 115, 112, 94, 154, 117, 126, 233, 100, 15, 24, 246, 137, 220, 124, 244, 244, 129, 246, 244, 180, 78, 247, 146, 229, 69, 177, 143, 94, 2, 144, 63, 33, 89, 136, 234, 174, 38, 37, 183, 62, 176, 243, 136, 30, 249, 195, 129, 227, 146, 216, 38, 118, 185, 43, 175, 217, 246, 203, 251, 211, 222, 237, 21, 231, 133, 218, 206, 9, 148, 229, 20, 229, 101, 146, 183, 120, 155, 91, 16, 10, 86, 198, 185, 179, 1, 197, 69, 95, 44, 133, 49, 225, 2, 115, 182, 6, 82, 166, 35, 3, 19, 59, 193, 253, 14, 239, 65, 79, 105, 154, 70, 146, 169, 233, 227, 20, 66, 15, 52, 223, 228, 202, 158, 207, 6, 245, 204, 212, 220, 108, 204, 39, 136, 66, 215, 186, 247, 184, 92, 171, 56, 166, 162, 105, 126, 162, 127, 175, 181, 227, 236, 233, 127, 219, 115, 30, 136, 108, 169, 14, 172, 71, 82, 250, 141, 209, 98, 216, 221, 165, 132, 146, 98, 76, 194, 239, 123, 90, 91, 193, 58, 121, 235, 161, 51, 144, 5, 41, 216, 160, 93, 173
};
#endif
/* Word type used by the bulk memory helpers below.
 * When changing the optimal type, make sure the unrolled loops in
 * neoscrypt_blkcpy(), neoscrypt_blkswp() and neoscrypt_blkxor() (and their
 * __global counterparts) are modified accordingly. */
#define OPTIMAL_TYPE uint
/* Word-wise block copy, 32-bit / 64-bit depending on OPTIMAL_TYPE.
 * len is a byte count and must be a multiple of 32 bytes: the default loop
 * moves four words per pass and has no remainder handling. When WITH_UNROLL
 * is defined, a one-word-per-pass loop is emitted instead and unrolling is
 * requested through the pragma. */
void neoscrypt_blkcpy(void *dstp, const void *srcp, uint len) {
    OPTIMAL_TYPE *out = (OPTIMAL_TYPE *) dstp;
    const OPTIMAL_TYPE *in = (const OPTIMAL_TYPE *) srcp;
    const uint words = len / sizeof(OPTIMAL_TYPE);
    uint w;
#ifdef WITH_UNROLL
#pragma unroll(1<< max(0, (32- sizeof(OPTIMAL_TYPE))>> 2))
    for(w = 0; w < words; ++w)
        out[w] = in[w];
#else
    for(w = 0; w < words; w += 4) {
        out[w + 0] = in[w + 0];
        out[w + 1] = in[w + 1];
        out[w + 2] = in[w + 2];
        out[w + 3] = in[w + 3];
    }
#endif
}
/* Word-wise block copy from a private buffer into __global memory.
 * len is a byte count; four words are stored per pass with no remainder
 * handling, so len must be a multiple of 4 * sizeof(OPTIMAL_TYPE). */
void neoscrypt_gl_blkcpy(__global void *dstp, const void *srcp, uint len) {
    __global OPTIMAL_TYPE *out = (__global OPTIMAL_TYPE *) dstp;
    const OPTIMAL_TYPE *in = (const OPTIMAL_TYPE *) srcp;
    const uint words = len / sizeof(OPTIMAL_TYPE);
    uint w, k;

    for(w = 0; w < words; w += 4) {
        /* Same four stores as an unrolled body, in ascending order. */
        for(k = 0; k < 4; ++k)
            out[w + k] = in[w + k];
    }
}
/* Word-wise exchange of two private buffers, 32-bit / 64-bit depending on
 * OPTIMAL_TYPE. len is a byte count and must be a multiple of 32 bytes;
 * words are processed in groups of four with no remainder handling.
 * Each group is handled in three phases: save four words of A, overwrite
 * them from B, then store the saved words into B. */
void neoscrypt_blkswp(void *blkAp, void *blkBp, uint len) {
    OPTIMAL_TYPE *left = (OPTIMAL_TYPE *) blkAp;
    OPTIMAL_TYPE *right = (OPTIMAL_TYPE *) blkBp;
    const uint words = len / sizeof(OPTIMAL_TYPE);
    OPTIMAL_TYPE saved[4];
    uint w, k;

    for(w = 0; w < words; w += 4) {
        for(k = 0; k < 4; ++k)
            saved[k] = left[w + k];
        for(k = 0; k < 4; ++k)
            left[w + k] = right[w + k];
        for(k = 0; k < 4; ++k)
            right[w + k] = saved[k];
    }
}
/* Word-wise XOR of src into dst (dst ^= src), both in private memory,
 * 32-bit / 64-bit depending on OPTIMAL_TYPE. len is a byte count and must
 * be a multiple of 32 bytes; four words are combined per pass with no
 * remainder handling. */
void neoscrypt_blkxor(void *dstp, const void *srcp, uint len) {
    OPTIMAL_TYPE *acc = (OPTIMAL_TYPE *) dstp;
    const OPTIMAL_TYPE *in = (const OPTIMAL_TYPE *) srcp;
    const uint words = len / sizeof(OPTIMAL_TYPE);
    uint w, k;

    for(w = 0; w < words; w += 4) {
        for(k = 0; k < 4; ++k)
            acc[w + k] ^= in[w + k];
    }
}
/* Word-wise XOR of a __global source into a private destination
 * (dst ^= src). len is a byte count; four words are combined per pass with
 * no remainder handling, so len must be a multiple of
 * 4 * sizeof(OPTIMAL_TYPE). */
void neoscrypt_gl_blkxor(void *dstp, __global void *srcp, uint len) {
    OPTIMAL_TYPE *acc = (OPTIMAL_TYPE *) dstp;
    __global OPTIMAL_TYPE *in = (__global OPTIMAL_TYPE *) srcp;
    const uint words = len / sizeof(OPTIMAL_TYPE);
    uint w;

    for(w = 0; w < words; w += 4) {
        acc[w + 0] ^= in[w + 0];
        acc[w + 1] ^= in[w + 1];
        acc[w + 2] ^= in[w + 2];
        acc[w + 3] ^= in[w + 3];
    }
}
/* General-purpose copy between private buffers.
 * Whole OPTIMAL_TYPE words are copied first. Any remaining bytes are then
 * handled by one of two compile-time paths:
 *  - byte path (cl_khr_byte_addressable_store defined and FORCE_BYTE_COPY
 *    not defined): the remainder is copied byte by byte;
 *  - word path (otherwise): the remainder is copied in whole uint units up
 *    to len >> 2, so bytes beyond the last whole uint are not copied. */
void neoscrypt_copy(void *dstp, const void *srcp, uint len) {
    OPTIMAL_TYPE *wide_dst = (OPTIMAL_TYPE *) dstp;
    OPTIMAL_TYPE *wide_src = (OPTIMAL_TYPE *) srcp;
    const uint wide_count = len / sizeof(OPTIMAL_TYPE);
    const uint leftover = len - wide_count * sizeof(OPTIMAL_TYPE);
    uint pos;

    for(pos = 0; pos < wide_count; ++pos)
        wide_dst[pos] = wide_src[pos];

    /* Nothing left once len is an exact multiple of the word size. */
    if(leftover == 0)
        return;

#if defined(cl_khr_byte_addressable_store) && !defined(FORCE_BYTE_COPY)
    {
        uchar *byte_dst = (uchar *) dstp;
        uchar *byte_src = (uchar *) srcp;
        for(pos = len - leftover; pos < len; ++pos)
            byte_dst[pos] = byte_src[pos];
    }
#else
    {
        uint *word_dst = (uint *) dstp;
        uint *word_src = (uint *) srcp;
        /* Convert the wide-word position into a uint index. */
        for(pos = wide_count * (sizeof(OPTIMAL_TYPE) / sizeof(uint));
            pos < (len >> 2); ++pos)
            word_dst[pos] = word_src[pos];
    }
#endif
}
/* Copy from a private buffer into __global memory.
 * Whole OPTIMAL_TYPE words are stored first; the remaining
 * len % sizeof(OPTIMAL_TYPE) bytes are then stored one byte at a time. */
void neoscrypt_gl_copy(__global uchar *dstp, const void *srcp, uint len) {
    __global OPTIMAL_TYPE *wide_dst = (__global OPTIMAL_TYPE *) dstp;
    const OPTIMAL_TYPE *wide_src = (const OPTIMAL_TYPE *) srcp;
    const uchar *byte_src = (const uchar *) srcp;
    const uint wide_count = len / sizeof(OPTIMAL_TYPE);
    const uint leftover = len & (sizeof(OPTIMAL_TYPE) - 1);
    uint pos;

    for(pos = 0; pos < wide_count; ++pos)
        wide_dst[pos] = wide_src[pos];

    /* Runs zero times when there is no remainder. */
    for(pos = len - leftover; pos < len; ++pos)
        dstp[pos] = byte_src[pos];
}
/* Zero-fill a private buffer (memset to zero).
 * Whole OPTIMAL_TYPE words are cleared first. A remainder of
 * len % sizeof(OPTIMAL_TYPE) bytes is cleared byte by byte when
 * cl_khr_byte_addressable_store is defined and FORCE_BYTE_COPY is not;
 * otherwise it is cleared in whole uint units up to len >> 2, which leaves
 * bytes beyond the last whole uint untouched. */
void neoscrypt_erase(void *dstp, uint len) {
    const OPTIMAL_TYPE zero = 0;
    OPTIMAL_TYPE *wide = (OPTIMAL_TYPE *) dstp;
    const uint wide_count = len / sizeof(OPTIMAL_TYPE);
    const uint leftover = len & (sizeof(OPTIMAL_TYPE) - 1);
    uint pos;

    for(pos = 0; pos < wide_count; ++pos)
        wide[pos] = zero;

    if(leftover == 0)
        return;

#if defined(cl_khr_byte_addressable_store) && !defined(FORCE_BYTE_COPY)
    {
        uchar *bytes = (uchar *) dstp;
        for(pos = len - leftover; pos < len; ++pos)
            bytes[pos] = 0u;
    }
#else
    {
        uint *words = (uint *) dstp;
        /* Convert the wide-word position into a uint index. */
        for(pos = wide_count * (sizeof(OPTIMAL_TYPE) / sizeof(uint));
            pos < (len >> 2); ++pos)
            words[pos] = 0u;
    }
#endif
}
/* General-purpose XOR of src into dst (dst ^= src), both in private memory.
 * Whole OPTIMAL_TYPE words are combined first. Remaining bytes are combined
 * byte by byte when cl_khr_byte_addressable_store is defined and
 * FORCE_BYTE_COPY is not; otherwise they are combined in whole uint units
 * up to len >> 2, which leaves bytes beyond the last whole uint untouched. */
void neoscrypt_xor(void *dstp, const void *srcp, uint len) {
    OPTIMAL_TYPE *wide_dst = (OPTIMAL_TYPE *) dstp;
    OPTIMAL_TYPE *wide_src = (OPTIMAL_TYPE *) srcp;
    const unsigned wide_count = len / sizeof(OPTIMAL_TYPE);
    const uint leftover = len - wide_count * sizeof(OPTIMAL_TYPE);
    uint pos;

    for(pos = 0; pos < wide_count; ++pos)
        wide_dst[pos] ^= wide_src[pos];

    if(leftover == 0)
        return;

#if defined(cl_khr_byte_addressable_store) && !defined(FORCE_BYTE_COPY)
    {
        uchar *byte_dst = (uchar *) dstp;
        uchar *byte_src = (uchar *) srcp;
        for(pos = len - leftover; pos < len; ++pos)
            byte_dst[pos] ^= byte_src[pos];
    }
#else
    {
        uint *word_dst = (uint *) dstp;
        uint *word_src = (uint *) srcp;
        /* Convert the wide-word position into a uint index. */
        for(pos = wide_count * (sizeof(OPTIMAL_TYPE) / sizeof(uint));
            pos < (len >> 2); ++pos)
            word_dst[pos] ^= word_src[pos];
    }
#endif
}
/* BLAKE2s */
/* Sizes in bytes of one message block, of the digest produced here and of
 * the key accepted here. */
#define BLAKE2S_BLOCK_SIZE 64U
#define BLAKE2S_OUT_SIZE 32U
#define BLAKE2S_KEY_SIZE 32U
/* Parameter block of 32 bytes */
/* NOTE(review): blake2s() below builds the first parameter word by hand
 * (digest length, key length, fanout 1, depth 1) rather than XOR-ing this
 * structure into the state. */
typedef struct blake2s_param_t {
uchar digest_length;
uchar key_length;
uchar fanout;
uchar depth;
uint leaf_length;
uchar node_offset[6];
uchar node_depth;
uchar inner_length;
uchar salt[8];
uchar personal[8];
} blake2s_param;
/* State block of 180 bytes */
/* h: chaining value; t: byte counter (only t[0] is advanced in this file);
 * f: finalisation flags (f[0] is set to all ones for the last block);
 * buf: double-block input buffer; buflen: number of bytes held in buf. */
typedef struct blake2s_state_t {
uint h[8];
uint t[2];
uint f[2];
uchar buf[2 * BLAKE2S_BLOCK_SIZE];
uint buflen;
} blake2s_state;
/* Initialisation vector: seeds h[] in blake2s() and the upper half of the
 * working vector in blake2s_compress(). */
__constant uint blake2s_IV[8] = {
0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
/* Message word schedule: row r lists the order in which the 16 message
 * words are consumed by the G calls of round r. */
__constant uchar blake2s_sigma[10][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } ,
};
/* BLAKE2s compression function: mixes one 64-byte message block into the
 * chaining value S->h.
 *   S   - hash state; h[] is updated in place, t[] and f[] are read only
 *   buf - the 64-byte message block, read as 16 uint words
 * The block is copied with neoscrypt_copy(), which accesses both sides as
 * OPTIMAL_TYPE words. */
void blake2s_compress(blake2s_state *S, const uint *buf) {
uint i;
uint m[16];
uint v[16];
/* m = message words, v[0..7] = current chaining value. */
neoscrypt_copy(m, buf, 64);
neoscrypt_copy(v, S->h, 32);
/* v[8..15] = IV, with the counter and finalisation flags folded into the
 * last four words. */
v[ 8] = blake2s_IV[0];
v[ 9] = blake2s_IV[1];
v[10] = blake2s_IV[2];
v[11] = blake2s_IV[3];
v[12] = S->t[0] ^ blake2s_IV[4];
v[13] = S->t[1] ^ blake2s_IV[5];
v[14] = S->f[0] ^ blake2s_IV[6];
v[15] = S->f[1] ^ blake2s_IV[7];
/* G: mixing step on four working words, consuming the two message words
 * selected by blake2s_sigma[r][2*i] and [2*i+1]; right-rotations are
 * 16, 12, 8 and 7 bits. */
#define G(r,i,a,b,c,d) \
do { \
a = a + b + m[blake2s_sigma[r][2*i+0]]; \
d = ROTR32(d ^ a, 16); \
c = c + d; \
b = ROTR32(b ^ c, 12); \
a = a + b + m[blake2s_sigma[r][2*i+1]]; \
d = ROTR32(d ^ a, 8); \
c = c + d; \
b = ROTR32(b ^ c, 7); \
} while(0)
/* ROUND: four G calls on the columns of the 4x4 word matrix followed by
 * four G calls on its diagonals. */
#define ROUND(r) \
do { \
G(r, 0, v[ 0], v[ 4], v[ 8], v[12]); \
G(r, 1, v[ 1], v[ 5], v[ 9], v[13]); \
G(r, 2, v[ 2], v[ 6], v[10], v[14]); \
G(r, 3, v[ 3], v[ 7], v[11], v[15]); \
G(r, 4, v[ 0], v[ 5], v[10], v[15]); \
G(r, 5, v[ 1], v[ 6], v[11], v[12]); \
G(r, 6, v[ 2], v[ 7], v[ 8], v[13]); \
G(r, 7, v[ 3], v[ 4], v[ 9], v[14]); \
} while(0)
/* Ten rounds, one per row of blake2s_sigma. */
ROUND(0);
ROUND(1);
ROUND(2);
ROUND(3);
ROUND(4);
ROUND(5);
ROUND(6);
ROUND(7);
ROUND(8);
ROUND(9);
/* Feed-forward: fold both halves of the working vector into h. */
for(i = 0; i < 8; i++)
S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
#undef G
#undef ROUND
}
/* Absorb input_size bytes of input into the BLAKE2s state.
 * The state keeps a two-block buffer. While the pending input does not fit
 * into the free space, the buffer is topped up, its first block is
 * compressed and the second block is shifted to the front. Input that fits
 * is only buffered, so the most recent data is never compressed here; that
 * is left to the finalisation code in blake2s().
 * Only t[0] of the byte counter is advanced (no carry into t[1]). */
void blake2s_update(blake2s_state *S, const uchar *input, uint input_size) {
    uint used = S->buflen;
    uint room = 2 * BLAKE2S_BLOCK_SIZE - used;

    while(input_size > room) {
        /* Top the buffer up to two full blocks. */
        neoscrypt_copy(&S->buf[used], input, room);
        S->buflen += room;

        /* Account for and compress the first block. */
        S->t[0] += BLAKE2S_BLOCK_SIZE;
        blake2s_compress(S, (uint *) S->buf);

        /* Move the second block to the front. */
        neoscrypt_copy(S->buf, &S->buf[BLAKE2S_BLOCK_SIZE], BLAKE2S_BLOCK_SIZE);
        S->buflen -= BLAKE2S_BLOCK_SIZE;

        input += room;
        input_size -= room;

        used = S->buflen;
        room = 2 * BLAKE2S_BLOCK_SIZE - used;
    }

    if(input_size > 0) {
        /* The rest fits: buffer it without compressing. */
        neoscrypt_copy(&S->buf[used], input, input_size);
        S->buflen += input_size;
    }
}
/* One-shot keyed BLAKE2s.
 *   input, input_size  - message bytes
 *   key, key_size      - key bytes; the key is zero-padded to one block and
 *                        absorbed before the message. key_size must not
 *                        exceed BLAKE2S_BLOCK_SIZE (the size of the local
 *                        key block).
 *   output, output_size - receives the first output_size bytes of the
 *                        chaining value; at most 32 bytes are available.
 * The key block is always absorbed, even when key_size is 0.
 * The unused blake2s_param local of earlier revisions has been removed: the
 * only parameter word that is non-zero (digest length, key length, fanout 1,
 * depth 1) is XOR-ed into h[0] directly. */
void blake2s(const void *input, const uint input_size,
             const void *key, const uchar key_size,
             void *output, const uchar output_size) {
    /* The copy/erase helpers access this buffer through uint pointers, so
     * request word alignment explicitly. */
    uchar block[BLAKE2S_BLOCK_SIZE] __attribute__((aligned(4)));
    blake2s_state S;

    /* Initialise the state: zero everything, load the IV and fold in the
     * first parameter word. All other parameter words are zero. */
    neoscrypt_erase(&S, sizeof(blake2s_state));
    for(int i = 0; i < 8; ++i)
        S.h[i] = blake2s_IV[i];
    S.h[0] ^= ((uint)output_size) | (((uint)key_size) << 8) | (1U << 16) | (1U << 24);

    /* Absorb the zero-padded key as the first block. */
    neoscrypt_erase(block, BLAKE2S_BLOCK_SIZE);
    neoscrypt_copy(block, key, key_size);
    blake2s_update(&S, block, BLAKE2S_BLOCK_SIZE);

    /* Update */
    blake2s_update(&S, (uchar *) input, input_size);

    /* Finish: if more than one block is buffered, compress the first one
     * as an ordinary block and move the rest to the front. */
    if(S.buflen > BLAKE2S_BLOCK_SIZE) {
        S.t[0] += BLAKE2S_BLOCK_SIZE;
        blake2s_compress(&S, (uint *) S.buf);
        S.buflen -= BLAKE2S_BLOCK_SIZE;
        neoscrypt_copy(S.buf, &S.buf[BLAKE2S_BLOCK_SIZE], S.buflen);
    }

    /* Last block: count its bytes, raise the finalisation flag and
     * zero-pad the buffer before the final compression. */
    S.t[0] += S.buflen;
    S.f[0] = ~0U;
    neoscrypt_erase(&S.buf[S.buflen], 2 * BLAKE2S_BLOCK_SIZE - S.buflen);
    blake2s_compress(&S, (uint *) S.buf);

    /* Write back */
    neoscrypt_copy(output, S.h, output_size);
}
/* FastKDF, a fast buffered key derivation function:
 * FASTKDF_BUFFER_SIZE must be a power of 2;
 * password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE;
 * prf_output_size must be <= prf_key_size.
 *
 *   password   - PASSWORD_LEN bytes (the length is the compile-time macro)
 *   salt       - salt_len bytes; salt_len must be non-zero because it is
 *                used as a divisor
 *   N          - number of PRF (keyed BLAKE2s) iterations
 *   output     - receives output_len bytes; output_len is capped at
 *                FASTKDF_BUFFER_SIZE
 *
 * A holds the password repeated over FASTKDF_BUFFER_SIZE bytes plus one PRF
 * input block of overhang; B holds the salt in the same way plus one PRF key
 * of overhang. The overhang lets a window that starts near the end of the
 * ring be read without wrapping. */
void fastkdf(const uchar *password, const uchar *salt, const uint salt_len,
             uint N, uchar *output, uint output_len) {
    /* WARNING!
     * This algorithm addresses its buffers byte-wise. Copying from or to an
     * unaligned position with word-sized accesses can slow the algorithm
     * down considerably and may even corrupt data on devices that do not
     * support it. Positions that depend on bufidx are therefore always
     * accessed byte by byte; this is not the fastest option but gives
     * reliable results. The buffers themselves are word aligned because the
     * copy helpers and blake2s() access them through uint pointers. */
    uchar A[FASTKDF_BUFFER_SIZE + BLAKE2S_BLOCK_SIZE] __attribute__((aligned(4)));
    uchar B[FASTKDF_BUFFER_SIZE + BLAKE2S_KEY_SIZE] __attribute__((aligned(4)));
    uchar prf_output[BLAKE2S_OUT_SIZE] __attribute__((aligned(4)));
    uchar prf_input[BLAKE2S_BLOCK_SIZE] __attribute__((aligned(4)));
    uchar prf_key[BLAKE2S_KEY_SIZE] __attribute__((aligned(4)));
    uint bufidx, a, b, i, j;

    /* Initialise the password buffer: whole repetitions first, then a
     * partial one to reach FASTKDF_BUFFER_SIZE. */
    a = FASTKDF_BUFFER_SIZE / PASSWORD_LEN;
    for(i = 0, j = 0; i < a; ++i, j += PASSWORD_LEN)
        neoscrypt_copy(&A[j], (uchar *)password, PASSWORD_LEN);
    b = FASTKDF_BUFFER_SIZE - j;
    if(b)
        neoscrypt_copy(&A[j], (uchar *)password, b);

    /* Initialise the overhang of the password buffer so that it repeats the
     * start of the ring. */
#if (PASSWORD_LEN < BLAKE2S_BLOCK_SIZE)
    /* Short password: repeat it across the overhang. The destination is
     * offset by FASTKDF_BUFFER_SIZE; writing at &A[j] would only rewrite
     * the head and leave the overhang uninitialised. */
    a = BLAKE2S_BLOCK_SIZE / PASSWORD_LEN;
    for(i = 0, j = 0; i < a; ++i, j += PASSWORD_LEN)
        neoscrypt_copy(&A[FASTKDF_BUFFER_SIZE + j], (uchar *)password, PASSWORD_LEN);
    b = BLAKE2S_BLOCK_SIZE - j;
    if(b)
        neoscrypt_copy(&A[FASTKDF_BUFFER_SIZE + j], (uchar *)password, b);
#else
    neoscrypt_copy(&A[FASTKDF_BUFFER_SIZE], (uchar *)password, BLAKE2S_BLOCK_SIZE);
#endif

    /* Initialise the salt buffer in the same way. */
    a = FASTKDF_BUFFER_SIZE / salt_len;
    for(i = 0, j = 0; i < a; ++i, j += salt_len)
        neoscrypt_copy(&B[j], salt, salt_len);
    b = FASTKDF_BUFFER_SIZE - j;
    if(b)
        neoscrypt_copy(&B[j], (uchar *)salt, b);

    /* Overhang of the salt buffer: the first BLAKE2S_KEY_SIZE bytes of the
     * salt, zero-padded when the salt is shorter. B only provides
     * BLAKE2S_KEY_SIZE bytes of overhang, so this is the bound to use;
     * sizing it with BLAKE2S_BLOCK_SIZE writes past the end of B. */
    for(j = 0; j < BLAKE2S_KEY_SIZE; ++j)
        B[FASTKDF_BUFFER_SIZE + j] = (j < salt_len) ? salt[j] : (uchar)0;

    /* The primary iteration */
    for(i = 0, bufidx = 0; i < N; ++i) {
        /* Copy the PRF input byte by byte so that prf_input is aligned no
         * matter where the window starts. */
        for(j = 0, a = bufidx; j < BLAKE2S_BLOCK_SIZE; ++j, ++a)
            prf_input[j] = A[a];

        /* Copy the PRF key the same way. */
        for(j = 0, a = bufidx; j < BLAKE2S_KEY_SIZE; ++j, ++a)
            prf_key[j] = B[a];

        /* PRF */
        blake2s(prf_input, BLAKE2S_BLOCK_SIZE,
                prf_key, BLAKE2S_KEY_SIZE,
                prf_output, BLAKE2S_OUT_SIZE);

        /* Next window position: sum of the PRF output bytes, reduced to
         * the ring size. */
        for(j = 0, bufidx = 0; j < BLAKE2S_OUT_SIZE; j++)
            bufidx += prf_output[j];
        bufidx &= (FASTKDF_BUFFER_SIZE - 1);

        /* Modify the salt buffer */
        for(j = 0, a = bufidx; j < BLAKE2S_OUT_SIZE; ++j, ++a)
            B[a] ^= prf_output[j];

        /* Head modified, tail updated */
        if(bufidx < BLAKE2S_KEY_SIZE) {
            b = min(BLAKE2S_OUT_SIZE, BLAKE2S_KEY_SIZE - bufidx);
            for(j = 0; j < b; ++j)
                B[FASTKDF_BUFFER_SIZE + bufidx + j] = B[bufidx + j];
        }

        /* Tail modified, head updated. Done byte-wise like the case above:
         * the count is an arbitrary number of bytes. */
        if((FASTKDF_BUFFER_SIZE - bufidx) < BLAKE2S_OUT_SIZE) {
            b = BLAKE2S_OUT_SIZE - (FASTKDF_BUFFER_SIZE - bufidx);
            for(j = 0; j < b; ++j)
                B[j] = B[FASTKDF_BUFFER_SIZE + j];
        }
    }

    /* Modify and copy into the output buffer: output = B (read from bufidx,
     * wrapping at FASTKDF_BUFFER_SIZE) XOR A (read from 0). */
    if(output_len > FASTKDF_BUFFER_SIZE)
        output_len = FASTKDF_BUFFER_SIZE;
    a = FASTKDF_BUFFER_SIZE - bufidx;
    if(a >= output_len) {
        for(j = 0, i = bufidx; j < output_len; ++j, ++i)
            output[j] = B[i] ^ A[j];
    } else {
        for(j = 0, i = bufidx; j < a; ++j, ++i)
            output[j] = B[i] ^ A[j];
        for(j = a, i = 0; i < output_len - a; ++j, ++i)
            output[j] = B[i] ^ A[j];
    }
}
/* Salsa20/20 core on a 16-word state held in a uint16 vector.
 * Ten double rounds (four column quarter-rounds followed by four row
 * quarter-rounds) are applied to a copy of X, and the original X is added
 * to the result. */
uint16 neoscrypt_salsa(uint16 X)
{
    uint16 state = X;

/* One Salsa20 quarter-round on the lanes a, b, c, d of `state`. */
#define NEOSCRYPT_SALSA_QR(a, b, c, d) \
    do { \
        state.b ^= rotate(state.a + state.d, 7U); \
        state.c ^= rotate(state.b + state.a, 9U); \
        state.d ^= rotate(state.c + state.b, 13U); \
        state.a ^= rotate(state.d + state.c, 18U); \
    } while(0)

    for(int dr = 0; dr < 10; ++dr)
    {
        /* Columns */
        NEOSCRYPT_SALSA_QR(s0, s4, s8, sc);
        NEOSCRYPT_SALSA_QR(s5, s9, sd, s1);
        NEOSCRYPT_SALSA_QR(sa, se, s2, s6);
        NEOSCRYPT_SALSA_QR(sf, s3, s7, sb);
        /* Rows */
        NEOSCRYPT_SALSA_QR(s0, s1, s2, s3);
        NEOSCRYPT_SALSA_QR(s5, s6, s7, s4);
        NEOSCRYPT_SALSA_QR(sa, sb, s8, s9);
        NEOSCRYPT_SALSA_QR(sf, sc, sd, se);
    }

#undef NEOSCRYPT_SALSA_QR

    return(X + state);
}
/* ChaCha20/20 core on a 16-word state held in a uint16 vector.
 * Ten double rounds (four column quarter-rounds followed by four diagonal
 * quarter-rounds) are applied to a copy of X, and the original X is added
 * to the result. */
uint16 neoscrypt_chacha(uint16 X)
{
    uint16 state = X;

/* One ChaCha quarter-round on the lanes a, b, c, d of `state`. */
#define NEOSCRYPT_CHACHA_QR(a, b, c, d) \
    do { \
        state.a += state.b; state.d = rotate(state.d ^ state.a, 16U); \
        state.c += state.d; state.b = rotate(state.b ^ state.c, 12U); \
        state.a += state.b; state.d = rotate(state.d ^ state.a, 8U); \
        state.c += state.d; state.b = rotate(state.b ^ state.c, 7U); \
    } while(0)

    for(int dr = 0; dr < 10; ++dr)
    {
        /* Columns */
        NEOSCRYPT_CHACHA_QR(s0, s4, s8, sc);
        NEOSCRYPT_CHACHA_QR(s1, s5, s9, sd);
        NEOSCRYPT_CHACHA_QR(s2, s6, sa, se);
        NEOSCRYPT_CHACHA_QR(s3, s7, sb, sf);
        /* Diagonals */
        NEOSCRYPT_CHACHA_QR(s0, s5, sa, sf);
        NEOSCRYPT_CHACHA_QR(s1, s6, sb, sc);
        NEOSCRYPT_CHACHA_QR(s2, s7, s8, sd);
        NEOSCRYPT_CHACHA_QR(s3, s4, s9, se);
    }

#undef NEOSCRYPT_CHACHA_QR

    return(X + state);
}
/* Block mix over four 64-byte blocks XV[0..3] (r = 2).
 * Each block is XOR-ed with its predecessor (block 0 with block 3) and then
 * run through the mixing core M: Salsa when mixmode is 0, ChaCha otherwise.
 * Finally the two middle blocks are exchanged:
 *   Xa ^= Xd; Xa = M(Xa);
 *   Xb ^= Xa; Xb = M(Xb);
 *   Xc ^= Xb; Xc = M(Xc);
 *   Xd ^= Xc; Xd = M(Xd);
 *   output order: Xa, Xc, Xb, Xd */
void neoscrypt_blkmix(uint16 *XV, uint mixmode)
{
    uint16 held;
    uint k;

    for(k = 0; k < 4; ++k) {
        /* Predecessor index wraps, so block 0 is chained to block 3. */
        XV[k] ^= XV[(k + 3) & 3];
        if(mixmode)
            XV[k] = neoscrypt_chacha(XV[k]);
        else
            XV[k] = neoscrypt_salsa(XV[k]);
    }

    /* Exchange the two middle blocks. */
    held = XV[1];
    XV[1] = XV[2];
    XV[2] = held;
}
/* NeoScrypt core engine:
* p = 1, salt = password;
* Basic customisation (required):
* profile bit 0:
* 0 = NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20;
* 1 = Scrypt(1024, 1, 1) with Salsa20/8;
* profile bits 4 to 1:
* 0000 = FastKDF-BLAKE2s;
* 0001 = PBKDF2-HMAC-SHA256;
* 0010 = PBKDF2-HMAC-BLAKE256;
* Extended customisation (optional):
* profile bit 31:
* 0 = extended customisation absent;
* 1 = extended customisation present;
* profile bits 7 to 5 (rfactor):
* 000 = r of 1;
* 001 = r of 2;
* 010 = r of 4;
* ...
* 111 = r of 128;
* profile bits 12 to 8 (Nfactor):
* 00000 = N of 2;
* 00001 = N of 4;
* 00010 = N of 8;
* .....
* 00110 = N of 128;
* .....
* 01001 = N of 1024;
* .....
* 11110 = N of 2147483648;
* profile bits 30 to 13 are reserved */
/* NeoScrypt search kernel: one hash per work item.
 *   input    - block header; in non-TEST builds the first PASSWORD_LEN - 4
 *              bytes are used and the last 4 bytes are replaced by the
 *              work item's global id (the nonce)
 *   output   - non-TEST: uint array; slot SCRYPT_FOUND (0xFF) is a counter
 *              that is incremented for every hit and whose previous value
 *              selects the slot receiving the nonce.
 *              TEST: receives OUTPUT_LEN bytes of the computed result
 *   padcache - scratch memory; each work item uses
 *              CONSTANT_N * CONSTANT_r * 2 * BLOCK_SIZE bytes, selected by
 *              global id modulo MAX_GLOBAL_THREADS
 *   target   - a nonce is reported when the last 32-bit word of the hash
 *              is <= target
 * WORKSIZE, MAX_GLOBAL_THREADS and (in TEST builds) OUTPUT_LEN are expected
 * from the build options. */
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(__global const uchar* restrict input,
#ifdef TEST
                     __global uchar* restrict output,
#else
                     volatile __global uint* restrict output,
#endif
                     __global uchar* padcache,
                     const uint target)
{
#define CONSTANT_N 128
#define CONSTANT_r 2
    /* X and Z are reinterpreted as uint16 / uint below, so request the
     * alignment of the widest type used on them instead of relying on
     * declaration order. */
    /* X = CONSTANT_r * 2 * BLOCK_SIZE(64) */
    uchar X[FASTKDF_BUFFER_SIZE] __attribute__((aligned(64)));
    /* Z is a copy of X for the second mixing pass */
    uchar Z[FASTKDF_BUFFER_SIZE] __attribute__((aligned(64)));
    /* V = CONSTANT_N * CONSTANT_r * 2 * BLOCK_SIZE */
    __global uchar *V = &padcache[CONSTANT_N * CONSTANT_r * 2 * BLOCK_SIZE *
                                  (get_global_id(0) % MAX_GLOBAL_THREADS)];
#ifndef TEST
    uchar outbuf[32] __attribute__((aligned(4)));
    uchar data[PASSWORD_LEN] __attribute__((aligned(4)));
    uint i;
    /* Header without its last word, then the nonce in that word. */
    for(i = 0; i < PASSWORD_LEN - 4; ++i)
        data[i] = input[i];
    ((uint *)data)[(PASSWORD_LEN - 4) / sizeof(uint)] = get_global_id(0);
#else
    uchar outbuf[OUTPUT_LEN] __attribute__((aligned(4)));
    uchar data[PASSWORD_LEN] __attribute__((aligned(4)));
    uint i;
    for(i = 0; i < PASSWORD_LEN; ++i)
        data[i] = input[i];
#endif
#ifdef TEST
#ifdef BLAKE2S_TEST
    /* Self-test of the PRF alone. */
    blake2s(data, 64, data, 32, outbuf, OUTPUT_LEN);
    for(i = 0; i < OUTPUT_LEN; ++i)
        output[i] = outbuf[i];
    return;
#elif defined(FASTKDF_TEST)
    /* Self-test of FastKDF alone, with the fixed test salt. */
    for(i = 0; i < FASTKDF_BUFFER_SIZE; ++i)
        X[i] = testsalt[i];
    fastkdf(data, X, FASTKDF_BUFFER_SIZE, 32, outbuf, 32);
    for(i = 0; i < OUTPUT_LEN; ++i)
        output[i] = outbuf[i];
    return;
#endif
#endif
    /* X = KDF(password, salt) */
    fastkdf(data, data, PASSWORD_LEN, 32, X, CONSTANT_r * 2 * BLOCK_SIZE);
    /* Keep a copy of the KDF output for the second pass. */
    neoscrypt_blkcpy(Z, X, CONSTANT_r * 2 * BLOCK_SIZE);
    /* Two passes over the same KDF output: y == 0 mixes X with Salsa, then
     * X and Z are exchanged so that y == 1 mixes the saved copy with
     * ChaCha. Afterwards Z holds the Salsa result and X the ChaCha result. */
    for(int y = 0; y < 2; ++y)
    {
        /* Fill the scratchpad: V[i] = X, then X = blkmix(X). */
        for(i = 0; i < 128; ++i)
        {
            neoscrypt_gl_blkcpy(&V[i << 8], &X[0], 256);
            neoscrypt_blkmix((uint16 *)X, y);
        }
        /* Read back: the first word of the last block of X selects the
         * scratchpad entry that is XOR-ed into X before mixing. */
        for(i = 0; i < 128; ++i)
        {
            neoscrypt_gl_blkxor(&X[0], &V[(((uint *)X)[48] & 127) << 8], 256);
            neoscrypt_blkmix((uint16 *)X, y);
        }
        if(!y) neoscrypt_blkswp(&X[0], &Z[0], 256);
    }
    /* blkxor(X, Z) */
    neoscrypt_blkxor(&X[0], &Z[0], CONSTANT_r * 2 * BLOCK_SIZE);
#ifdef TEST
    fastkdf(data, X, FASTKDF_BUFFER_SIZE, 32, outbuf, 32);
    for(i = 0; i < OUTPUT_LEN; ++i)
        output[i] = outbuf[i];
#else
    /* output = KDF(password, X) */
    fastkdf(data, X, FASTKDF_BUFFER_SIZE, 32, outbuf, 32);
#define SCRYPT_FOUND (0xFF)
#ifdef cl_khr_global_int32_base_atomics
#define SETFOUND(Xnonce) output[atomic_add(&output[SCRYPT_FOUND], 1)]= Xnonce
#else
#define SETFOUND(Xnonce) output[output[SCRYPT_FOUND]++] = Xnonce
#endif
    if (((uint *)outbuf)[7] <= target)
        SETFOUND(get_global_id(0));
#endif
}