/* NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20 */

/* Adapted and improved for 14.x drivers by Wolf9466 (Wolf`) */

#define rotl(x,y) rotate(x,y)
#define Ch(x,y,z) bitselect(z,y,x)
#define Maj(x,y,z) Ch((x^z),y,z)
#define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - (b))))
#define ROTL32(a,b) rotate(a,as_uint(b))

__constant uint ES[2] = { 0x00FF00FF, 0xFF00FF00 };
#define EndianSwap(n) (rotate(n & ES[0], 24U) | rotate(n & ES[1], 8U))
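/* Worked example of the masked-rotate byte swap above (illustrative):
 * EndianSwap(0x11223344)
 *   = rotate(0x00220044, 24U) | rotate(0x11003300, 8U)
 *   = 0x44002200 | 0x00330011
 *   = 0x44332211
 * i.e. a full 32-bit byte reversal built from two rotates. */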
|
|
#define BLOCK_SIZE 64U
#define FASTKDF_BUFFER_SIZE 256U
#ifndef PASSWORD_LEN
#define PASSWORD_LEN 80U
#endif

#ifdef TEST
__constant uchar testsalt[] = {
    135, 99, 188, 101, 252, 81, 54, 91, 243, 212, 78, 99, 46, 1, 113, 232, 9, 208, 203, 88, 25, 93, 218, 215, 53, 112, 105, 136, 238, 114, 242, 24, 194, 144, 239, 172, 37, 158, 113, 15, 116, 114, 47, 53, 51, 167, 178, 107, 192, 90, 92, 37, 59, 116, 234, 107, 80, 251, 2, 251, 145, 185, 119, 89, 115, 112, 94, 154, 117, 126, 233, 100, 15, 24, 246, 137, 220, 124, 244, 244, 129, 246, 244, 180, 78, 247, 146, 229, 69, 177, 143, 94, 2, 144, 63, 33, 89, 136, 234, 174, 38, 37, 183, 62, 176, 243, 136, 30, 249, 195, 129, 227, 146, 216, 38, 118, 185, 43, 175, 217, 246, 203, 251, 211, 222, 237, 21, 231, 133, 218, 206, 9, 148, 229, 20, 229, 101, 146, 183, 120, 155, 91, 16, 10, 86, 198, 185, 179, 1, 197, 69, 95, 44, 133, 49, 225, 2, 115, 182, 6, 82, 166, 35, 3, 19, 59, 193, 253, 14, 239, 65, 79, 105, 154, 70, 146, 169, 233, 227, 20, 66, 15, 52, 223, 228, 202, 158, 207, 6, 245, 204, 212, 220, 108, 204, 39, 136, 66, 215, 186, 247, 184, 92, 171, 56, 166, 162, 105, 126, 162, 127, 175, 181, 227, 236, 233, 127, 219, 115, 30, 136, 108, 169, 14, 172, 71, 82, 250, 141, 209, 98, 216, 221, 165, 132, 146, 98, 76, 194, 239, 123, 90, 91, 193, 58, 121, 235, 161, 51, 144, 5, 41, 216, 160, 93, 173
};
#endif
|
|
/* When changing the optimal type, make sure the manually unrolled loops
 * in neoscrypt_blkcpy, neoscrypt_blkswp and neoscrypt_blkxor are modified
 * accordingly. */
#define OPTIMAL_TYPE uint

/* Fast 32-bit / 64-bit memcpy();
 * len must be a multiple of 32 bytes */
void neoscrypt_blkcpy(void *dstp, const void *srcp, uint len) {
    OPTIMAL_TYPE *dst = (OPTIMAL_TYPE *) dstp;
    OPTIMAL_TYPE *src = (OPTIMAL_TYPE *) srcp;
    uint i;

#ifdef WITH_UNROLL
#pragma unroll(1 << max(0, (32 - sizeof(OPTIMAL_TYPE)) >> 2))
    for(i = 0; i < (len / sizeof(OPTIMAL_TYPE)); ++i)
        dst[i] = src[i];
#else
    for(i = 0; i < (len / sizeof(OPTIMAL_TYPE)); i += 4) {
        dst[i] = src[i];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 2];
        dst[i + 3] = src[i + 3];
    }
#endif
}
void neoscrypt_gl_blkcpy(__global void *dstp, const void *srcp, uint len) {
    __global OPTIMAL_TYPE *dst = (__global OPTIMAL_TYPE *) dstp;
    OPTIMAL_TYPE *src = (OPTIMAL_TYPE *) srcp;
    uint i;

    for(i = 0; i < (len / sizeof(OPTIMAL_TYPE)); i += 4) {
        dst[i] = src[i];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 2];
        dst[i + 3] = src[i + 3];
    }
}
|
|
/* Fast 32-bit / 64-bit block swapper;
 * len must be a multiple of 32 bytes */
void neoscrypt_blkswp(void *blkAp, void *blkBp, uint len) {
    OPTIMAL_TYPE *blkA = (OPTIMAL_TYPE *) blkAp;
    OPTIMAL_TYPE *blkB = (OPTIMAL_TYPE *) blkBp;
    OPTIMAL_TYPE t0, t1, t2, t3;
    uint i;

    for(i = 0; i < (len / sizeof(OPTIMAL_TYPE)); i += 4) {
        t0 = blkA[i];
        t1 = blkA[i + 1];
        t2 = blkA[i + 2];
        t3 = blkA[i + 3];
        blkA[i] = blkB[i];
        blkA[i + 1] = blkB[i + 1];
        blkA[i + 2] = blkB[i + 2];
        blkA[i + 3] = blkB[i + 3];
        blkB[i] = t0;
        blkB[i + 1] = t1;
        blkB[i + 2] = t2;
        blkB[i + 3] = t3;
    }
}
|
|
/* Fast 32-bit / 64-bit block XOR engine;
 * len must be a multiple of 32 bytes */
void neoscrypt_blkxor(void *dstp, const void *srcp, uint len) {
    OPTIMAL_TYPE *dst = (OPTIMAL_TYPE *) dstp;
    OPTIMAL_TYPE *src = (OPTIMAL_TYPE *) srcp;
    uint i;

    for(i = 0; i < (len / sizeof(OPTIMAL_TYPE)); i += 4) {
        dst[i] ^= src[i];
        dst[i + 1] ^= src[i + 1];
        dst[i + 2] ^= src[i + 2];
        dst[i + 3] ^= src[i + 3];
    }
}

void neoscrypt_gl_blkxor(void *dstp, __global void *srcp, uint len) {
    OPTIMAL_TYPE *dst = (OPTIMAL_TYPE *) dstp;
    __global OPTIMAL_TYPE *src = (__global OPTIMAL_TYPE *) srcp;
    uint i;

    for(i = 0; i < (len / sizeof(OPTIMAL_TYPE)); i += 4) {
        dst[i] ^= src[i];
        dst[i + 1] ^= src[i + 1];
        dst[i + 2] ^= src[i + 2];
        dst[i + 3] ^= src[i + 3];
    }
}
|
|
/* 32-bit / 64-bit / 128-bit optimised memcpy() */
void neoscrypt_copy(void *dstp, const void *srcp, uint len) {
    OPTIMAL_TYPE *dst = (OPTIMAL_TYPE *) dstp;
    OPTIMAL_TYPE *src = (OPTIMAL_TYPE *) srcp;
    uint i, tail;
    const uint c_len = len / sizeof(OPTIMAL_TYPE);

    for(i = 0; i < c_len; ++i)
        dst[i] = src[i];

    tail = len - c_len * sizeof(OPTIMAL_TYPE);
    if(tail) {
#if defined(cl_khr_byte_addressable_store) && !defined(FORCE_BYTE_COPY)
        uchar *dstb = (uchar *) dstp;
        uchar *srcb = (uchar *) srcp;

        for(i = len - tail; i < len; i++)
            dstb[i] = srcb[i];
#else
        uint *dsti = (uint *) dstp;
        uint *srci = (uint *) srcp;

        /* i == c_len here; rescale it to uint units */
        for(i *= (sizeof(OPTIMAL_TYPE) / sizeof(uint)); i < (len >> 2); ++i)
            dsti[i] = srci[i];
#endif
    }
}
|
|
/* 32-bit / 64-bit / 128-bit optimised memcpy() to global memory */
void neoscrypt_gl_copy(__global uchar *dstp, const void *srcp, uint len) {
    __global OPTIMAL_TYPE *dst = (__global OPTIMAL_TYPE *) dstp;
    OPTIMAL_TYPE *src = (OPTIMAL_TYPE *) srcp;
    uint i, tail;

    for(i = 0; i < (len / sizeof(OPTIMAL_TYPE)); i++)
        dst[i] = src[i];

    tail = len & (sizeof(OPTIMAL_TYPE) - 1);
    if(tail) {
        uchar *srcb = (uchar *) srcp;

        for(i = len - tail; i < len; i++)
            dstp[i] = srcb[i];
    }
}
|
|
/* 32-bit / 64-bit optimised memory erase aka memset() to zero */
void neoscrypt_erase(void *dstp, uint len) {
    const OPTIMAL_TYPE null = 0;
    OPTIMAL_TYPE *dst = (OPTIMAL_TYPE *) dstp;
    uint i, tail;

    for(i = 0; i < (len / sizeof(OPTIMAL_TYPE)); i++)
        dst[i] = null;

    tail = len & (sizeof(OPTIMAL_TYPE) - 1);
    if(tail) {
#if defined(cl_khr_byte_addressable_store) && !defined(FORCE_BYTE_COPY)
        uchar *dstb = (uchar *) dstp;

        for(i = len - tail; i < len; i++)
            dstb[i] = 0U;
#else
        uint *dsti = (uint *) dstp;

        /* i == len / sizeof(OPTIMAL_TYPE) here; rescale it to uint units */
        for(i *= sizeof(OPTIMAL_TYPE) / sizeof(uint); i < (len >> 2); ++i)
            dsti[i] = 0U;
#endif
    }
}
|
|
/* 32-bit / 64-bit optimised XOR engine */
void neoscrypt_xor(void *dstp, const void *srcp, uint len) {
    OPTIMAL_TYPE *dst = (OPTIMAL_TYPE *) dstp;
    OPTIMAL_TYPE *src = (OPTIMAL_TYPE *) srcp;
    uint i, tail;
    const uint c_len = len / sizeof(OPTIMAL_TYPE);

    for(i = 0; i < c_len; ++i)
        dst[i] ^= src[i];

    tail = len - c_len * sizeof(OPTIMAL_TYPE);
    if(tail) {
#if defined(cl_khr_byte_addressable_store) && !defined(FORCE_BYTE_COPY)
        uchar *dstb = (uchar *) dstp;
        uchar *srcb = (uchar *) srcp;

        for(i = len - tail; i < len; i++)
            dstb[i] ^= srcb[i];
#else
        uint *dsti = (uint *) dstp;
        uint *srci = (uint *) srcp;

        /* i == c_len here; rescale it to uint units */
        for(i *= (sizeof(OPTIMAL_TYPE) / sizeof(uint)); i < (len >> 2); ++i)
            dsti[i] ^= srci[i];
#endif
    }
}
|
|
/* BLAKE2s */

#define BLAKE2S_BLOCK_SIZE 64U
#define BLAKE2S_OUT_SIZE 32U
#define BLAKE2S_KEY_SIZE 32U

/* Parameter block of 32 bytes */
typedef struct blake2s_param_t {
    uchar digest_length;
    uchar key_length;
    uchar fanout;
    uchar depth;
    uint leaf_length;
    uchar node_offset[6];
    uchar node_depth;
    uchar inner_length;
    uchar salt[8];
    uchar personal[8];
} blake2s_param;

/* State block of 180 bytes */
typedef struct blake2s_state_t {
    uint h[8];
    uint t[2];
    uint f[2];
    uchar buf[2 * BLAKE2S_BLOCK_SIZE];
    uint buflen;
} blake2s_state;

__constant uint blake2s_IV[8] = {
    0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
    0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};
|
|
__constant uchar blake2s_sigma[10][16] = {
    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
    { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
    {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
    {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
    {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
    { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
    { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
    {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
    { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
};
|
|
void blake2s_compress(blake2s_state *S, const uint *buf) {
    uint i;
    uint m[16];
    uint v[16];

    neoscrypt_copy(m, buf, 64);
    neoscrypt_copy(v, S->h, 32);

    v[ 8] = blake2s_IV[0];
    v[ 9] = blake2s_IV[1];
    v[10] = blake2s_IV[2];
    v[11] = blake2s_IV[3];
    v[12] = S->t[0] ^ blake2s_IV[4];
    v[13] = S->t[1] ^ blake2s_IV[5];
    v[14] = S->f[0] ^ blake2s_IV[6];
    v[15] = S->f[1] ^ blake2s_IV[7];

/* The BLAKE2s quarter-round (G function) */
#define G(r,i,a,b,c,d) \
    do { \
        a = a + b + m[blake2s_sigma[r][2*i+0]]; \
        d = ROTR32(d ^ a, 16); \
        c = c + d; \
        b = ROTR32(b ^ c, 12); \
        a = a + b + m[blake2s_sigma[r][2*i+1]]; \
        d = ROTR32(d ^ a, 8); \
        c = c + d; \
        b = ROTR32(b ^ c, 7); \
    } while(0)

/* One full round: four column steps followed by four diagonal steps */
#define ROUND(r) \
    do { \
        G(r, 0, v[ 0], v[ 4], v[ 8], v[12]); \
        G(r, 1, v[ 1], v[ 5], v[ 9], v[13]); \
        G(r, 2, v[ 2], v[ 6], v[10], v[14]); \
        G(r, 3, v[ 3], v[ 7], v[11], v[15]); \
        G(r, 4, v[ 0], v[ 5], v[10], v[15]); \
        G(r, 5, v[ 1], v[ 6], v[11], v[12]); \
        G(r, 6, v[ 2], v[ 7], v[ 8], v[13]); \
        G(r, 7, v[ 3], v[ 4], v[ 9], v[14]); \
    } while(0)

    ROUND(0);
    ROUND(1);
    ROUND(2);
    ROUND(3);
    ROUND(4);
    ROUND(5);
    ROUND(6);
    ROUND(7);
    ROUND(8);
    ROUND(9);

    for(i = 0; i < 8; i++)
        S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];

#undef G
#undef ROUND
}
|
|
void blake2s_update(blake2s_state *S, const uchar *input, uint input_size) {
    uint left, fill;

    while(input_size > 0) {
        left = S->buflen;
        fill = 2 * BLAKE2S_BLOCK_SIZE - left;
        if(input_size > fill) {
            /* Buffer fill */
            neoscrypt_copy(&S->buf[left], input, fill);
            S->buflen += fill;
            /* Counter increment */
            S->t[0] += BLAKE2S_BLOCK_SIZE;
            /* Compress */
            blake2s_compress(S, (uint *) S->buf);
            /* Shift buffer left */
            neoscrypt_copy(S->buf, &S->buf[BLAKE2S_BLOCK_SIZE], BLAKE2S_BLOCK_SIZE);
            S->buflen -= BLAKE2S_BLOCK_SIZE;
            input += fill;
            input_size -= fill;
        } else {
            neoscrypt_copy(&S->buf[left], input, input_size);
            S->buflen += input_size;
            /* Do not compress */
            input_size = 0;
        }
    }
}
|
|
void blake2s(const void *input, const uint input_size,
             const void *key, const uchar key_size,
             void *output, const uchar output_size) {
    uchar block[BLAKE2S_BLOCK_SIZE];
    blake2s_param P;
    blake2s_state S;

    /* Initialise */
    neoscrypt_erase(&P, sizeof(blake2s_param));
    P.digest_length = output_size;
    P.key_length = key_size;
    P.fanout = 1;
    P.depth = 1;

    neoscrypt_erase(&S, sizeof(blake2s_state));
    /* Initialise the state with the IV */
    for(int i = 0; i < 8; ++i)
        S.h[i] = blake2s_IV[i];
    /* XOR in the parameter block; all fields of P other than its first
     * four bytes are zero, so only h[0] needs modifying.
     * Equivalent to neoscrypt_xor(&S, &P, 32). */
    S.h[0] ^= ((uint)output_size) | (((uint)key_size) << 8) | (1U << 16) | (1U << 24);

    /* Process the key as a zero-padded first block */
    neoscrypt_erase(block, BLAKE2S_BLOCK_SIZE);
    neoscrypt_copy(block, key, key_size);
    blake2s_update(&S, block, BLAKE2S_BLOCK_SIZE);
    /* Update */
    blake2s_update(&S, (uchar *) input, input_size);

    /* Finish */
    if(S.buflen > BLAKE2S_BLOCK_SIZE) {
        S.t[0] += BLAKE2S_BLOCK_SIZE;
        blake2s_compress(&S, (uint *) S.buf);
        S.buflen -= BLAKE2S_BLOCK_SIZE;
        neoscrypt_copy(S.buf, &S.buf[BLAKE2S_BLOCK_SIZE], S.buflen);
    }
    S.t[0] += S.buflen;
    S.f[0] = ~0U;
    neoscrypt_erase(&S.buf[S.buflen], 2 * BLAKE2S_BLOCK_SIZE - S.buflen);
    blake2s_compress(&S, (uint *) S.buf);
    /* Write back */
    neoscrypt_copy(output, S.h, output_size);
}
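/* Illustrative aside (not compiled, not used by the kernel): the direct
 * h[0] XOR above matches XORing in the first word of the parameter block,
 * because digest_length, key_length, fanout and depth occupy its first
 * four bytes on a little-endian device: */
#if 0
uint p0 = ((uint)output_size) | (((uint)key_size) << 8) | (1U << 16) | (1U << 24);
/* p0 == ((uint *)&P)[0]; with every other parameter field zero,
 * S.h[0] ^= p0 is all that neoscrypt_xor(&S, &P, 32) would change. */
#endif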
|
|
/* FastKDF, a fast buffered key derivation function:
 * FASTKDF_BUFFER_SIZE must be a power of 2;
 * password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE;
 * prf_output_size must be <= prf_key_size */
void fastkdf(const uchar *password, const uchar *salt, const uint salt_len,
             uint N, uchar *output, uint output_len) {

    /* WARNING!
     * This algorithm uses byte-wise addressing for memory blocks.
     * In other words, copying an unaligned memory region will slow the
     * algorithm down significantly when the copy uses words or bigger
     * entities, and may even corrupt the data on devices which do not
     * handle unaligned accesses properly. Therefore byte copying is used
     * below; it is not the fastest option, but it gives reliable results. */

    /* BLOCK_SIZE           64U
     * FASTKDF_BUFFER_SIZE 256U
     * BLAKE2S_BLOCK_SIZE   64U
     * BLAKE2S_KEY_SIZE     32U
     * BLAKE2S_OUT_SIZE     32U */
    uchar A[FASTKDF_BUFFER_SIZE + BLAKE2S_BLOCK_SIZE];
    uchar B[FASTKDF_BUFFER_SIZE + BLAKE2S_KEY_SIZE];
    uchar prf_output[BLAKE2S_OUT_SIZE], prf_input[BLAKE2S_BLOCK_SIZE],
          prf_key[BLAKE2S_KEY_SIZE];
    uint bufidx, a, b, i, j;

    /* Initialise the password buffer */
    a = FASTKDF_BUFFER_SIZE / PASSWORD_LEN;
    for(i = 0, j = 0; i < a; ++i, j += PASSWORD_LEN)
        neoscrypt_copy(&A[j], (uchar *)password, PASSWORD_LEN);
    b = FASTKDF_BUFFER_SIZE - j;
    if(b)
        neoscrypt_copy(&A[j], (uchar *)password, b);
#if (PASSWORD_LEN < BLAKE2S_BLOCK_SIZE)
    /* Extend the tail of the password buffer with the repeated password */
    a = BLAKE2S_BLOCK_SIZE / PASSWORD_LEN;
    for(i = 0, j = 0; i < a; ++i, j += PASSWORD_LEN)
        neoscrypt_copy(&A[FASTKDF_BUFFER_SIZE + j], (uchar *)password, PASSWORD_LEN);
    b = BLAKE2S_BLOCK_SIZE - j;
    if(b)
        neoscrypt_copy(&A[FASTKDF_BUFFER_SIZE + j], (uchar *)password, b);
#else
    neoscrypt_copy(&A[FASTKDF_BUFFER_SIZE], (uchar *)password, BLAKE2S_BLOCK_SIZE);
#endif

    /* Initialise the salt buffer */
    a = FASTKDF_BUFFER_SIZE / salt_len;
    for(i = 0, j = 0; i < a; ++i, j += salt_len)
        neoscrypt_copy(&B[j], salt, salt_len);
    b = FASTKDF_BUFFER_SIZE - j;
    if(b)
        neoscrypt_copy(&B[j], (uchar *)salt, b);
    if(salt_len < BLAKE2S_KEY_SIZE) {
        neoscrypt_copy(&B[FASTKDF_BUFFER_SIZE], (uchar *)salt, salt_len);
        /* Erase the remainder of the tail when the salt is shorter */
        neoscrypt_erase(&B[FASTKDF_BUFFER_SIZE + salt_len], BLAKE2S_KEY_SIZE - salt_len);
    } else
        neoscrypt_copy(&B[FASTKDF_BUFFER_SIZE], salt, BLAKE2S_KEY_SIZE);

    /* The primary iteration */
    for(i = 0, bufidx = 0; i < N; ++i) {
        /* Copy the PRF input buffer byte by byte to make sure prf_input
         * starts at a well aligned address; failing to do so may slow
         * down the computation. */
        for(j = 0, a = bufidx; j < BLAKE2S_BLOCK_SIZE; ++j, ++a)
            prf_input[j] = A[a];

        /* Copy the PRF key buffer */
        for(j = 0, a = bufidx; j < BLAKE2S_KEY_SIZE; ++j, ++a)
            prf_key[j] = B[a];

        /* PRF */
        blake2s(prf_input, BLAKE2S_BLOCK_SIZE,
                prf_key, BLAKE2S_KEY_SIZE,
                prf_output, BLAKE2S_OUT_SIZE);

        /* Calculate the next buffer pointer */
        for(j = 0, bufidx = 0; j < BLAKE2S_OUT_SIZE; j++)
            bufidx += prf_output[j];
        /* The power-of-2 buffer size makes this mask a cheap modulo */
        bufidx &= (FASTKDF_BUFFER_SIZE - 1);

        /* Modify the salt buffer */
        for(j = 0, a = bufidx; j < BLAKE2S_OUT_SIZE; ++j, ++a)
            B[a] ^= prf_output[j];

        /* Head modified, tail updated */
        if(bufidx < BLAKE2S_KEY_SIZE) {
            for(j = 0, a = FASTKDF_BUFFER_SIZE + bufidx, b = bufidx;
                j < min(BLAKE2S_OUT_SIZE, BLAKE2S_KEY_SIZE - bufidx); ++j, ++a, ++b)
                B[a] = B[b];
        }

        /* Tail modified, head updated */
        if((FASTKDF_BUFFER_SIZE - bufidx) < BLAKE2S_OUT_SIZE)
            neoscrypt_copy(B, &B[FASTKDF_BUFFER_SIZE],
                           BLAKE2S_OUT_SIZE - (FASTKDF_BUFFER_SIZE - bufidx));
    }

    /* Modify and copy into the output buffer */
    if(output_len > FASTKDF_BUFFER_SIZE)
        output_len = FASTKDF_BUFFER_SIZE;

    a = FASTKDF_BUFFER_SIZE - bufidx;
    if(a >= output_len) {
        for(j = 0, i = bufidx; j < output_len; ++j, ++i)
            output[j] = B[i] ^ A[j];
    } else {
        for(j = 0, i = bufidx; j < a; ++j, ++i)
            output[j] = B[i] ^ A[j];
        for(j = a, i = 0; i < output_len - a; ++j, ++i)
            output[j] = B[i] ^ A[j];
    }
}
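/* Usage within this kernel (both calls appear in search() below):
 *   fastkdf(data, data, PASSWORD_LEN, 32, X, CONSTANT_r * 2 * BLOCK_SIZE);
 *   fastkdf(data, X, FASTKDF_BUFFER_SIZE, 32, outbuf, 32);
 * i.e. 32 buffer-mixing rounds with the 80-byte block header as the
 * password, first to derive the 256-byte work area, then to compress it
 * back into a 32-byte hash. */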
|
|
uint16 neoscrypt_salsa(uint16 X)
{
    uint16 tmp = X;

    /* 10 double rounds make Salsa20/20 */
    for(int i = 0; i < 10; ++i)
    {
        /* Column rounds */
        tmp.s4 ^= rotate(tmp.s0 + tmp.sc, 7U); tmp.s8 ^= rotate(tmp.s4 + tmp.s0, 9U); tmp.sc ^= rotate(tmp.s8 + tmp.s4, 13U); tmp.s0 ^= rotate(tmp.sc + tmp.s8, 18U);
        tmp.s9 ^= rotate(tmp.s5 + tmp.s1, 7U); tmp.sd ^= rotate(tmp.s9 + tmp.s5, 9U); tmp.s1 ^= rotate(tmp.sd + tmp.s9, 13U); tmp.s5 ^= rotate(tmp.s1 + tmp.sd, 18U);
        tmp.se ^= rotate(tmp.sa + tmp.s6, 7U); tmp.s2 ^= rotate(tmp.se + tmp.sa, 9U); tmp.s6 ^= rotate(tmp.s2 + tmp.se, 13U); tmp.sa ^= rotate(tmp.s6 + tmp.s2, 18U);
        tmp.s3 ^= rotate(tmp.sf + tmp.sb, 7U); tmp.s7 ^= rotate(tmp.s3 + tmp.sf, 9U); tmp.sb ^= rotate(tmp.s7 + tmp.s3, 13U); tmp.sf ^= rotate(tmp.sb + tmp.s7, 18U);
        /* Row rounds */
        tmp.s1 ^= rotate(tmp.s0 + tmp.s3, 7U); tmp.s2 ^= rotate(tmp.s1 + tmp.s0, 9U); tmp.s3 ^= rotate(tmp.s2 + tmp.s1, 13U); tmp.s0 ^= rotate(tmp.s3 + tmp.s2, 18U);
        tmp.s6 ^= rotate(tmp.s5 + tmp.s4, 7U); tmp.s7 ^= rotate(tmp.s6 + tmp.s5, 9U); tmp.s4 ^= rotate(tmp.s7 + tmp.s6, 13U); tmp.s5 ^= rotate(tmp.s4 + tmp.s7, 18U);
        tmp.sb ^= rotate(tmp.sa + tmp.s9, 7U); tmp.s8 ^= rotate(tmp.sb + tmp.sa, 9U); tmp.s9 ^= rotate(tmp.s8 + tmp.sb, 13U); tmp.sa ^= rotate(tmp.s9 + tmp.s8, 18U);
        tmp.sc ^= rotate(tmp.sf + tmp.se, 7U); tmp.sd ^= rotate(tmp.sc + tmp.sf, 9U); tmp.se ^= rotate(tmp.sd + tmp.sc, 13U); tmp.sf ^= rotate(tmp.se + tmp.sd, 18U);
    }

    return(X + tmp);
}
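/* Each statement group above is one Salsa20 quarter-round; a generic
 * restatement (illustrative only, not used by the kernel): */
#if 0
#define SALSA_QR(a, b, c, d) \
    b ^= rotate(a + d, 7U);  \
    c ^= rotate(b + a, 9U);  \
    d ^= rotate(c + b, 13U); \
    a ^= rotate(d + c, 18U);
/* e.g. the first column round is SALSA_QR(tmp.s0, tmp.s4, tmp.s8, tmp.sc) */
#endif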
|
|
uint16 neoscrypt_chacha(uint16 X)
{
    uint16 tmp = X;

    /* 10 double rounds make ChaCha20/20 */
    for(int i = 0; i < 10; ++i)
    {
        /* Column rounds */
        tmp.s0 += tmp.s4; tmp.sc = rotate(tmp.sc ^ tmp.s0, 16U); tmp.s8 += tmp.sc; tmp.s4 = rotate(tmp.s4 ^ tmp.s8, 12U); tmp.s0 += tmp.s4; tmp.sc = rotate(tmp.sc ^ tmp.s0, 8U); tmp.s8 += tmp.sc; tmp.s4 = rotate(tmp.s4 ^ tmp.s8, 7U);
        tmp.s1 += tmp.s5; tmp.sd = rotate(tmp.sd ^ tmp.s1, 16U); tmp.s9 += tmp.sd; tmp.s5 = rotate(tmp.s5 ^ tmp.s9, 12U); tmp.s1 += tmp.s5; tmp.sd = rotate(tmp.sd ^ tmp.s1, 8U); tmp.s9 += tmp.sd; tmp.s5 = rotate(tmp.s5 ^ tmp.s9, 7U);
        tmp.s2 += tmp.s6; tmp.se = rotate(tmp.se ^ tmp.s2, 16U); tmp.sa += tmp.se; tmp.s6 = rotate(tmp.s6 ^ tmp.sa, 12U); tmp.s2 += tmp.s6; tmp.se = rotate(tmp.se ^ tmp.s2, 8U); tmp.sa += tmp.se; tmp.s6 = rotate(tmp.s6 ^ tmp.sa, 7U);
        tmp.s3 += tmp.s7; tmp.sf = rotate(tmp.sf ^ tmp.s3, 16U); tmp.sb += tmp.sf; tmp.s7 = rotate(tmp.s7 ^ tmp.sb, 12U); tmp.s3 += tmp.s7; tmp.sf = rotate(tmp.sf ^ tmp.s3, 8U); tmp.sb += tmp.sf; tmp.s7 = rotate(tmp.s7 ^ tmp.sb, 7U);
        /* Diagonal rounds */
        tmp.s0 += tmp.s5; tmp.sf = rotate(tmp.sf ^ tmp.s0, 16U); tmp.sa += tmp.sf; tmp.s5 = rotate(tmp.s5 ^ tmp.sa, 12U); tmp.s0 += tmp.s5; tmp.sf = rotate(tmp.sf ^ tmp.s0, 8U); tmp.sa += tmp.sf; tmp.s5 = rotate(tmp.s5 ^ tmp.sa, 7U);
        tmp.s1 += tmp.s6; tmp.sc = rotate(tmp.sc ^ tmp.s1, 16U); tmp.sb += tmp.sc; tmp.s6 = rotate(tmp.s6 ^ tmp.sb, 12U); tmp.s1 += tmp.s6; tmp.sc = rotate(tmp.sc ^ tmp.s1, 8U); tmp.sb += tmp.sc; tmp.s6 = rotate(tmp.s6 ^ tmp.sb, 7U);
        tmp.s2 += tmp.s7; tmp.sd = rotate(tmp.sd ^ tmp.s2, 16U); tmp.s8 += tmp.sd; tmp.s7 = rotate(tmp.s7 ^ tmp.s8, 12U); tmp.s2 += tmp.s7; tmp.sd = rotate(tmp.sd ^ tmp.s2, 8U); tmp.s8 += tmp.sd; tmp.s7 = rotate(tmp.s7 ^ tmp.s8, 7U);
        tmp.s3 += tmp.s4; tmp.se = rotate(tmp.se ^ tmp.s3, 16U); tmp.s9 += tmp.se; tmp.s4 = rotate(tmp.s4 ^ tmp.s9, 12U); tmp.s3 += tmp.s4; tmp.se = rotate(tmp.se ^ tmp.s3, 8U); tmp.s9 += tmp.se; tmp.s4 = rotate(tmp.s4 ^ tmp.s9, 7U);
    }

    return(X + tmp);
}
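/* Likewise for ChaCha20; each statement group above is one quarter-round
 * (illustrative only, not used by the kernel): */
#if 0
#define CHACHA_QR(a, b, c, d) \
    a += b; d = rotate(d ^ a, 16U); \
    c += d; b = rotate(b ^ c, 12U); \
    a += b; d = rotate(d ^ a, 8U);  \
    c += d; b = rotate(b ^ c, 7U);
/* e.g. the first column round is CHACHA_QR(tmp.s0, tmp.s4, tmp.s8, tmp.sc) */
#endif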
|
|
void neoscrypt_blkmix(uint16 *XV, uint mixmode)
{
    /* NeoScrypt flow:                   Scrypt flow:
         Xa ^= Xd;  M(Xa'); Ya = Xa";      Xa ^= Xb;  M(Xa'); Ya = Xa";
         Xb ^= Xa"; M(Xb'); Yb = Xb";      Xb ^= Xa"; M(Xb'); Yb = Xb";
         Xc ^= Xb"; M(Xc'); Yc = Xc";      Xa" = Ya;
         Xd ^= Xc"; M(Xd'); Yd = Xd";      Xb" = Yb;
         Xa" = Ya; Xb" = Yc;
         Xc" = Yb; Xd" = Yd; */

    XV[0] ^= XV[3];

    if(!mixmode) XV[0] = neoscrypt_salsa(XV[0]);
    else XV[0] = neoscrypt_chacha(XV[0]);

    XV[1] ^= XV[0];

    if(!mixmode) XV[1] = neoscrypt_salsa(XV[1]);
    else XV[1] = neoscrypt_chacha(XV[1]);

    XV[2] ^= XV[1];

    if(!mixmode) XV[2] = neoscrypt_salsa(XV[2]);
    else XV[2] = neoscrypt_chacha(XV[2]);

    XV[3] ^= XV[2];

    if(!mixmode) XV[3] = neoscrypt_salsa(XV[3]);
    else XV[3] = neoscrypt_chacha(XV[3]);

    neoscrypt_blkswp(&XV[1], &XV[2], BLOCK_SIZE);
}
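/* The trailing neoscrypt_blkswp() implements the output permutation from
 * the flow chart above: after mixing, (XV[0], XV[1], XV[2], XV[3]) holds
 * (Ya, Yb, Yc, Yd), and swapping blocks 1 and 2 yields (Ya, Yc, Yb, Yd). */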
|
|
/* NeoScrypt core engine:
 * p = 1, salt = password;
 * Basic customisation (required):
 *   profile bit 0:
 *     0 = NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20;
 *     1 = Scrypt(1024, 1, 1) with Salsa20/8;
 *   profile bits 4 to 1:
 *     0000 = FastKDF-BLAKE2s;
 *     0001 = PBKDF2-HMAC-SHA256;
 *     0010 = PBKDF2-HMAC-BLAKE256;
 * Extended customisation (optional):
 *   profile bit 31:
 *     0 = extended customisation absent;
 *     1 = extended customisation present;
 *   profile bits 7 to 5 (rfactor):
 *     000 = r of 1;
 *     001 = r of 2;
 *     010 = r of 4;
 *     ...
 *     111 = r of 128;
 *   profile bits 12 to 8 (Nfactor):
 *     00000 = N of 2;
 *     00001 = N of 4;
 *     00010 = N of 8;
 *     .....
 *     00110 = N of 128;
 *     .....
 *     01001 = N of 1024;
 *     .....
 *     11110 = N of 2147483648;
 *   profile bits 30 to 13 are reserved */
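/* Illustrative decoding of the profile layout above (an aside; this kernel
 * hardcodes NeoScrypt(128, 2, 1), so no profile word is actually parsed): */
#if 0
uint profile = (1U << 31) | (1U << 5) | (6U << 8); /* extended, r = 2, N = 128 */
uint kdf     = (profile >> 1) & 0xF;   /* 0 = FastKDF-BLAKE2s */
uint rfactor = (profile >> 5) & 0x7;
uint Nfactor = (profile >> 8) & 0x1F;
uint r = 1U << rfactor;                /* 000 -> 1, ..., 111 -> 128 */
uint N = 2U << Nfactor;                /* 00000 -> 2, 01001 -> 1024 */
#endif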
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(__global const uchar* restrict input,
#ifdef TEST
                     __global uchar* restrict output,
#else
                     volatile __global uint* restrict output,
#endif
                     __global uchar* padcache,
                     const uint target)
{
#define CONSTANT_N 128
#define CONSTANT_r 2
    /* Ensure stack alignment by declaring these first. */
    /* X = CONSTANT_r * 2 * BLOCK_SIZE(64) */
    uchar X[FASTKDF_BUFFER_SIZE];
    /* Z is a copy of X for ChaCha */
    uchar Z[FASTKDF_BUFFER_SIZE];
    /* V = CONSTANT_N * CONSTANT_r * 2 * BLOCK_SIZE */
    __global uchar *V = &padcache[CONSTANT_N * CONSTANT_r * 2 * BLOCK_SIZE *
                                  (get_global_id(0) % MAX_GLOBAL_THREADS)];
#ifndef TEST
    uchar outbuf[32];
    uchar data[PASSWORD_LEN];
    uint i, j;
    for(i = 0; i < PASSWORD_LEN - 4; ++i)
        data[i] = input[i];
    /* Insert the nonce into the last 4 bytes of the block header */
    ((uint *)data)[(PASSWORD_LEN - 4) / sizeof(uint)] = get_global_id(0);
#else
    uchar outbuf[OUTPUT_LEN];
    uchar data[PASSWORD_LEN];
    uint i, j;
    for(i = 0; i < PASSWORD_LEN; ++i)
        data[i] = input[i];
#endif
    /* Unused here: the mixing function is selected by the loop counter y below */
    const uint mixmode = 0x14;

#ifdef TEST
#ifdef BLAKE2S_TEST
    blake2s(data, 64, data, 32, outbuf, OUTPUT_LEN);
    for(i = 0; i < OUTPUT_LEN; ++i)
        output[i] = outbuf[i];
    return;
#elif defined(FASTKDF_TEST)
    for(i = 0; i < FASTKDF_BUFFER_SIZE; ++i)
        X[i] = testsalt[i];
    fastkdf(data, X, FASTKDF_BUFFER_SIZE, 32, outbuf, 32);
    for(i = 0; i < OUTPUT_LEN; ++i)
        output[i] = outbuf[i];
    return;
#endif
#endif

    /* X = KDF(password, salt) */
    fastkdf(data, data, PASSWORD_LEN, 32, X, CONSTANT_r * 2 * BLOCK_SIZE);

    /* SMix with Salsa20/20 first (y = 0), then ChaCha20/20 (y = 1);
     * the two results are XORed and fed into the final FastKDF */
    neoscrypt_blkcpy(Z, X, CONSTANT_r * 2 * BLOCK_SIZE);

    for(int y = 0; y < 2; ++y)
    {
        /* SMix phase 1: fill the scratchpad with successive mixes of X */
        for(i = 0; i < 128; ++i)
        {
            neoscrypt_gl_blkcpy(&V[i << 8], &X[0], 256);
            neoscrypt_blkmix((uint16 *)X, y);
        }

        /* SMix phase 2: read the scratchpad at data-dependent indices;
         * Integerify takes the first word of the last 64-byte block */
        for(i = 0; i < 128; ++i)
        {
            neoscrypt_gl_blkxor(&X[0], &V[(((uint *)X)[48] & 127) << 8], 256);
            neoscrypt_blkmix((uint16 *)X, y);
        }
        if(!y) neoscrypt_blkswp(&X[0], &Z[0], 256);
    }

    /* blkxor(X, Z) */
    neoscrypt_blkxor(&X[0], &Z[0], CONSTANT_r * 2 * BLOCK_SIZE);

#ifdef TEST
    fastkdf(data, X, FASTKDF_BUFFER_SIZE, 32, outbuf, 32);
    for(i = 0; i < OUTPUT_LEN; ++i)
        output[i] = outbuf[i];
#else
    /* output = KDF(password, X) */
    fastkdf(data, X, FASTKDF_BUFFER_SIZE, 32, outbuf, 32);

/* output[SCRYPT_FOUND] counts the nonces found so far; found nonces are
 * appended to the start of the output buffer */
#define SCRYPT_FOUND (0xFF)
#ifdef cl_khr_global_int32_base_atomics
#define SETFOUND(Xnonce) output[atomic_add(&output[SCRYPT_FOUND], 1)] = Xnonce
#else
#define SETFOUND(Xnonce) output[output[SCRYPT_FOUND]++] = Xnonce
#endif

    if(((uint *)outbuf)[7] <= target)
        SETFOUND(get_global_id(0));
#endif
}