/* NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20 */
/* Adapted and improved for 14.x drivers by Wolf9466 (Wolf`) */

#define rotl(x, y)   rotate(x, y)
#define Ch(x, y, z)  bitselect(z, y, x)
#define Maj(x, y, z) Ch((x ^ z), y, z)
#define ROTR32(a, b) (((a) >> (b)) | ((a) << (32 - (b))))
#define ROTL32(a, b) rotate(a, as_uint(b))

__constant uint ES[2] = { 0x00FF00FF, 0xFF00FF00 };

/* Byte-swap a 32-bit word using two masked rotates */
#define EndianSwap(n) (rotate(n & ES[0], 24U) | rotate(n & ES[1], 8U))
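/* A quick worked example of the masked-rotate byte swap (illustrative only):
 * for n = 0x0A0B0C0DU, rotate(n & 0x00FF00FF, 24U) == 0x0D000B00 and
 * rotate(n & 0xFF00FF00, 8U) == 0x000C000A, so EndianSwap(n) == 0x0D0C0B0A.
 * ROTR32 could equivalently be written with the native builtin as
 * rotate((uint)(a), 32U - (uint)(b)); the shift-or form is kept as-is. */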
#define BLOCK_SIZE          64U
#define FASTKDF_BUFFER_SIZE 256U
#ifndef PASSWORD_LEN
#define PASSWORD_LEN        80U
#endif

#ifdef TEST
__constant uchar testsalt[] = {
    135,  99, 188, 101, 252,  81,  54,  91, 243, 212,  78,  99,  46,   1, 113, 232,
      9, 208, 203,  88,  25,  93, 218, 215,  53, 112, 105, 136, 238, 114, 242,  24,
    194, 144, 239, 172,  37, 158, 113,  15, 116, 114,  47,  53,  51, 167, 178, 107,
    192,  90,  92,  37,  59, 116, 234, 107,  80, 251,   2, 251, 145, 185, 119,  89,
    115, 112,  94, 154, 117, 126, 233, 100,  15,  24, 246, 137, 220, 124, 244, 244,
    129, 246, 244, 180,  78, 247, 146, 229,  69, 177, 143,  94,   2, 144,  63,  33,
     89, 136, 234, 174,  38,  37, 183,  62, 176, 243, 136,  30, 249, 195, 129, 227,
    146, 216,  38, 118, 185,  43, 175, 217, 246, 203, 251, 211, 222, 237,  21, 231,
    133, 218, 206,   9, 148, 229,  20, 229, 101, 146, 183, 120, 155,  91,  16,  10,
     86, 198, 185, 179,   1, 197,  69,  95,  44, 133,  49, 225,   2, 115, 182,   6,
     82, 166,  35,   3,  19,  59, 193, 253,  14, 239,  65,  79, 105, 154,  70, 146,
    169, 233, 227,  20,  66,  15,  52, 223, 228, 202, 158, 207,   6, 245, 204, 212,
    220, 108, 204,  39, 136,  66, 215, 186, 247, 184,  92, 171,  56, 166, 162, 105,
    126, 162, 127, 175, 181, 227, 236, 233, 127, 219, 115,  30, 136, 108, 169,  14,
    172,  71,  82, 250, 141, 209,  98, 216, 221, 165, 132, 146,  98,  76, 194, 239,
    123,  90,  91, 193,  58, 121, 235, 161,  51, 144,   5,  41, 216, 160,  93, 173
};
#endif

/* When changing the optimal type, make sure the loops unrolled
 * in _blkcpy, _blkswp and _blkxor are modified accordingly. */
#define OPTIMAL_TYPE uint

/* Fast 32-bit / 64-bit memcpy();
 * len must be a multiple of 32 bytes */
void neoscrypt_blkcpy(void *dstp, const void *srcp, uint len)
{
    OPTIMAL_TYPE *dst = (OPTIMAL_TYPE *) dstp;
    OPTIMAL_TYPE *src = (OPTIMAL_TYPE *) srcp;
    uint i;

#ifdef WITH_UNROLL
    #pragma unroll(1 << max(0, (32 - sizeof(OPTIMAL_TYPE)) >> 2))
    for(i = 0; i < (len / sizeof(OPTIMAL_TYPE)); ++i)
        dst[i] = src[i];
#else
    for(i = 0; i < (len / sizeof(OPTIMAL_TYPE)); i += 4) {
        dst[i]     = src[i];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 2];
        dst[i + 3] = src[i + 3];
    }
#endif
}

void neoscrypt_gl_blkcpy(__global void *dstp, const void *srcp, uint len)
{
    __global OPTIMAL_TYPE *dst = (__global OPTIMAL_TYPE *) dstp;
    OPTIMAL_TYPE *src = (OPTIMAL_TYPE *) srcp;
    uint i;

    for(i = 0; i < (len / sizeof(OPTIMAL_TYPE)); i += 4) {
        dst[i]     = src[i];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 2];
        dst[i + 3] = src[i + 3];
    }
}

/* Fast 32-bit / 64-bit block swapper;
 * len must be a multiple of 32 bytes */
void neoscrypt_blkswp(void *blkAp, void *blkBp, uint len)
{
    OPTIMAL_TYPE *blkA = (OPTIMAL_TYPE *) blkAp;
    OPTIMAL_TYPE *blkB = (OPTIMAL_TYPE *) blkBp;
    OPTIMAL_TYPE t0, t1, t2, t3;
    uint i;

    for(i = 0; i < (len / sizeof(OPTIMAL_TYPE)); i += 4) {
        t0 = blkA[i];
        t1 = blkA[i + 1];
        t2 = blkA[i + 2];
        t3 = blkA[i + 3];
        blkA[i]     = blkB[i];
        blkA[i + 1] = blkB[i + 1];
        blkA[i + 2] = blkB[i + 2];
        blkA[i + 3] = blkB[i + 3];
        blkB[i]     = t0;
        blkB[i + 1] = t1;
        blkB[i + 2] = t2;
        blkB[i + 3] = t3;
    }
}

/* Fast 32-bit / 64-bit block XOR engine;
 * len must be a multiple of 32 bytes */
void neoscrypt_blkxor(void *dstp, const void *srcp, uint len)
{
    OPTIMAL_TYPE *dst = (OPTIMAL_TYPE *) dstp;
    OPTIMAL_TYPE *src = (OPTIMAL_TYPE *) srcp;
    uint i;

    for(i = 0; i < (len / sizeof(OPTIMAL_TYPE)); i += 4) {
        dst[i]     ^= src[i];
        dst[i + 1] ^= src[i + 1];
        dst[i + 2] ^= src[i + 2];
        dst[i + 3] ^= src[i + 3];
    }
}

void neoscrypt_gl_blkxor(void *dstp, __global void *srcp, uint len)
{
    OPTIMAL_TYPE *dst = (OPTIMAL_TYPE *) dstp;
    __global OPTIMAL_TYPE *src = (__global OPTIMAL_TYPE *) srcp;
    uint i;

    for(i = 0; i < (len / sizeof(OPTIMAL_TYPE)); i += 4) {
        dst[i]     ^= src[i];
        dst[i + 1] ^= src[i + 1];
        dst[i + 2] ^= src[i + 2];
        dst[i + 3] ^= src[i + 3];
    }
}

/* 32-bit / 64-bit / 128-bit optimised memcpy() */
void neoscrypt_copy(void *dstp, const void *srcp, uint len)
{
    OPTIMAL_TYPE *dst = (OPTIMAL_TYPE *) dstp;
    OPTIMAL_TYPE *src = (OPTIMAL_TYPE *) srcp;
    uint i, tail;
    const uint c_len = len / sizeof(OPTIMAL_TYPE);

    for(i = 0; i < c_len; ++i)
        dst[i] = src[i];

    tail = len - c_len * sizeof(OPTIMAL_TYPE);
    if(tail) {
#if defined(cl_khr_byte_addressable_store) && !defined(FORCE_BYTE_COPY)
        uchar *dstb = (uchar *) dstp;
        uchar *srcb = (uchar *) srcp;

        for(i = len - tail; i < len; i++)
            dstb[i] = srcb[i];
#else
        uint *dsti = (uint *) dstp;
        uint *srci = (uint *) srcp;

        /* i == c_len here; rescale it into a uint index */
        for(i *= (sizeof(OPTIMAL_TYPE) / sizeof(uint)); i < (len >> 2); ++i)
            dsti[i] = srci[i];
#endif
    }
}
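/* Illustrative walk-through of the tail handling above (not an extra code
 * path): with OPTIMAL_TYPE == uint and len == 10, c_len == 2, so 8 bytes
 * move as two whole words; tail == 2, and the byte loop then copies
 * offsets 8 and 9 individually. */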
/* 32-bit / 64-bit / 128-bit optimised memcpy() into global memory */
void neoscrypt_gl_copy(__global uchar *dstp, const void *srcp, uint len)
{
    __global OPTIMAL_TYPE *dst = (__global OPTIMAL_TYPE *) dstp;
    OPTIMAL_TYPE *src = (OPTIMAL_TYPE *) srcp;
    uint i, tail;

    for(i = 0; i < (len / sizeof(OPTIMAL_TYPE)); i++)
        dst[i] = src[i];

    tail = len & (sizeof(OPTIMAL_TYPE) - 1);
    if(tail) {
        uchar *srcb = (uchar *) srcp;

        for(i = len - tail; i < len; i++)
            dstp[i] = srcb[i];
    }
}

/* 32-bit / 64-bit optimised memory erase aka memset() to zero */
void neoscrypt_erase(void *dstp, uint len)
{
    const OPTIMAL_TYPE null = 0;
    OPTIMAL_TYPE *dst = (OPTIMAL_TYPE *) dstp;
    uint i, tail;

    for(i = 0; i < (len / sizeof(OPTIMAL_TYPE)); i++)
        dst[i] = null;

    tail = len & (sizeof(OPTIMAL_TYPE) - 1);
    if(tail) {
#if defined(cl_khr_byte_addressable_store) && !defined(FORCE_BYTE_COPY)
        uchar *dstb = (uchar *) dstp;

        for(i = len - tail; i < len; i++)
            dstb[i] = 0U;
#else
        uint *dsti = (uint *) dstp;

        for(i *= sizeof(OPTIMAL_TYPE) / sizeof(uint); i < (len >> 2); ++i)
            dsti[i] = 0U;
#endif
    }
}

/* 32-bit / 64-bit optimised XOR engine */
void neoscrypt_xor(void *dstp, const void *srcp, uint len)
{
    OPTIMAL_TYPE *dst = (OPTIMAL_TYPE *) dstp;
    OPTIMAL_TYPE *src = (OPTIMAL_TYPE *) srcp;
    uint i, tail;
    const uint c_len = len / sizeof(OPTIMAL_TYPE);

    for(i = 0; i < c_len; ++i)
        dst[i] ^= src[i];

    tail = len - c_len * sizeof(OPTIMAL_TYPE);
    if(tail) {
#if defined(cl_khr_byte_addressable_store) && !defined(FORCE_BYTE_COPY)
        uchar *dstb = (uchar *) dstp;
        uchar *srcb = (uchar *) srcp;

        for(i = len - tail; i < len; i++)
            dstb[i] ^= srcb[i];
#else
        uint *dsti = (uint *) dstp;
        uint *srci = (uint *) srcp;

        for(i *= (sizeof(OPTIMAL_TYPE) / sizeof(uint)); i < (len >> 2); ++i)
            dsti[i] ^= srci[i];
#endif
    }
}

/* BLAKE2s */

#define BLAKE2S_BLOCK_SIZE 64U
#define BLAKE2S_OUT_SIZE   32U
#define BLAKE2S_KEY_SIZE   32U

/* Parameter block of 32 bytes */
typedef struct blake2s_param_t {
    uchar digest_length;
    uchar key_length;
    uchar fanout;
    uchar depth;
    uint  leaf_length;
    uchar node_offset[6];
    uchar node_depth;
    uchar inner_length;
    uchar salt[8];
    uchar personal[8];
} blake2s_param;

/* State block of 180 bytes */
typedef struct blake2s_state_t {
    uint  h[8];
    uint  t[2];
    uint  f[2];
    uchar buf[2 * BLAKE2S_BLOCK_SIZE];
    uint  buflen;
} blake2s_state;

__constant uint blake2s_IV[8] = {
    0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
    0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};

__constant uchar blake2s_sigma[10][16] = {
    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
    { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
    {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
    {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
    {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
    { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
    { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
    {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
    { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
};

void blake2s_compress(blake2s_state *S, const uint *buf)
{
    uint i;
    uint m[16];
    uint v[16];

    neoscrypt_copy(m, buf, 64);
    neoscrypt_copy(v, S->h, 32);

    v[ 8] = blake2s_IV[0];
    v[ 9] = blake2s_IV[1];
    v[10] = blake2s_IV[2];
    v[11] = blake2s_IV[3];
    v[12] = S->t[0] ^ blake2s_IV[4];
    v[13] = S->t[1] ^ blake2s_IV[5];
    v[14] = S->f[0] ^ blake2s_IV[6];
    v[15] = S->f[1] ^ blake2s_IV[7];

#define G(r, i, a, b, c, d) \
    do { \
        a = a + b + m[blake2s_sigma[r][2 * i + 0]]; \
        d = ROTR32(d ^ a, 16); \
        c = c + d; \
        b = ROTR32(b ^ c, 12); \
        a = a + b + m[blake2s_sigma[r][2 * i + 1]]; \
        d = ROTR32(d ^ a, 8); \
        c = c + d; \
        b = ROTR32(b ^ c, 7); \
    } while(0)

#define ROUND(r) \
    do { \
        G(r, 0, v[ 0], v[ 4], v[ 8], v[12]); \
        G(r, 1, v[ 1], v[ 5], v[ 9], v[13]); \
        G(r, 2, v[ 2], v[ 6], v[10], v[14]); \
        G(r, 3, v[ 3], v[ 7], v[11], v[15]); \
        G(r, 4, v[ 0], v[ 5], v[10], v[15]); \
        G(r, 5, v[ 1], v[ 6], v[11], v[12]); \
        G(r, 6, v[ 2], v[ 7], v[ 8], v[13]); \
        G(r, 7, v[ 3], v[ 4], v[ 9], v[14]); \
    } while(0)

    ROUND(0); ROUND(1); ROUND(2); ROUND(3); ROUND(4);
    ROUND(5); ROUND(6); ROUND(7); ROUND(8); ROUND(9);

    for(i = 0; i < 8; i++)
        S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];

#undef G
#undef ROUND
}
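/* The updater below maintains a 128-byte double buffer and compresses only
 * when more input arrives than fits, leaving the final (possibly partial)
 * block for the finalisation flag in blake2s(). Illustrative walk-through:
 * updating a fresh state with 200 bytes buffers 128 and compresses the
 * first 64; buffers the next 64 and compresses again; the final 8 bytes
 * join the 64 still buffered, leaving buflen == 72 for finalisation. */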
void blake2s_update(blake2s_state *S, const uchar *input, uint input_size)
{
    uint left, fill;

    while(input_size > 0) {
        left = S->buflen;
        fill = 2 * BLAKE2S_BLOCK_SIZE - left;

        if(input_size > fill) {
            /* Buffer fill */
            neoscrypt_copy(&S->buf[left], input, fill);
            S->buflen += fill;
            /* Counter increment */
            S->t[0] += BLAKE2S_BLOCK_SIZE;
            /* Compress */
            blake2s_compress(S, (uint *) S->buf);
            /* Shift buffer left */
            neoscrypt_copy(S->buf, &S->buf[BLAKE2S_BLOCK_SIZE], BLAKE2S_BLOCK_SIZE);
            S->buflen -= BLAKE2S_BLOCK_SIZE;
            input += fill;
            input_size -= fill;
        } else {
            neoscrypt_copy(&S->buf[left], input, input_size);
            S->buflen += input_size;
            /* Do not compress; the data stays buffered */
            input_size = 0;
        }
    }
}

void blake2s(const void *input, const uint input_size,
             const void *key, const uchar key_size,
             void *output, const uchar output_size)
{
    uchar block[BLAKE2S_BLOCK_SIZE];
    blake2s_param P;
    blake2s_state S;

    /* Initialise */
    neoscrypt_erase(&P, sizeof(blake2s_param));
    P.digest_length = output_size;
    P.key_length = key_size;
    P.fanout = 1;
    P.depth = 1;

    neoscrypt_erase(&S, sizeof(blake2s_state));

    /* Initialise the state */
    for(int i = 0; i < 8; ++i)
        S.h[i] = blake2s_IV[i];

    /* Equivalent to neoscrypt_xor(&S, &P, 32): only the first word of P
     * is non-zero at this point, so XOR it into h[0] directly */
    S.h[0] ^= ((uint) output_size) | (((uint) key_size) << 8) | (1U << 16) | (1U << 24);

    /* Process the key padded to a full block */
    neoscrypt_erase(block, BLAKE2S_BLOCK_SIZE);
    neoscrypt_copy(block, key, key_size);
    blake2s_update(&S, block, BLAKE2S_BLOCK_SIZE);

    /* Update */
    blake2s_update(&S, (uchar *) input, input_size);

    /* Finish */
    if(S.buflen > BLAKE2S_BLOCK_SIZE) {
        S.t[0] += BLAKE2S_BLOCK_SIZE;
        blake2s_compress(&S, (uint *) S.buf);
        S.buflen -= BLAKE2S_BLOCK_SIZE;
        neoscrypt_copy(S.buf, &S.buf[BLAKE2S_BLOCK_SIZE], S.buflen);
    }
    S.t[0] += S.buflen;
    S.f[0] = ~0U;
    neoscrypt_erase(&S.buf[S.buflen], 2 * BLAKE2S_BLOCK_SIZE - S.buflen);
    blake2s_compress(&S, (uint *) S.buf);

    /* Write back */
    neoscrypt_copy(output, S.h, output_size);
}
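/* For the parameters used throughout this kernel (output_size == 32,
 * key_size == 32, fanout == depth == 1) the word XORed into h[0] above
 * works out to 32 | (32 << 8) | (1 << 16) | (1 << 24) == 0x01012020;
 * an illustrative check only, as the code computes it generically. */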
/* FastKDF, a fast buffered key derivation function:
 * FASTKDF_BUFFER_SIZE must be a power of 2;
 * password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE;
 * prf_output_size must be <= prf_key_size */
void fastkdf(const uchar *password, const uchar *salt, const uint salt_len,
             uint N, uchar *output, uint output_len)
{
    /* WARNING!
     * This algorithm uses byte-wise addressing for memory blocks.
     * In other words, copying an unaligned memory region using words or
     * bigger entities will significantly slow down the algorithm, and may
     * even corrupt the data if the device does not support it properly.
     * Therefore use byte copying, which is not the fastest, but at least
     * gives reliable results. */

    // BLOCK_SIZE           64U
    // FASTKDF_BUFFER_SIZE  256U
    // BLAKE2S_BLOCK_SIZE   64U
    // BLAKE2S_KEY_SIZE     32U
    // BLAKE2S_OUT_SIZE     32U

    uchar A[FASTKDF_BUFFER_SIZE + BLAKE2S_BLOCK_SIZE];
    uchar B[FASTKDF_BUFFER_SIZE + BLAKE2S_KEY_SIZE];
    uchar prf_output[BLAKE2S_OUT_SIZE], prf_input[BLAKE2S_BLOCK_SIZE], prf_key[BLAKE2S_KEY_SIZE];
    uint bufidx, a, b, i, j;

    /* Initialise the password buffer */
    a = FASTKDF_BUFFER_SIZE / PASSWORD_LEN;
    for(i = 0, j = 0; i < a; ++i, j += PASSWORD_LEN)
        neoscrypt_copy(&A[j], (uchar *) password, PASSWORD_LEN);
    b = FASTKDF_BUFFER_SIZE - j;
    if(b) neoscrypt_copy(&A[j], (uchar *) password, b);

#if (PASSWORD_LEN < BLAKE2S_BLOCK_SIZE)
    /* Fill the wrap-around tail of the password buffer so that reads past
     * FASTKDF_BUFFER_SIZE mirror the head */
    a = BLAKE2S_BLOCK_SIZE / PASSWORD_LEN;
    for(i = 0, j = 0; i < a; ++i, j += PASSWORD_LEN)
        neoscrypt_copy(&A[FASTKDF_BUFFER_SIZE + j], (uchar *) password, PASSWORD_LEN);
    b = BLAKE2S_BLOCK_SIZE - j;
    if(b) neoscrypt_copy(&A[FASTKDF_BUFFER_SIZE + j], (uchar *) password, b);
#else
    neoscrypt_copy(&A[FASTKDF_BUFFER_SIZE], (uchar *) password, BLAKE2S_BLOCK_SIZE);
#endif

    /* Initialise the salt buffer */
    a = FASTKDF_BUFFER_SIZE / salt_len;
    for(i = 0, j = 0; i < a; ++i, j += salt_len)
        neoscrypt_copy(&B[j], salt, salt_len);
    b = FASTKDF_BUFFER_SIZE - j;
    if(b) neoscrypt_copy(&B[j], (uchar *) salt, b);

    if(salt_len < BLAKE2S_BLOCK_SIZE) {
        neoscrypt_copy(&B[FASTKDF_BUFFER_SIZE], (uchar *) salt, salt_len);
        /* Erase the remainder of the tail block when the salt is shorter */
        neoscrypt_erase(&B[FASTKDF_BUFFER_SIZE + salt_len], BLAKE2S_BLOCK_SIZE - salt_len);
    } else {
        neoscrypt_copy(&B[FASTKDF_BUFFER_SIZE], salt, BLAKE2S_KEY_SIZE);
    }
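    /* The loop below derives each new offset as the byte sum of the
     * 32-byte PRF output, wrapped to the buffer size; e.g. if every output
     * byte were 0xFF the sum would be 8160 and bufidx would become
     * 8160 & 255 == 224. A worked bound, not an extra code path. */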
    /* The primary iteration */
    for(i = 0, bufidx = 0; i < N; ++i) {
        /* Copy the PRF input buffer byte by byte to make sure prf_input
         * starts at a well-aligned address; failing to do so may slow
         * down the computation */
        for(j = 0, a = bufidx; j < BLAKE2S_BLOCK_SIZE; ++j, ++a)
            prf_input[j] = A[a];

        /* Copy the PRF key buffer */
        for(j = 0, a = bufidx; j < BLAKE2S_KEY_SIZE; ++j, ++a)
            prf_key[j] = B[a];

        /* PRF */
        blake2s(prf_input, BLAKE2S_BLOCK_SIZE, prf_key, BLAKE2S_KEY_SIZE,
                prf_output, BLAKE2S_OUT_SIZE);

        /* Calculate the next buffer pointer */
        for(j = 0, bufidx = 0; j < BLAKE2S_OUT_SIZE; j++)
            bufidx += prf_output[j];
        bufidx &= (FASTKDF_BUFFER_SIZE - 1);

        /* Modify the salt buffer: B[bufidx..] ^= prf_output */
        for(j = 0, a = bufidx; j < BLAKE2S_OUT_SIZE; ++j, ++a)
            B[a] ^= prf_output[j];

        /* Head modified, tail updated */
        if(bufidx < BLAKE2S_KEY_SIZE)
            for(j = 0, a = FASTKDF_BUFFER_SIZE + bufidx, b = bufidx;
                j < min(BLAKE2S_OUT_SIZE, BLAKE2S_KEY_SIZE - bufidx); ++j, ++a, ++b)
                B[a] = B[b];

        /* Tail modified, head updated */
        if((FASTKDF_BUFFER_SIZE - bufidx) < BLAKE2S_OUT_SIZE)
            neoscrypt_copy(B, &B[FASTKDF_BUFFER_SIZE],
                           BLAKE2S_OUT_SIZE - (FASTKDF_BUFFER_SIZE - bufidx));
    }

    /* Modify and copy into the output buffer */
    if(output_len > FASTKDF_BUFFER_SIZE)
        output_len = FASTKDF_BUFFER_SIZE;

    a = FASTKDF_BUFFER_SIZE - bufidx;
    if(a >= output_len) {
        for(j = 0, i = bufidx; j < output_len; ++j, ++i)
            output[j] = B[i] ^ A[j];
    } else {
        for(j = 0, i = bufidx; j < a; ++j, ++i)
            output[j] = B[i] ^ A[j];
        for(j = a, i = 0; i < output_len - a; ++j, ++i)
            output[j] = B[i] ^ A[j];
    }
}

uint16 neoscrypt_salsa(uint16 X)
{
    uint16 tmp = X;

    for(int i = 0; i < 10; ++i) {
        /* Column round */
        tmp.s4 ^= rotate(tmp.s0 + tmp.sc, 7U);  tmp.s8 ^= rotate(tmp.s4 + tmp.s0, 9U);
        tmp.sc ^= rotate(tmp.s8 + tmp.s4, 13U); tmp.s0 ^= rotate(tmp.sc + tmp.s8, 18U);
        tmp.s9 ^= rotate(tmp.s5 + tmp.s1, 7U);  tmp.sd ^= rotate(tmp.s9 + tmp.s5, 9U);
        tmp.s1 ^= rotate(tmp.sd + tmp.s9, 13U); tmp.s5 ^= rotate(tmp.s1 + tmp.sd, 18U);
        tmp.se ^= rotate(tmp.sa + tmp.s6, 7U);  tmp.s2 ^= rotate(tmp.se + tmp.sa, 9U);
        tmp.s6 ^= rotate(tmp.s2 + tmp.se, 13U); tmp.sa ^= rotate(tmp.s6 + tmp.s2, 18U);
        tmp.s3 ^= rotate(tmp.sf + tmp.sb, 7U);  tmp.s7 ^= rotate(tmp.s3 + tmp.sf, 9U);
        tmp.sb ^= rotate(tmp.s7 + tmp.s3, 13U); tmp.sf ^= rotate(tmp.sb + tmp.s7, 18U);

        /* Row round */
        tmp.s1 ^= rotate(tmp.s0 + tmp.s3, 7U);  tmp.s2 ^= rotate(tmp.s1 + tmp.s0, 9U);
        tmp.s3 ^= rotate(tmp.s2 + tmp.s1, 13U); tmp.s0 ^= rotate(tmp.s3 + tmp.s2, 18U);
        tmp.s6 ^= rotate(tmp.s5 + tmp.s4, 7U);  tmp.s7 ^= rotate(tmp.s6 + tmp.s5, 9U);
        tmp.s4 ^= rotate(tmp.s7 + tmp.s6, 13U); tmp.s5 ^= rotate(tmp.s4 + tmp.s7, 18U);
        tmp.sb ^= rotate(tmp.sa + tmp.s9, 7U);  tmp.s8 ^= rotate(tmp.sb + tmp.sa, 9U);
        tmp.s9 ^= rotate(tmp.s8 + tmp.sb, 13U); tmp.sa ^= rotate(tmp.s9 + tmp.s8, 18U);
        tmp.sc ^= rotate(tmp.sf + tmp.se, 7U);  tmp.sd ^= rotate(tmp.sc + tmp.sf, 9U);
        tmp.se ^= rotate(tmp.sd + tmp.sc, 13U); tmp.sf ^= rotate(tmp.se + tmp.sd, 18U);
    }

    return (X + tmp);
}
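/* Both mixers here are the full 20-round variants: ten loop iterations,
 * each a double round (for Salsa20, a column round followed by a row round
 * with rotations 7/9/13/18; for ChaCha20 below, four column and four
 * diagonal quarter-rounds with rotations 16/12/8/7), and the final X + tmp
 * is the feed-forward of the input into the permuted state. A handy sanity
 * check: an all-zero input state yields an all-zero output, since every
 * rotation and addition of zeros stays zero. */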
uint16 neoscrypt_chacha(uint16 X)
{
    uint16 tmp = X;

    for(int i = 0; i < 10; ++i) {
        /* Column quarter-rounds */
        tmp.s0 += tmp.s4; tmp.sc = rotate(tmp.sc ^ tmp.s0, 16U);
        tmp.s8 += tmp.sc; tmp.s4 = rotate(tmp.s4 ^ tmp.s8, 12U);
        tmp.s0 += tmp.s4; tmp.sc = rotate(tmp.sc ^ tmp.s0, 8U);
        tmp.s8 += tmp.sc; tmp.s4 = rotate(tmp.s4 ^ tmp.s8, 7U);

        tmp.s1 += tmp.s5; tmp.sd = rotate(tmp.sd ^ tmp.s1, 16U);
        tmp.s9 += tmp.sd; tmp.s5 = rotate(tmp.s5 ^ tmp.s9, 12U);
        tmp.s1 += tmp.s5; tmp.sd = rotate(tmp.sd ^ tmp.s1, 8U);
        tmp.s9 += tmp.sd; tmp.s5 = rotate(tmp.s5 ^ tmp.s9, 7U);

        tmp.s2 += tmp.s6; tmp.se = rotate(tmp.se ^ tmp.s2, 16U);
        tmp.sa += tmp.se; tmp.s6 = rotate(tmp.s6 ^ tmp.sa, 12U);
        tmp.s2 += tmp.s6; tmp.se = rotate(tmp.se ^ tmp.s2, 8U);
        tmp.sa += tmp.se; tmp.s6 = rotate(tmp.s6 ^ tmp.sa, 7U);

        tmp.s3 += tmp.s7; tmp.sf = rotate(tmp.sf ^ tmp.s3, 16U);
        tmp.sb += tmp.sf; tmp.s7 = rotate(tmp.s7 ^ tmp.sb, 12U);
        tmp.s3 += tmp.s7; tmp.sf = rotate(tmp.sf ^ tmp.s3, 8U);
        tmp.sb += tmp.sf; tmp.s7 = rotate(tmp.s7 ^ tmp.sb, 7U);

        /* Diagonal quarter-rounds */
        tmp.s0 += tmp.s5; tmp.sf = rotate(tmp.sf ^ tmp.s0, 16U);
        tmp.sa += tmp.sf; tmp.s5 = rotate(tmp.s5 ^ tmp.sa, 12U);
        tmp.s0 += tmp.s5; tmp.sf = rotate(tmp.sf ^ tmp.s0, 8U);
        tmp.sa += tmp.sf; tmp.s5 = rotate(tmp.s5 ^ tmp.sa, 7U);

        tmp.s1 += tmp.s6; tmp.sc = rotate(tmp.sc ^ tmp.s1, 16U);
        tmp.sb += tmp.sc; tmp.s6 = rotate(tmp.s6 ^ tmp.sb, 12U);
        tmp.s1 += tmp.s6; tmp.sc = rotate(tmp.sc ^ tmp.s1, 8U);
        tmp.sb += tmp.sc; tmp.s6 = rotate(tmp.s6 ^ tmp.sb, 7U);

        tmp.s2 += tmp.s7; tmp.sd = rotate(tmp.sd ^ tmp.s2, 16U);
        tmp.s8 += tmp.sd; tmp.s7 = rotate(tmp.s7 ^ tmp.s8, 12U);
        tmp.s2 += tmp.s7; tmp.sd = rotate(tmp.sd ^ tmp.s2, 8U);
        tmp.s8 += tmp.sd; tmp.s7 = rotate(tmp.s7 ^ tmp.s8, 7U);

        tmp.s3 += tmp.s4; tmp.se = rotate(tmp.se ^ tmp.s3, 16U);
        tmp.s9 += tmp.se; tmp.s4 = rotate(tmp.s4 ^ tmp.s9, 12U);
        tmp.s3 += tmp.s4; tmp.se = rotate(tmp.se ^ tmp.s3, 8U);
        tmp.s9 += tmp.se; tmp.s4 = rotate(tmp.s4 ^ tmp.s9, 7U);
    }

    return (X + tmp);
}

void neoscrypt_blkmix(uint16 *XV, uint mixmode)
{
    /* NeoScrypt flow:                     Scrypt flow:
         Xa ^= Xd;  M(Xa'); Ya = Xa";        Xa ^= Xb;  M(Xa'); Ya = Xa";
         Xb ^= Xa"; M(Xb'); Yb = Xb";        Xb ^= Xa"; M(Xb'); Yb = Xb";
         Xc ^= Xb"; M(Xc'); Yc = Xc";        Xa" = Ya;
         Xd ^= Xc"; M(Xd'); Yd = Xd";        Xb" = Yb;
         Xa" = Ya; Xb" = Yc;
         Xc" = Yb; Xd" = Yd; */

    XV[0] ^= XV[3];
    if(!mixmode) XV[0] = neoscrypt_salsa(XV[0]);
    else         XV[0] = neoscrypt_chacha(XV[0]);

    XV[1] ^= XV[0];
    if(!mixmode) XV[1] = neoscrypt_salsa(XV[1]);
    else         XV[1] = neoscrypt_chacha(XV[1]);

    XV[2] ^= XV[1];
    if(!mixmode) XV[2] = neoscrypt_salsa(XV[2]);
    else         XV[2] = neoscrypt_chacha(XV[2]);

    XV[3] ^= XV[2];
    if(!mixmode) XV[3] = neoscrypt_salsa(XV[3]);
    else         XV[3] = neoscrypt_chacha(XV[3]);

    neoscrypt_blkswp(&XV[1], &XV[2], BLOCK_SIZE);
}

/* NeoScrypt core engine:
 * p = 1, salt = password;
 * Basic customisation (required):
 *   profile bit 0:
 *     0 = NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20;
 *     1 = Scrypt(1024, 1, 1) with Salsa20/8;
 *   profile bits 4 to 1:
 *     0000 = FastKDF-BLAKE2s;
 *     0001 = PBKDF2-HMAC-SHA256;
 *     0010 = PBKDF2-HMAC-BLAKE256;
 * Extended customisation (optional):
 *   profile bit 31:
 *     0 = extended customisation absent;
 *     1 = extended customisation present;
 *   profile bits 7 to 5 (rfactor):
 *     000 = r of 1;
 *     001 = r of 2;
 *     010 = r of 4;
 *     ...
 *     111 = r of 128;
 *   profile bits 12 to 8 (Nfactor):
 *     00000 = N of 2;
 *     00001 = N of 4;
 *     00010 = N of 8;
 *     .....
 *     00110 = N of 128;
 *     .....
 *     01001 = N of 1024;
 *     .....
 *     11110 = N of 2147483648;
 *   profile bits 30 to 13 are reserved */
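/* A minimal decoding sketch for the factor fields described above; these
 * helpers are illustrative only (this kernel never parses a profile and
 * instead hardcodes CONSTANT_N == 128 and CONSTANT_r == 2 below): */
uint neoscrypt_profile_N(uint profile) { return (2U << ((profile >> 8) & 0x1F)); }
uint neoscrypt_profile_r(uint profile) { return (1U << ((profile >> 5) & 0x7)); }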
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(__global const uchar *restrict input,
#ifdef TEST
                     __global uchar *restrict output,
#else
                     volatile __global uint *restrict output,
#endif
                     __global uchar *padcache, const uint target)
{
#define CONSTANT_N 128
#define CONSTANT_r 2

    /* Ensure stack alignment by putting those first */
    /* X = CONSTANT_r * 2 * BLOCK_SIZE(64) */
    uchar X[FASTKDF_BUFFER_SIZE];
    /* Z is a copy of X for ChaCha */
    uchar Z[FASTKDF_BUFFER_SIZE];
    /* V = CONSTANT_N * CONSTANT_r * 2 * BLOCK_SIZE */
    __global uchar *V = &padcache[CONSTANT_N * CONSTANT_r * 2 * BLOCK_SIZE *
                                  (get_global_id(0) % MAX_GLOBAL_THREADS)];

#ifndef TEST
    uchar outbuf[32];
    uchar data[PASSWORD_LEN];
    uint i, j;

    for(i = 0; i < PASSWORD_LEN - 4; ++i)
        data[i] = input[i];
    /* The last 4 bytes of the 80-byte block header carry the nonce */
    ((uint *) data)[(PASSWORD_LEN - 4) / sizeof(uint)] = get_global_id(0);
#else
    uchar outbuf[OUTPUT_LEN];
    uchar data[PASSWORD_LEN];
    uint i, j;

    for(i = 0; i < PASSWORD_LEN; ++i)
        data[i] = input[i];
#endif

    const uint mixmode = 0x14;

#ifdef TEST
#ifdef BLAKE2S_TEST
    blake2s(data, 64, data, 32, outbuf, OUTPUT_LEN);
    for(i = 0; i < OUTPUT_LEN; ++i)
        output[i] = outbuf[i];
    return;
#elif defined(FASTKDF_TEST)
    for(i = 0; i < FASTKDF_BUFFER_SIZE; ++i)
        X[i] = testsalt[i];
    fastkdf(data, X, FASTKDF_BUFFER_SIZE, 32, outbuf, 32);
    for(i = 0; i < OUTPUT_LEN; ++i)
        output[i] = outbuf[i];
    return;
#endif
#endif

    /* X = KDF(password, salt) */
    fastkdf(data, data, PASSWORD_LEN, 32, X, CONSTANT_r * 2 * BLOCK_SIZE);

    /* Run the SMix pass once with Salsa and once with ChaCha,
     * then XOR both results into the final FastKDF input */
    neoscrypt_blkcpy(Z, X, CONSTANT_r * 2 * BLOCK_SIZE);

    for(int y = 0; y < 2; ++y) {
        /* SMix first loop: fill the scratchpad */
        for(i = 0; i < 128; ++i) {
            neoscrypt_gl_blkcpy(&V[i << 8], &X[0], 256);
            neoscrypt_blkmix((uint16 *) X, y);
        }
        /* SMix second loop: the first word of the last 64-byte block
         * selects the scratchpad row to read back */
        for(i = 0; i < 128; ++i) {
            neoscrypt_gl_blkxor(&X[0], &V[(((uint *) X)[48] & 127) << 8], 256);
            neoscrypt_blkmix((uint16 *) X, y);
        }
        if(!y) neoscrypt_blkswp(&X[0], &Z[0], 256);
    }

    /* blkxor(X, Z) */
    neoscrypt_blkxor(&X[0], &Z[0], CONSTANT_r * 2 * BLOCK_SIZE);

#ifdef TEST
    fastkdf(data, X, FASTKDF_BUFFER_SIZE, 32, outbuf, 32);
    for(i = 0; i < OUTPUT_LEN; ++i)
        output[i] = outbuf[i];
#else
    /* output = KDF(password, X) */
    fastkdf(data, X, FASTKDF_BUFFER_SIZE, 32, outbuf, 32);

#define SCRYPT_FOUND (0xFF)
#ifdef cl_khr_global_int32_base_atomics
    #define SETFOUND(Xnonce) output[atomic_add(&output[SCRYPT_FOUND], 1)] = Xnonce
#else
    #define SETFOUND(Xnonce) output[output[SCRYPT_FOUND]++] = Xnonce
#endif

    if(((uint *) outbuf)[7] <= target)
        SETFOUND(get_global_id(0));
#endif
}
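/* Scratchpad sizing note (derived from the constants above): each work-item
 * walks CONSTANT_N * CONSTANT_r * 2 * BLOCK_SIZE == 128 * 2 * 2 * 64 ==
 * 32768 bytes of padcache, so the host side is expected to allocate at
 * least 32 KiB * MAX_GLOBAL_THREADS for the padcache buffer. */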