// NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20

// The AMD OpenCL compiler ignores the unroll pragma in the Salsa and ChaCha
// round functions, so their unroll factors are fixed here instead.
#define SALSA_SMALL_UNROLL 3
#define CHACHA_SMALL_UNROLL 3

// If SMALL_BLAKE2S is defined, BLAKE2S_UNROLL is interpreted
// as the unroll factor; it must divide cleanly into ten.
// Usually a bad idea.
//#define SMALL_BLAKE2S
//#define BLAKE2S_UNROLL 5

#define BLOCK_SIZE 64U
#define FASTKDF_BUFFER_SIZE 256U
#ifndef PASSWORD_LEN
#define PASSWORD_LEN 80U
#endif

#if !defined(cl_khr_byte_addressable_store)
#error "Device does not support byte-addressable stores"
#endif

// Swaps two buffers in 128-byte chunks via XOR, without a temporary; len is in bytes
void SwapBytes128(void *restrict A, void *restrict B, uint len)
{
	#pragma unroll 2
	for(int i = 0; i < (len >> 7); ++i)
	{
		((ulong16 *)A)[i] ^= ((ulong16 *)B)[i];
		((ulong16 *)B)[i] ^= ((ulong16 *)A)[i];
		((ulong16 *)A)[i] ^= ((ulong16 *)B)[i];
	}
}

// Copies len 128-byte chunks
void CopyBytes128(void *restrict dst, const void *restrict src, uint len)
{
	#pragma unroll 2
	for(int i = 0; i < len; ++i)
		((ulong16 *)dst)[i] = ((ulong16 *)src)[i];
}

// Byte-wise copy of len bytes
void CopyBytes(void *restrict dst, const void *restrict src, uint len)
{
	for(int i = 0; i < len; ++i)
		((uchar *)dst)[i] = ((uchar *)src)[i];
}

void XORBytesInPlace(void *restrict dst, const void *restrict src, uint len)
{
	for(int i = 0; i < len; ++i)
		((uchar *)dst)[i] ^= ((uchar *)src)[i];
}

void XORBytes(void *restrict dst, const void *restrict src1, const void *restrict src2, uint len)
{
	#pragma unroll 1
	for(int i = 0; i < len; ++i)
		((uchar *)dst)[i] = ((uchar *)src1)[i] ^ ((uchar *)src2)[i];
}

// Blake2S

#define BLAKE2S_BLOCK_SIZE 64U
#define BLAKE2S_OUT_SIZE 32U
#define BLAKE2S_KEY_SIZE 32U

static const __constant uint BLAKE2S_IV[8] =
{
	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
	0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
};

static const __constant uchar BLAKE2S_SIGMA[10][16] =
{
	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
	{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
	{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
	{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
	{ 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
	{ 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
	{  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
	{ 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
};

#define BLAKE_G(idx0, idx1, a, b, c, d, key) do { \
	a += b + key[BLAKE2S_SIGMA[idx0][idx1]]; \
	d = rotate(d ^ a, 16U); \
	c += d; \
	b = rotate(b ^ c, 20U); \
	a += b + key[BLAKE2S_SIGMA[idx0][idx1 + 1]]; \
	d = rotate(d ^ a, 24U); \
	c += d; \
	b = rotate(b ^ c, 25U); \
} while(0)
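/* Reference sketch, not called by the kernel: the same G step as BLAKE_G above,
 * on a uint4 holding (a, b, c, d), with m0/m1 standing in for the two message
 * words selected through BLAKE2S_SIGMA. The function name and the uint4 packing
 * are illustrative assumptions; only the arithmetic mirrors the macro. Since
 * OpenCL rotate() rotates left, the constants 16/20/24/25 correspond to the
 * BLAKE2s right rotations 16/12/8/7. */
uint4 Blake2SRefG(uint4 v, const uint m0, const uint m1)
{
	v.x += v.y + m0; v.w = rotate(v.w ^ v.x, 16U);	// ror 16
	v.z += v.w;      v.y = rotate(v.y ^ v.z, 20U);	// ror 12
	v.x += v.y + m1; v.w = rotate(v.w ^ v.x, 24U);	// ror 8
	v.z += v.w;      v.y = rotate(v.y ^ v.z, 25U);	// ror 7
	return v;
}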
void Blake2S(uint *restrict inout, const uint *restrict inkey)
{
	uint16 V;
	uint8 tmpblock;

	// Load the IV into both the state (V.lo) and the constants (V.hi)
	V.lo = V.hi = vload8(0U, BLAKE2S_IV);

	// XOR in the BLAKE2s parameter word: digest length 32, key length 32, fanout 1, depth 1
	V.s0 ^= 0x01012020;

	// Keep a copy of the chaining value for the feed-forward
	tmpblock = V.lo;

	// XOR in the length of the message so far (including this block);
	// there are two uints for this field, but the high uint is zero
	V.sc ^= BLAKE2S_BLOCK_SIZE;

	// First compression: the padded key block is the message
	#ifdef SMALL_BLAKE2S
	#pragma unroll BLAKE2S_UNROLL
	#else
	#pragma unroll
	#endif
	for(int x = 0; x < 10; ++x)
	{
		BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inkey);
		BLAKE_G(x, 0x02, V.s1, V.s5, V.s9, V.sd, inkey);
		BLAKE_G(x, 0x04, V.s2, V.s6, V.sa, V.se, inkey);
		BLAKE_G(x, 0x06, V.s3, V.s7, V.sb, V.sf, inkey);
		BLAKE_G(x, 0x08, V.s0, V.s5, V.sa, V.sf, inkey);
		BLAKE_G(x, 0x0A, V.s1, V.s6, V.sb, V.sc, inkey);
		BLAKE_G(x, 0x0C, V.s2, V.s7, V.s8, V.sd, inkey);
		BLAKE_G(x, 0x0E, V.s3, V.s4, V.s9, V.se, inkey);
	}

	// XOR the low half of the state with the high half,
	// then with the original chaining value
	V.lo ^= V.hi ^ tmpblock;

	// Reload the constants (IV into V.hi)
	V.hi = vload8(0U, BLAKE2S_IV);

	// Keep a copy of the chaining value for the feed-forward
	tmpblock = V.lo;

	// XOR in the message length again (two blocks = 128 bytes)
	V.sc ^= BLAKE2S_BLOCK_SIZE << 1;

	// Last block compression - XOR the finalization constant into the state
	V.se ^= 0xFFFFFFFFU;

	// Second compression: the 64-byte input is the message
	#ifdef SMALL_BLAKE2S
	#pragma unroll BLAKE2S_UNROLL
	#else
	#pragma unroll
	#endif
	for(int x = 0; x < 10; ++x)
	{
		BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inout);
		BLAKE_G(x, 0x02, V.s1, V.s5, V.s9, V.sd, inout);
		BLAKE_G(x, 0x04, V.s2, V.s6, V.sa, V.se, inout);
		BLAKE_G(x, 0x06, V.s3, V.s7, V.sb, V.sf, inout);
		BLAKE_G(x, 0x08, V.s0, V.s5, V.sa, V.sf, inout);
		BLAKE_G(x, 0x0A, V.s1, V.s6, V.sb, V.sc, inout);
		BLAKE_G(x, 0x0C, V.s2, V.s7, V.s8, V.sd, inout);
		BLAKE_G(x, 0x0E, V.s3, V.s4, V.s9, V.se, inout);
	}

	// XOR the low half of the state with the high half, then with the chaining value
	V.lo ^= V.hi ^ tmpblock;

	// Store the result in the input/output buffer
	vstore8(V.lo, 0, inout);
}
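/* Note: the two compressions above implement BLAKE2s-256 keyed with a 32-byte
 * key over a single 64-byte message: block one is the zero-padded key block
 * (byte counter 64), block two is the 64-byte input marked as the final block
 * (byte counter 128, finalization word 0xFFFFFFFF). The 0x01012020 parameter
 * word encodes digest length 32, key length 32, fanout 1 and depth 1. */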
/* FastKDF, a fast buffered key derivation function:
 * FASTKDF_BUFFER_SIZE must be a power of 2;
 * password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE;
 * prf_output_size must be <= prf_key_size. */
void fastkdf(const uchar *restrict password, const uchar *restrict salt, const uint salt_len, uchar *restrict output, uint output_len)
{
	/* WARNING!
	 * This algorithm uses byte-wise addressing for its memory blocks.
	 * Copying an unaligned memory region with words or larger entities
	 * slows the algorithm down significantly, and may even corrupt the
	 * data on devices which do not handle it properly.
	 * Byte copying is therefore used: not the fastest option, but it
	 * gives reliable results. */

	// BLOCK_SIZE            64U
	// FASTKDF_BUFFER_SIZE  256U
	// BLAKE2S_BLOCK_SIZE    64U
	// BLAKE2S_KEY_SIZE      32U
	// BLAKE2S_OUT_SIZE      32U
	uchar bufidx = 0;
	uint8 Abuffer[9], Bbuffer[9] = { (uint8)(0) };
	uchar *A = (uchar *)Abuffer, *B = (uchar *)Bbuffer;

	// Initialize the password buffer
	#pragma unroll 1
	for(int i = 0; i < (FASTKDF_BUFFER_SIZE >> 3); ++i)
		((ulong *)A)[i] = ((ulong *)password)[i % 10];

	((uint16 *)(A + FASTKDF_BUFFER_SIZE))[0] = ((uint16 *)password)[0];

	// Initialize the salt buffer
	if(salt_len == FASTKDF_BUFFER_SIZE)
	{
		((ulong16 *)B)[0] = ((ulong16 *)B)[2] = ((ulong16 *)salt)[0];
		((ulong16 *)B)[1] = ((ulong16 *)B)[3] = ((ulong16 *)salt)[1];
	}
	else
	{
		// salt_len is 80 bytes here
		#pragma unroll 1
		for(int i = 0; i < (FASTKDF_BUFFER_SIZE >> 3); ++i)
			((ulong *)B)[i] = ((ulong *)salt)[i % 10];

		// The rest was initialized to zero earlier
		#pragma unroll 1
		for(int i = 0; i < 10; ++i)
			((ulong *)(B + FASTKDF_BUFFER_SIZE))[i] = ((ulong *)salt)[i];
	}

	// The primary iteration
	#pragma unroll 1
	for(int i = 0; i < 32; ++i)
	{
		// Make the key buffer twice the size of the key so it fits a full
		// Blake2S block; this way no temporary buffer is needed inside Blake2S
		uchar input[BLAKE2S_BLOCK_SIZE], key[BLAKE2S_BLOCK_SIZE] = { 0 };

		// Copy the input and the key into their buffers
		CopyBytes(input, A + bufidx, BLAKE2S_BLOCK_SIZE);
		CopyBytes(key, B + bufidx, BLAKE2S_KEY_SIZE);

		// PRF
		Blake2S((uint *)input, (uint *)key);

		// Calculate the next buffer offset
		bufidx = 0;

		for(int x = 0; x < BLAKE2S_OUT_SIZE; ++x)
			bufidx += input[x];

		// bufidx is a uchar, so it already wraps modulo 256
		//bufidx &= (FASTKDF_BUFFER_SIZE - 1);

		// Modify the salt buffer
		XORBytesInPlace(B + bufidx, input, BLAKE2S_OUT_SIZE);

		if(bufidx < BLAKE2S_KEY_SIZE)
		{
			// Head modified, tail updated
			// The min() from the reference code is redundant here, since
			// BLAKE2S_KEY_SIZE - bufidx never exceeds BLAKE2S_OUT_SIZE
			//CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, min(BLAKE2S_OUT_SIZE, BLAKE2S_KEY_SIZE - bufidx));
			CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, BLAKE2S_KEY_SIZE - bufidx);
		}
		else if((FASTKDF_BUFFER_SIZE - bufidx) < BLAKE2S_OUT_SIZE)
		{
			// Tail modified, head updated
			CopyBytes(B, B + FASTKDF_BUFFER_SIZE, BLAKE2S_OUT_SIZE - (FASTKDF_BUFFER_SIZE - bufidx));
		}
	}

	// Modify and copy into the output buffer

	// The straightforward byte loop below crashes the AMD compiler,
	// so the output is assembled with two XORBytes() calls instead
	//for(uint i = 0; i < output_len; ++i, ++bufidx)
	//	output[i] = B[bufidx] ^ A[i];

	uint left = FASTKDF_BUFFER_SIZE - bufidx;
	//uint left = (~bufidx) + 1;

	if(left < output_len)
	{
		XORBytes(output, B + bufidx, A, left);
		XORBytes(output + left, B, A + left, output_len - left);
	}
	else
	{
		XORBytes(output, B + bufidx, A, output_len);
	}
}

// One Salsa20 double round: a column round followed by a row round
#define SALSA_CORE(state) do { \
	state.s4 ^= rotate(state.s0 + state.sc, 7U); state.s8 ^= rotate(state.s4 + state.s0, 9U); state.sc ^= rotate(state.s8 + state.s4, 13U); state.s0 ^= rotate(state.sc + state.s8, 18U); \
	state.s9 ^= rotate(state.s5 + state.s1, 7U); state.sd ^= rotate(state.s9 + state.s5, 9U); state.s1 ^= rotate(state.sd + state.s9, 13U); state.s5 ^= rotate(state.s1 + state.sd, 18U); \
	state.se ^= rotate(state.sa + state.s6, 7U); state.s2 ^= rotate(state.se + state.sa, 9U); state.s6 ^= rotate(state.s2 + state.se, 13U); state.sa ^= rotate(state.s6 + state.s2, 18U); \
	state.s3 ^= rotate(state.sf + state.sb, 7U); state.s7 ^= rotate(state.s3 + state.sf, 9U); state.sb ^= rotate(state.s7 + state.s3, 13U); state.sf ^= rotate(state.sb + state.s7, 18U); \
	state.s1 ^= rotate(state.s0 + state.s3, 7U); state.s2 ^= rotate(state.s1 + state.s0, 9U); state.s3 ^= rotate(state.s2 + state.s1, 13U); state.s0 ^= rotate(state.s3 + state.s2, 18U); \
	state.s6 ^= rotate(state.s5 + state.s4, 7U); state.s7 ^= rotate(state.s6 + state.s5, 9U); state.s4 ^= rotate(state.s7 + state.s6, 13U); state.s5 ^= rotate(state.s4 + state.s7, 18U); \
	state.sb ^= rotate(state.sa + state.s9, 7U); state.s8 ^= rotate(state.sb + state.sa, 9U); state.s9 ^= rotate(state.s8 + state.sb, 13U); state.sa ^= rotate(state.s9 + state.s8, 18U); \
	state.sc ^= rotate(state.sf + state.se, 7U); state.sd ^= rotate(state.sc + state.sf, 9U); state.se ^= rotate(state.sd + state.sc, 13U); state.sf ^= rotate(state.se + state.sd, 18U); \
} while(0)

// Salsa20/20: ten double rounds plus the feed-forward X + st
uint16 salsa_small_scalar_rnd(uint16 X)
{
	uint16 st = X;

#if SALSA_SMALL_UNROLL == 1
	for(int i = 0; i < 10; ++i)
	{
		SALSA_CORE(st);
	}
#elif SALSA_SMALL_UNROLL == 2
	for(int i = 0; i < 5; ++i)
	{
		SALSA_CORE(st);
		SALSA_CORE(st);
	}
#elif SALSA_SMALL_UNROLL == 3
	for(int i = 0; i < 4; ++i)
	{
		SALSA_CORE(st);
		if(i == 3) break;
		SALSA_CORE(st);
		SALSA_CORE(st);
	}
#elif SALSA_SMALL_UNROLL == 4
	for(int i = 0; i < 3; ++i)
	{
		SALSA_CORE(st);
		SALSA_CORE(st);
		if(i == 2) break;
		SALSA_CORE(st);
		SALSA_CORE(st);
	}
#else
	for(int i = 0; i < 2; ++i)
	{
		SALSA_CORE(st);
		SALSA_CORE(st);
		SALSA_CORE(st);
		SALSA_CORE(st);
		SALSA_CORE(st);
	}
#endif

	return(X + st);
}
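/* Reference sketch, not called by the kernel: the Salsa20 quarter-round that
 * SALSA_CORE expands four at a time per column/row. The function name and the
 * uint4 packing q = (y0, y1, y2, y3) are illustrative assumptions. */
uint4 Salsa20RefQR(uint4 q)
{
	q.y ^= rotate(q.x + q.w, 7U);
	q.z ^= rotate(q.y + q.x, 9U);
	q.w ^= rotate(q.z + q.y, 13U);
	q.x ^= rotate(q.w + q.z, 18U);
	return q;
}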
// One ChaCha20 double round; the as_ushort2().s10 swizzle is a 16-bit rotation
#define CHACHA_CORE(state) do { \
	state.s0 += state.s4; state.sc = as_uint(as_ushort2(state.sc ^ state.s0).s10); state.s8 += state.sc; state.s4 = rotate(state.s4 ^ state.s8, 12U); state.s0 += state.s4; state.sc = rotate(state.sc ^ state.s0, 8U); state.s8 += state.sc; state.s4 = rotate(state.s4 ^ state.s8, 7U); \
	state.s1 += state.s5; state.sd = as_uint(as_ushort2(state.sd ^ state.s1).s10); state.s9 += state.sd; state.s5 = rotate(state.s5 ^ state.s9, 12U); state.s1 += state.s5; state.sd = rotate(state.sd ^ state.s1, 8U); state.s9 += state.sd; state.s5 = rotate(state.s5 ^ state.s9, 7U); \
	state.s2 += state.s6; state.se = as_uint(as_ushort2(state.se ^ state.s2).s10); state.sa += state.se; state.s6 = rotate(state.s6 ^ state.sa, 12U); state.s2 += state.s6; state.se = rotate(state.se ^ state.s2, 8U); state.sa += state.se; state.s6 = rotate(state.s6 ^ state.sa, 7U); \
	state.s3 += state.s7; state.sf = as_uint(as_ushort2(state.sf ^ state.s3).s10); state.sb += state.sf; state.s7 = rotate(state.s7 ^ state.sb, 12U); state.s3 += state.s7; state.sf = rotate(state.sf ^ state.s3, 8U); state.sb += state.sf; state.s7 = rotate(state.s7 ^ state.sb, 7U); \
	state.s0 += state.s5; state.sf = as_uint(as_ushort2(state.sf ^ state.s0).s10); state.sa += state.sf; state.s5 = rotate(state.s5 ^ state.sa, 12U); state.s0 += state.s5; state.sf = rotate(state.sf ^ state.s0, 8U); state.sa += state.sf; state.s5 = rotate(state.s5 ^ state.sa, 7U); \
	state.s1 += state.s6; state.sc = as_uint(as_ushort2(state.sc ^ state.s1).s10); state.sb += state.sc; state.s6 = rotate(state.s6 ^ state.sb, 12U); state.s1 += state.s6; state.sc = rotate(state.sc ^ state.s1, 8U); state.sb += state.sc; state.s6 = rotate(state.s6 ^ state.sb, 7U); \
	state.s2 += state.s7; state.sd = as_uint(as_ushort2(state.sd ^ state.s2).s10); state.s8 += state.sd; state.s7 = rotate(state.s7 ^ state.s8, 12U); state.s2 += state.s7; state.sd = rotate(state.sd ^ state.s2, 8U); state.s8 += state.sd; state.s7 = rotate(state.s7 ^ state.s8, 7U); \
	state.s3 += state.s4; state.se = as_uint(as_ushort2(state.se ^ state.s3).s10); state.s9 += state.se; state.s4 = rotate(state.s4 ^ state.s9, 12U); state.s3 += state.s4; state.se = rotate(state.se ^ state.s3, 8U); state.s9 += state.se; state.s4 = rotate(state.s4 ^ state.s9, 7U); \
} while(0)

// ChaCha20/20: ten double rounds plus the feed-forward X + st
uint16 chacha_small_scalar_rnd(uint16 X)
{
	uint16 st = X;

#if CHACHA_SMALL_UNROLL == 1
	for(int i = 0; i < 10; ++i)
	{
		CHACHA_CORE(st);
	}
#elif CHACHA_SMALL_UNROLL == 2
	for(int i = 0; i < 5; ++i)
	{
		CHACHA_CORE(st);
		CHACHA_CORE(st);
	}
#elif CHACHA_SMALL_UNROLL == 3
	for(int i = 0; i < 4; ++i)
	{
		CHACHA_CORE(st);
		if(i == 3) break;
		CHACHA_CORE(st);
		CHACHA_CORE(st);
	}
#elif CHACHA_SMALL_UNROLL == 4
	for(int i = 0; i < 3; ++i)
	{
		CHACHA_CORE(st);
		CHACHA_CORE(st);
		if(i == 2) break;
		CHACHA_CORE(st);
		CHACHA_CORE(st);
	}
#else
	for(int i = 0; i < 2; ++i)
	{
		CHACHA_CORE(st);
		CHACHA_CORE(st);
		CHACHA_CORE(st);
		CHACHA_CORE(st);
		CHACHA_CORE(st);
	}
#endif

	return(X + st);
}
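/* Reference sketch, not called by the kernel: the ChaCha20 quarter-round that
 * CHACHA_CORE expands four at a time per column/diagonal. The function name and
 * the uint4 packing q = (a, b, c, d) are illustrative assumptions; the 16-bit
 * rotation is the one CHACHA_CORE writes as a halfword swap. */
uint4 ChaCha20RefQR(uint4 q)
{
	q.x += q.y; q.w = rotate(q.w ^ q.x, 16U);
	q.z += q.w; q.y = rotate(q.y ^ q.z, 12U);
	q.x += q.y; q.w = rotate(q.w ^ q.x, 8U);
	q.z += q.w; q.y = rotate(q.y ^ q.z, 7U);
	return q;
}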
void neoscrypt_blkmix(uint16 *XV, bool alg)
{
	/* NeoScrypt flow:                   Scrypt flow:
	     Xa ^= Xd;  M(Xa'); Ya = Xa";      Xa ^= Xb;  M(Xa'); Ya = Xa";
	     Xb ^= Xa"; M(Xb'); Yb = Xb";      Xb ^= Xa"; M(Xb'); Yb = Xb";
	     Xc ^= Xb"; M(Xc'); Yc = Xc";      Xa" = Ya;
	     Xd ^= Xc"; M(Xd'); Yd = Xd";      Xb" = Yb;
	     Xa" = Ya; Xb" = Yc;
	     Xc" = Yb; Xd" = Yd; */

	XV[0] ^= XV[3];

	if(!alg)
	{
		// Salsa20/20 mixing
		XV[0] = salsa_small_scalar_rnd(XV[0]); XV[1] ^= XV[0];
		XV[1] = salsa_small_scalar_rnd(XV[1]); XV[2] ^= XV[1];
		XV[2] = salsa_small_scalar_rnd(XV[2]); XV[3] ^= XV[2];
		XV[3] = salsa_small_scalar_rnd(XV[3]);
	}
	else
	{
		// ChaCha20/20 mixing
		XV[0] = chacha_small_scalar_rnd(XV[0]); XV[1] ^= XV[0];
		XV[1] = chacha_small_scalar_rnd(XV[1]); XV[2] ^= XV[1];
		XV[2] = chacha_small_scalar_rnd(XV[2]); XV[3] ^= XV[2];
		XV[3] = chacha_small_scalar_rnd(XV[3]);
	}

	// Swap XV[1] and XV[2] (Xb" = Yc, Xc" = Yb) without a temporary
	XV[1] ^= XV[2];
	XV[2] ^= XV[1];
	XV[1] ^= XV[2];
}

// Each scratchpad entry is 256 bytes (two ulong16 vectors)
void ScratchpadStore(__global void *V, const void *X, uchar idx)
{
	((__global ulong16 *)V)[idx << 1] = ((ulong16 *)X)[0];
	((__global ulong16 *)V)[(idx << 1) + 1] = ((ulong16 *)X)[1];
}

void ScratchpadMix(void *X, const __global void *V, uchar idx)
{
	((ulong16 *)X)[0] ^= ((__global ulong16 *)V)[idx << 1];
	((ulong16 *)X)[1] ^= ((__global ulong16 *)V)[(idx << 1) + 1];
}

void ScratchpadLoad(void *X, const __global void *V, uchar idx)
{
	((ulong16 *)X)[0] = ((__global ulong16 *)V)[idx << 1];
	((ulong16 *)X)[1] = ((__global ulong16 *)V)[(idx << 1) + 1];
}

void SMix(uint16 *X, __global uint16 *V, bool flag)
{
	// Sequential write phase: only every second state is stored, so the
	// scratchpad holds 64 entries instead of 128 (two block mixes per entry)
	#pragma unroll 1
	for(int i = 0; i < 64; ++i)
	{
		ScratchpadStore(V, X, i);
		neoscrypt_blkmix(X, flag);
		neoscrypt_blkmix(X, flag);
	}

	// Random read phase: odd-numbered entries are recomputed on the fly
	// from the preceding even-numbered one with a single extra block mix
	#pragma unroll 1
	for(int i = 0; i < 128; ++i)
	{
		uint16 tmp[4];
		const uint idx = convert_uchar(((uint *)X)[48] & 0x7F);

		ScratchpadLoad(tmp, V, idx >> 1);
		if(idx & 1) neoscrypt_blkmix(tmp, flag);

		((ulong16 *)X)[0] ^= ((ulong16 *)tmp)[0];
		((ulong16 *)X)[1] ^= ((ulong16 *)tmp)[1];

		neoscrypt_blkmix(X, flag);
	}
}

__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(__global const uchar *restrict input, __global uint *restrict output, __global uchar *padcache, const uint target)
{
#define CONSTANT_N 128
#define CONSTANT_r 2
	// X is CONSTANT_r * 2 * BLOCK_SIZE = 256 bytes; Z is a copy of X for the second mixing pass
	uint16 X[4], Z[4];

	/* A full V would be CONSTANT_N * CONSTANT_r * 2 * BLOCK_SIZE = 32 KiB;
	 * only every second entry is stored, so 16 KiB (0x4000) per thread */
	//__global ulong16 *V = (__global ulong16 *)(padcache + (0x8000 * (get_global_id(0) % MAX_GLOBAL_THREADS)));
	__global ulong16 *V = (__global ulong16 *)(padcache + (0x4000 * (get_global_id(0) % MAX_GLOBAL_THREADS)));
	uchar outbuf[32];
	uchar data[PASSWORD_LEN];

	// Assemble the 80-byte block header and insert the nonce
	((ulong8 *)data)[0] = ((__global const ulong8 *)input)[0];
	((ulong *)data)[8] = ((__global const ulong *)input)[8];
	((uint *)data)[18] = ((__global const uint *)input)[18];
	((uint *)data)[19] = get_global_id(0);

	// X = KDF(password, salt)
	fastkdf(data, data, PASSWORD_LEN, (uchar *)X, 256);

	// Mix one copy with Salsa and the other with ChaCha, XOR them,
	// then run the result through FastKDF again
	CopyBytes128(Z, X, 2);

	// X = SMix(X); swap X and Z, repeat with the other mixer
	for(bool flag = false;; ++flag)
	{
		SMix(X, V, flag);
		if(flag) break;
		SwapBytes128(X, Z, 256);
	}

	// blkxor(X, Z)
	((ulong16 *)X)[0] ^= ((ulong16 *)Z)[0];
	((ulong16 *)X)[1] ^= ((ulong16 *)Z)[1];

	// output = KDF(password, X)
	fastkdf(data, (uchar *)X, FASTKDF_BUFFER_SIZE, outbuf, 32);

	// Report the nonce if the last word of the hash meets the target
	if(((uint *)outbuf)[7] <= target)
		output[atomic_add(output + 0xFF, 1)] = get_global_id(0);
}
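/* Host-side interface notes (assumptions inferred from the kernel, not a
 * definitive specification): WORKSIZE, MAX_GLOBAL_THREADS and optionally
 * PASSWORD_LEN are expected as -D build options; padcache must provide
 * 0x4000 bytes of scratchpad per concurrent hash (MAX_GLOBAL_THREADS * 16 KiB
 * in total); output is a uint array in which slot 0xFF holds the result
 * counter and slots from 0 upward receive the global IDs (nonces) of hashes
 * that meet the target. */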