From 186b75d51c47743151dd65e054d0e78daea96202 Mon Sep 17 00:00:00 2001 From: elbandi Date: Thu, 11 Feb 2016 21:28:17 +0100 Subject: [PATCH] New neoscrypt kernel --- kernel/neoscrypt.cl | 1091 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 1031 insertions(+), 60 deletions(-) diff --git a/kernel/neoscrypt.cl b/kernel/neoscrypt.cl index 7939d7ed..9ffcad4b 100644 --- a/kernel/neoscrypt.cl +++ b/kernel/neoscrypt.cl @@ -1,9 +1,32 @@ -/* NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20 */ -/* Adapted and improved for 14.x drivers by Wolf9466 (Wolf`) */ +// NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20 +// By Wolf (Wolf0 aka Wolf9466) // Stupid AMD compiler ignores the unroll pragma in these two + +// Tahiti 3/2, +// Hawaii 4/4 + notneededswap +// Pitcairn 3/4 + notneededswap +#if defined(__Tahiti__) +#define SALSA_SMALL_UNROLL 4 +#define CHACHA_SMALL_UNROLL 2 +//#define SWAP 1 +//#define SHITMAIN 1 +//#define WIDE_STRIPE 1 +#elif defined(__Pitcairn__) + #define SALSA_SMALL_UNROLL 3 -#define CHACHA_SMALL_UNROLL 3 +#define CHACHA_SMALL_UNROLL 2 +//#define SWAP 1 +//#define SHITMAIN 1 +//#define WIDE_STRIPE 1 + +#else +#define SALSA_SMALL_UNROLL 4 +#define CHACHA_SMALL_UNROLL 4 +//#define SWAP 1 +//#define SHITMAIN 1 +//#define WIDE_STRIPE 1 +#endif // If SMALL_BLAKE2S is defined, BLAKE2S_UNROLL is interpreted // as the unroll factor; must divide cleanly into ten. @@ -96,6 +119,28 @@ static const __constant uchar BLAKE2S_SIGMA[10][16] = b = rotate(b ^ c, 25U); \ } while(0) +#define BLAKE_PARALLEL_G1(idx0, a, b, c, d, key) do { \ + a += b + (uint4)(key[BLAKE2S_SIGMA[idx0][0]], key[BLAKE2S_SIGMA[idx0][2]], key[BLAKE2S_SIGMA[idx0][4]], key[BLAKE2S_SIGMA[idx0][6]]); \ + d = rotate(d ^ a, 16U); \ + c += d; \ + b = rotate(b ^ c, 20U); \ + a += b + (uint4)(key[BLAKE2S_SIGMA[idx0][1]], key[BLAKE2S_SIGMA[idx0][3]], key[BLAKE2S_SIGMA[idx0][5]], key[BLAKE2S_SIGMA[idx0][7]]); \ + d = rotate(d ^ a, 24U); \ + c += d; \ + b = rotate(b ^ c, 25U); \ +} while(0) + +#define BLAKE_PARALLEL_G2(idx0, a, b, c, d, key) do { \ + a += b + (uint4)(key[BLAKE2S_SIGMA[idx0][8]], key[BLAKE2S_SIGMA[idx0][10]], key[BLAKE2S_SIGMA[idx0][12]], key[BLAKE2S_SIGMA[idx0][14]]); \ + d = rotate(d ^ a, 16U); \ + c += d; \ + b = rotate(b ^ c, 20U); \ + a += b + (uint4)(key[BLAKE2S_SIGMA[idx0][9]], key[BLAKE2S_SIGMA[idx0][11]], key[BLAKE2S_SIGMA[idx0][13]], key[BLAKE2S_SIGMA[idx0][15]]); \ + d = rotate(d ^ a, 24U); \ + c += d; \ + b = rotate(b ^ c, 25U); \ +} while(0) + void Blake2S(uint *restrict inout, const uint *restrict inkey) { uint16 V; @@ -122,14 +167,17 @@ void Blake2S(uint *restrict inout, const uint *restrict inkey) #endif for(int x = 0; x < 10; ++x) { - BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inkey); + /*BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inkey); BLAKE_G(x, 0x02, V.s1, V.s5, V.s9, V.sd, inkey); BLAKE_G(x, 0x04, V.s2, V.s6, V.sa, V.se, inkey); BLAKE_G(x, 0x06, V.s3, V.s7, V.sb, V.sf, inkey); BLAKE_G(x, 0x08, V.s0, V.s5, V.sa, V.sf, inkey); BLAKE_G(x, 0x0A, V.s1, V.s6, V.sb, V.sc, inkey); BLAKE_G(x, 0x0C, V.s2, V.s7, V.s8, V.sd, inkey); - BLAKE_G(x, 0x0E, V.s3, V.s4, V.s9, V.se, inkey); + BLAKE_G(x, 0x0E, V.s3, V.s4, V.s9, V.se, inkey);*/ + + BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inkey); + BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inkey); } // XOR low part of state with the high part, @@ -156,14 +204,17 @@ void Blake2S(uint *restrict inout, const uint *restrict inkey) #endif for(int x = 0; x < 10; ++x) { - BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inout); + /*BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inout); BLAKE_G(x, 0x02, V.s1, V.s5, V.s9, V.sd, inout); BLAKE_G(x, 0x04, V.s2, V.s6, V.sa, V.se, inout); BLAKE_G(x, 0x06, V.s3, V.s7, V.sb, V.sf, inout); BLAKE_G(x, 0x08, V.s0, V.s5, V.sa, V.sf, inout); BLAKE_G(x, 0x0A, V.s1, V.s6, V.sb, V.sc, inout); BLAKE_G(x, 0x0C, V.s2, V.s7, V.s8, V.sd, inout); - BLAKE_G(x, 0x0E, V.s3, V.s4, V.s9, V.se, inout); + BLAKE_G(x, 0x0E, V.s3, V.s4, V.s9, V.se, inout);*/ + + BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inout); + BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inout); } // XOR low part of state with high part, then with input block @@ -227,15 +278,73 @@ void fastkdf(const uchar *restrict password, const uchar *restrict salt, const u { // Make the key buffer twice the size of the key so it fits a Blake2S block // This way, we don't need a temp buffer in the Blake2S function. - uchar input[BLAKE2S_BLOCK_SIZE], key[BLAKE2S_BLOCK_SIZE] = { 0 }; + uchar input[BLAKE2S_BLOCK_SIZE] __attribute__((aligned)), key[BLAKE2S_BLOCK_SIZE] __attribute__((aligned)) = { 0 }; // Copy input and key to their buffers CopyBytes(input, A + bufidx, BLAKE2S_BLOCK_SIZE); CopyBytes(key, B + bufidx, BLAKE2S_KEY_SIZE); // PRF - Blake2S((uint *)input, (uint *)key); + //Blake2S((uint *)input, (uint *)key); + + uint *inkey = (uint *)key, *inout = (uint *)input; + + // PRF + uint16 V; + uint8 tmpblock; + + // Load first block (IV into V.lo) and constants (IV into V.hi) + V.lo = V.hi = vload8(0U, BLAKE2S_IV); + + // XOR with initial constant + V.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + // XOR length of message so far (including this block) + // There are two uints for this field, but high uint is zero + V.sc ^= BLAKE2S_BLOCK_SIZE; + + // Compress state, using the key as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inkey); + BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inkey); + } + + // XOR low part of state with the high part, + // then with the original input block. + V.lo ^= V.hi ^ tmpblock; + // Load constants (IV into V.hi) + V.hi = vload8(0U, BLAKE2S_IV); + + // Copy input block for later + tmpblock = V.lo; + + // XOR length of message into block again + V.sc ^= BLAKE2S_BLOCK_SIZE << 1; + + // Last block compression - XOR final constant into state + V.se ^= 0xFFFFFFFFU; + + // Compress block, using the input as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inout); + BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inout); + } + + // XOR low part of state with high part, then with input block + V.lo ^= V.hi ^ tmpblock; + + // Store result in input/output buffer + vstore8(V.lo, 0, inout); + + // Calculate the next buffer pointer bufidx = 0; @@ -284,7 +393,475 @@ void fastkdf(const uchar *restrict password, const uchar *restrict salt, const u } } -#define SALSA_CORE(state) do { \ +/* FastKDF, a fast buffered key derivation function: + * FASTKDF_BUFFER_SIZE must be a power of 2; + * password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE; + * prf_output_size must be <= prf_key_size; */ +void fastkdf1(const uchar password[80], uchar output[256]) +{ + + /* WARNING! + * This algorithm uses byte-wise addressing for memory blocks. + * Or in other words, trying to copy an unaligned memory region + * will significantly slow down the algorithm, when copying uses + * words or bigger entities. It even may corrupt the data, when + * the device does not support it properly. + * Therefore use byte copying, which will not the fastest but at + * least get reliable results. */ + + // BLOCK_SIZE 64U + // FASTKDF_BUFFER_SIZE 256U + // BLAKE2S_BLOCK_SIZE 64U + // BLAKE2S_KEY_SIZE 32U + // BLAKE2S_OUT_SIZE 32U + uchar bufidx = 0; + uint8 Abuffer[9], Bbuffer[9] = { (uint8)(0) }; + uchar *A = (uchar *)Abuffer, *B = (uchar *)Bbuffer; + + // Initialize the password buffer + #pragma unroll 1 + for(int i = 0; i < (FASTKDF_BUFFER_SIZE >> 3); ++i) ((ulong *)B)[i] = ((ulong *)A)[i] = ((ulong *)password)[i % 10]; + + ((uint16 *)(B + FASTKDF_BUFFER_SIZE))[0] = ((uint16 *)(A + FASTKDF_BUFFER_SIZE))[0] = ((uint16 *)password)[0]; + + // The primary iteration + #pragma unroll 1 + for(int i = 0; i < 32; ++i) + { + // Make the key buffer twice the size of the key so it fits a Blake2S block + // This way, we don't need a temp buffer in the Blake2S function. + uchar input[BLAKE2S_BLOCK_SIZE] __attribute__((aligned)), key[BLAKE2S_BLOCK_SIZE] __attribute__((aligned)) = { 0 }; + + // Copy input and key to their buffers + CopyBytes(input, A + bufidx, BLAKE2S_BLOCK_SIZE); + CopyBytes(key, B + bufidx, BLAKE2S_KEY_SIZE); + + uint *inkey = (uint *)key, *inout = (uint *)input; + + #ifndef __Hawaii__ + + // PRF + uint4 V[4]; + uint8 tmpblock; + + tmpblock = vload8(0U, BLAKE2S_IV); + + V[0] = V[2] = tmpblock.lo; + V[1] = V[3] = tmpblock.hi; + + V[0].s0 ^= 0x01012020U; + tmpblock.lo = V[0]; + + V[3].s0 ^= BLAKE2S_BLOCK_SIZE; + + // Compress state, using the key as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V[0], V[1], V[2], V[3], inkey); + BLAKE_PARALLEL_G2(x, V[0], V[1].s1230, V[2].s2301, V[3].s3012, inkey); + } + + V[0] ^= V[2] ^ tmpblock.lo; + V[1] ^= V[3] ^ tmpblock.hi; + + V[2] = vload4(0U, BLAKE2S_IV); + V[3] = vload4(1U, BLAKE2S_IV); + + tmpblock.lo = V[0]; + tmpblock.hi = V[1]; + + V[3].s0 ^= BLAKE2S_BLOCK_SIZE << 1; + V[3].s2 ^= 0xFFFFFFFFU; + + // Compress block, using the input as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V[0], V[1], V[2], V[3], inout); + BLAKE_PARALLEL_G2(x, V[0], V[1].s1230, V[2].s2301, V[3].s3012, inout); + } + + V[0] ^= V[2] ^ tmpblock.lo; + V[1] ^= V[3] ^ tmpblock.hi; + + vstore4(V[0], 0, inout); + vstore4(V[1], 1, inout); + + #else + + // PRF + uint16 V; + uint8 tmpblock; + + // Load first block (IV into V.lo) and constants (IV into V.hi) + V.lo = V.hi = vload8(0U, BLAKE2S_IV); + + // XOR with initial constant + V.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + // XOR length of message so far (including this block) + // There are two uints for this field, but high uint is zero + V.sc ^= BLAKE2S_BLOCK_SIZE; + + // Compress state, using the key as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inkey); + BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inkey); + } + + // XOR low part of state with the high part, + // then with the original input block. + V.lo ^= V.hi ^ tmpblock; + + // Load constants (IV into V.hi) + V.hi = vload8(0U, BLAKE2S_IV); + + // Copy input block for later + tmpblock = V.lo; + + // XOR length of message into block again + V.sc ^= BLAKE2S_BLOCK_SIZE << 1; + + // Last block compression - XOR final constant into state + V.se ^= 0xFFFFFFFFU; + + // Compress block, using the input as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inout); + BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inout); + } + + // XOR low part of state with high part, then with input block + V.lo ^= V.hi ^ tmpblock; + + // Store result in input/output buffer + vstore8(V.lo, 0, inout); + + #endif + + // Calculate the next buffer pointer + bufidx = 0; + + for(int x = 0; x < BLAKE2S_OUT_SIZE; ++x) + bufidx += input[x]; + + // bufidx a uchar now - always mod 255 + //bufidx &= (FASTKDF_BUFFER_SIZE - 1); + + // Modify the salt buffer + XORBytesInPlace(B + bufidx, input, BLAKE2S_OUT_SIZE); + + if(bufidx < BLAKE2S_KEY_SIZE) + { + // Head modified, tail updated + // this was made off the original code... wtf + //CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, min(BLAKE2S_OUT_SIZE, BLAKE2S_KEY_SIZE - bufidx)); + CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, BLAKE2S_KEY_SIZE - bufidx); + } + else if((FASTKDF_BUFFER_SIZE - bufidx) < BLAKE2S_OUT_SIZE) + { + // Tail modified, head updated + CopyBytes(B, B + FASTKDF_BUFFER_SIZE, BLAKE2S_OUT_SIZE - (FASTKDF_BUFFER_SIZE - bufidx)); + } + } + + // Modify and copy into the output buffer + + // Damned compiler crashes + // Fuck you, AMD + + //for(uint i = 0; i < output_len; ++i, ++bufidx) + // output[i] = B[bufidx] ^ A[i]; + + uint left = FASTKDF_BUFFER_SIZE - bufidx; + //uint left = (~bufidx) + 1 + + if(left < 256) + { + XORBytes(output, B + bufidx, A, left); + XORBytes(output + left, B, A + left, 256 - left); + } + else + { + XORBytes(output, B + bufidx, A, 256); + } +} + +/* FastKDF, a fast buffered key derivation function: + * FASTKDF_BUFFER_SIZE must be a power of 2; + * password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE; + * prf_output_size must be <= prf_key_size; */ +void fastkdf2(const uchar password[80], const uchar salt[256], __global uint* restrict output, const uint target) +{ + + /* WARNING! + * This algorithm uses byte-wise addressing for memory blocks. + * Or in other words, trying to copy an unaligned memory region + * will significantly slow down the algorithm, when copying uses + * words or bigger entities. It even may corrupt the data, when + * the device does not support it properly. + * Therefore use byte copying, which will not the fastest but at + * least get reliable results. */ + + // BLOCK_SIZE 64U + // FASTKDF_BUFFER_SIZE 256U + // BLAKE2S_BLOCK_SIZE 64U + // BLAKE2S_KEY_SIZE 32U + // BLAKE2S_OUT_SIZE 32U + // salt_len == 256, output_len == 32 + uchar bufidx = 0; + uint8 Abuffer[9], Bbuffer[9] = { (uint8)(0) }; + uchar *A = (uchar *)Abuffer, *B = (uchar *)Bbuffer; + //uchar A[256], B[256]; + + // Initialize the password buffer + #pragma unroll 1 + for(int i = 0; i < (FASTKDF_BUFFER_SIZE >> 3); ++i) ((ulong *)A)[i] = ((ulong *)password)[i % 10]; + + ((uint16 *)(A + FASTKDF_BUFFER_SIZE))[0] = ((uint16 *)password)[0]; + + // Initialize the salt buffer + ((ulong16 *)B)[0] = ((ulong16 *)B)[2] = ((ulong16 *)salt)[0]; + ((ulong16 *)B)[1] = ((ulong16 *)B)[3] = ((ulong16 *)salt)[1]; + + // The primary iteration + #pragma unroll 1 + for(int i = 0; i < 32; ++i) + { + // Make the key buffer twice the size of the key so it fits a Blake2S block + // This way, we don't need a temp buffer in the Blake2S function. + uchar input[BLAKE2S_BLOCK_SIZE] __attribute__((aligned)), key[BLAKE2S_BLOCK_SIZE] __attribute__((aligned)) = { 0 }; + + // Copy input and key to their buffers + CopyBytes(input, A + bufidx, BLAKE2S_BLOCK_SIZE); + CopyBytes(key, B + bufidx, BLAKE2S_KEY_SIZE); + + uint *inkey = (uint *)key, *inout = (uint *)input; + + #ifndef __Hawaii__ + + // PRF + uint4 V[4]; + uint8 tmpblock; + + tmpblock = vload8(0U, BLAKE2S_IV); + + V[0] = V[2] = tmpblock.lo; + V[1] = V[3] = tmpblock.hi; + + V[0].s0 ^= 0x01012020U; + tmpblock.lo = V[0]; + + V[3].s0 ^= BLAKE2S_BLOCK_SIZE; + + // Compress state, using the key as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V[0], V[1], V[2], V[3], inkey); + BLAKE_PARALLEL_G2(x, V[0], V[1].s1230, V[2].s2301, V[3].s3012, inkey); + } + + V[0] ^= V[2] ^ tmpblock.lo; + V[1] ^= V[3] ^ tmpblock.hi; + + V[2] = vload4(0U, BLAKE2S_IV); + V[3] = vload4(1U, BLAKE2S_IV); + + tmpblock.lo = V[0]; + tmpblock.hi = V[1]; + + V[3].s0 ^= BLAKE2S_BLOCK_SIZE << 1; + V[3].s2 ^= 0xFFFFFFFFU; + + // Compress block, using the input as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V[0], V[1], V[2], V[3], inout); + BLAKE_PARALLEL_G2(x, V[0], V[1].s1230, V[2].s2301, V[3].s3012, inout); + } + + V[0] ^= V[2] ^ tmpblock.lo; + V[1] ^= V[3] ^ tmpblock.hi; + + vstore4(V[0], 0, inout); + vstore4(V[1], 1, inout); + + #else + + // PRF + uint16 V; + uint8 tmpblock; + + // Load first block (IV into V.lo) and constants (IV into V.hi) + V.lo = V.hi = vload8(0U, BLAKE2S_IV); + + // XOR with initial constant + V.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + // XOR length of message so far (including this block) + // There are two uints for this field, but high uint is zero + V.sc ^= BLAKE2S_BLOCK_SIZE; + + // Compress state, using the key as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inkey); + BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inkey); + } + + // XOR low part of state with the high part, + // then with the original input block. + V.lo ^= V.hi ^ tmpblock; + + // Load constants (IV into V.hi) + V.hi = vload8(0U, BLAKE2S_IV); + + // Copy input block for later + tmpblock = V.lo; + + // XOR length of message into block again + V.sc ^= BLAKE2S_BLOCK_SIZE << 1; + + // Last block compression - XOR final constant into state + V.se ^= 0xFFFFFFFFU; + + // Compress block, using the input as the key + #pragma unroll + for(int x = 0; x < 10; ++x) + { + BLAKE_PARALLEL_G1(x, V.s0123, V.s4567, V.s89ab, V.scdef, inout); + BLAKE_PARALLEL_G2(x, V.s0123, V.s5674, V.sab89, V.sfcde, inout); + } + + // XOR low part of state with high part, then with input block + V.lo ^= V.hi ^ tmpblock; + + // Store result in input/output buffer + vstore8(V.lo, 0, inout); + #endif + + // Calculate the next buffer pointer + bufidx = 0; + + for(int x = 0; x < BLAKE2S_OUT_SIZE; ++x) + bufidx += input[x]; + + // bufidx a uchar now - always mod 255 + //bufidx &= (FASTKDF_BUFFER_SIZE - 1); + + // Modify the salt buffer + XORBytesInPlace(B + bufidx, input, BLAKE2S_OUT_SIZE); + + if(bufidx < BLAKE2S_KEY_SIZE) + { + // Head modified, tail updated + // this was made off the original code... wtf + //CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, min(BLAKE2S_OUT_SIZE, BLAKE2S_KEY_SIZE - bufidx)); + CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, BLAKE2S_KEY_SIZE - bufidx); + } + else if((FASTKDF_BUFFER_SIZE - bufidx) < BLAKE2S_OUT_SIZE) + { + // Tail modified, head updated + CopyBytes(B, B + FASTKDF_BUFFER_SIZE, BLAKE2S_OUT_SIZE - (FASTKDF_BUFFER_SIZE - bufidx)); + } + } + + // Modify and copy into the output buffer + + // Damned compiler crashes + // Fuck you, AMD + + uchar outbuf[32]; + + for(uint i = 0; i < 32; ++i, ++bufidx) + outbuf[i] = B[bufidx] ^ A[i]; + + /*uint left = FASTKDF_BUFFER_SIZE - bufidx; + //uint left = (~bufidx) + 1 + uchar outbuf[32]; + + if(left < 32) + { + XORBytes(outbuf, B + bufidx, A, left); + XORBytes(outbuf + left, B, A + left, 32 - left); + } + else + { + XORBytes(outbuf, B + bufidx, A, 32); + }*/ + + if(((uint *)outbuf)[7] <= target) output[atomic_add(output + 0xFF, 1)] = get_global_id(0); + +} + +/* + s0 s1 s2 s3 + s4 s5 s6 s7 + s8 s9 sa sb + sc sd se sf +shittify: +s0=s4 +s1=s9 +s2=se +s3=s3 +s4=s8 +s5=sd +s6=s2 +s7=s7 +s8=sc +s9=s1 +sa=s6 +sb=sb +sc=s0 +sd=s5 +se=sa +sf=sf +unshittify: +s0=sc +s1=s9 +s2=s6 +s3=s3 +s4=s0 +s5=sd +s6=sa +s7=s7 +s8=s4 +s9=s1 +sa=se +sb=sb +sc=s8 +sd=s5 +se=s2 +sf=sf + +*/ + +#define SALSA_CORE(state) do { \ + state[0] ^= rotate(state[3] + state[2], 7U); \ + state[1] ^= rotate(state[0] + state[3], 9U); \ + state[2] ^= rotate(state[1] + state[0], 13U); \ + state[3] ^= rotate(state[2] + state[1], 18U); \ + state[2] ^= rotate(state[3].wxyz + state[0].zwxy, 7U); \ + state[1] ^= rotate(state[2].wxyz + state[3].zwxy, 9U); \ + state[0] ^= rotate(state[1].wxyz + state[2].zwxy, 13U); \ + state[3] ^= rotate(state[0].wxyz + state[1].zwxy, 18U); \ +} while(0) + +#define SALSA_CORE_SCALAR(state) do { \ state.s4 ^= rotate(state.s0 + state.sc, 7U); state.s8 ^= rotate(state.s4 + state.s0, 9U); state.sc ^= rotate(state.s8 + state.s4, 13U); state.s0 ^= rotate(state.sc + state.s8, 18U); \ state.s9 ^= rotate(state.s5 + state.s1, 7U); state.sd ^= rotate(state.s9 + state.s5, 9U); state.s1 ^= rotate(state.sd + state.s9, 13U); state.s5 ^= rotate(state.s1 + state.sd, 18U); \ state.se ^= rotate(state.sa + state.s6, 7U); state.s2 ^= rotate(state.se + state.sa, 9U); state.s6 ^= rotate(state.s2 + state.se, 13U); state.sa ^= rotate(state.s6 + state.s2, 18U); \ @@ -295,10 +872,18 @@ void fastkdf(const uchar *restrict password, const uchar *restrict salt, const u state.sc ^= rotate(state.sf + state.se, 7U); state.sd ^= rotate(state.sc + state.sf, 9U); state.se ^= rotate(state.sd + state.sc, 13U); state.sf ^= rotate(state.se + state.sd, 18U); \ } while(0) -uint16 salsa_small_scalar_rnd(uint16 X) +uint16 salsa_small_parallel_rnd(uint16 X) { - uint16 st = X; - +#ifndef SHITMAIN + uint4 st[4] = { (uint4)(X.s4, X.s9, X.se, X.s3), + (uint4)(X.s8, X.sd, X.s2, X.s7), + (uint4)(X.sc, X.s1, X.s6, X.sb), + (uint4)(X.s0, X.s5, X.sa, X.sf) }; +#else + uint4 st[4]; + ((uint16 *)st)[0] = X; +#endif + #if SALSA_SMALL_UNROLL == 1 for(int i = 0; i < 10; ++i) @@ -335,7 +920,7 @@ uint16 salsa_small_scalar_rnd(uint16 X) SALSA_CORE(st); } - #else + #elif SALSA_SMALL_UNROLL == 5 for(int i = 0; i < 2; ++i) { @@ -346,26 +931,114 @@ uint16 salsa_small_scalar_rnd(uint16 X) SALSA_CORE(st); } + #else + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + SALSA_CORE(st); + #endif +#ifndef SHITMAIN + return(X + (uint16)( + st[3].x, st[2].y, st[1].z, st[0].w, + st[0].x, st[3].y, st[2].z, st[1].w, + st[1].x, st[0].y, st[3].z, st[2].w, + st[2].x, st[1].y, st[0].z, st[3].w)); +#else + return(X + ((uint16 *)st)[0]); +#endif +} + +uint16 salsa_small_scalar_rnd(uint16 X) +{ + uint16 st = X; + + #if SALSA_SMALL_UNROLL == 1 + + for(int i = 0; i < 10; ++i) + { + SALSA_CORE_SCALAR(st); + } + + #elif SALSA_SMALL_UNROLL == 2 + + for(int i = 0; i < 5; ++i) + { + SALSA_CORE_SCALAR(st); + SALSA_CORE_SCALAR(st); + } + + #elif SALSA_SMALL_UNROLL == 3 + + for(int i = 0; i < 4; ++i) + { + SALSA_CORE_SCALAR(st); + if(i == 3) break; + SALSA_CORE_SCALAR(st); + SALSA_CORE_SCALAR(st); + } + + #elif SALSA_SMALL_UNROLL == 4 + + for(int i = 0; i < 3; ++i) + { + SALSA_CORE_SCALAR(st); + SALSA_CORE_SCALAR(st); + if(i == 2) break; + SALSA_CORE_SCALAR(st); + SALSA_CORE_SCALAR(st); + } + + #else + + for(int i = 0; i < 2; ++i) + { + SALSA_CORE_SCALAR(st); + SALSA_CORE_SCALAR(st); + SALSA_CORE_SCALAR(st); + SALSA_CORE_SCALAR(st); + SALSA_CORE_SCALAR(st); + } + + #endif + return(X + st); } + #define CHACHA_CORE_PARALLEL(state) do { \ - state[0] += state[1]; state[3] = rotate(state[3] ^ state[0], (uint4)(16U, 16U, 16U, 16U)); \ - state[2] += state[3]; state[1] = rotate(state[1] ^ state[2], (uint4)(12U, 12U, 12U, 12U)); \ - state[0] += state[1]; state[3] = rotate(state[3] ^ state[0], (uint4)(8U, 8U, 8U, 8U)); \ - state[2] += state[3]; state[1] = rotate(state[1] ^ state[2], (uint4)(7U, 7U, 7U, 7U)); \ + state[0] += state[1]; state[3] = rotate(state[3] ^ state[0], 16U); \ + state[2] += state[3]; state[1] = rotate(state[1] ^ state[2], 12U); \ + state[0] += state[1]; state[3] = rotate(state[3] ^ state[0], 8U); \ + state[2] += state[3]; state[1] = rotate(state[1] ^ state[2], 7U); \ \ - state[0] += state[1].yzwx; state[3].wxyz = rotate(state[3].wxyz ^ state[0], (uint4)(16U, 16U, 16U, 16U)); \ - state[2].zwxy += state[3].wxyz; state[1].yzwx = rotate(state[1].yzwx ^ state[2].zwxy, (uint4)(12U, 12U, 12U, 12U)); \ - state[0] += state[1].yzwx; state[3].wxyz = rotate(state[3].wxyz ^ state[0], (uint4)(8U, 8U, 8U, 8U)); \ - state[2].zwxy += state[3].wxyz; state[1].yzwx = rotate(state[1].yzwx ^ state[2].zwxy, (uint4)(7U, 7U, 7U, 7U)); \ + state[0] += state[1].yzwx; state[3].wxyz = rotate(state[3].wxyz ^ state[0], 16); \ + state[2].zwxy += state[3].wxyz; state[1].yzwx = rotate(state[1].yzwx ^ state[2].zwxy, 12U); \ + state[0] += state[1].yzwx; state[3].wxyz = rotate(state[3].wxyz ^ state[0], 8U); \ + state[2].zwxy += state[3].wxyz; state[1].yzwx = rotate(state[1].yzwx ^ state[2].zwxy, 7U); \ +} while(0) + +#define CHACHA_CORE(state) do { \ + state.s0 += state.s4; state.sc = as_uint(as_ushort2(state.sc ^ state.s0).s10); state.s8 += state.sc; state.s4 = rotate(state.s4 ^ state.s8, 12U); state.s0 += state.s4; state.sc = rotate(state.sc ^ state.s0, 8U); state.s8 += state.sc; state.s4 = rotate(state.s4 ^ state.s8, 7U); \ + state.s1 += state.s5; state.sd = as_uint(as_ushort2(state.sd ^ state.s1).s10); state.s9 += state.sd; state.s5 = rotate(state.s5 ^ state.s9, 12U); state.s1 += state.s5; state.sd = rotate(state.sd ^ state.s1, 8U); state.s9 += state.sd; state.s5 = rotate(state.s5 ^ state.s9, 7U); \ + state.s2 += state.s6; state.se = as_uint(as_ushort2(state.se ^ state.s2).s10); state.sa += state.se; state.s6 = rotate(state.s6 ^ state.sa, 12U); state.s2 += state.s6; state.se = rotate(state.se ^ state.s2, 8U); state.sa += state.se; state.s6 = rotate(state.s6 ^ state.sa, 7U); \ + state.s3 += state.s7; state.sf = as_uint(as_ushort2(state.sf ^ state.s3).s10); state.sb += state.sf; state.s7 = rotate(state.s7 ^ state.sb, 12U); state.s3 += state.s7; state.sf = rotate(state.sf ^ state.s3, 8U); state.sb += state.sf; state.s7 = rotate(state.s7 ^ state.sb, 7U); \ + state.s0 += state.s5; state.sf = as_uint(as_ushort2(state.sf ^ state.s0).s10); state.sa += state.sf; state.s5 = rotate(state.s5 ^ state.sa, 12U); state.s0 += state.s5; state.sf = rotate(state.sf ^ state.s0, 8U); state.sa += state.sf; state.s5 = rotate(state.s5 ^ state.sa, 7U); \ + state.s1 += state.s6; state.sc = as_uint(as_ushort2(state.sc ^ state.s1).s10); state.sb += state.sc; state.s6 = rotate(state.s6 ^ state.sb, 12U); state.s1 += state.s6; state.sc = rotate(state.sc ^ state.s1, 8U); state.sb += state.sc; state.s6 = rotate(state.s6 ^ state.sb, 7U); \ + state.s2 += state.s7; state.sd = as_uint(as_ushort2(state.sd ^ state.s2).s10); state.s8 += state.sd; state.s7 = rotate(state.s7 ^ state.s8, 12U); state.s2 += state.s7; state.sd = rotate(state.sd ^ state.s2, 8U); state.s8 += state.sd; state.s7 = rotate(state.s7 ^ state.s8, 7U); \ + state.s3 += state.s4; state.se = as_uint(as_ushort2(state.se ^ state.s3).s10); state.s9 += state.se; state.s4 = rotate(state.s4 ^ state.s9, 12U); state.s3 += state.s4; state.se = rotate(state.se ^ state.s3, 8U); state.s9 += state.se; state.s4 = rotate(state.s4 ^ state.s9, 7U); \ } while(0) uint16 chacha_small_parallel_rnd(uint16 X) { - uint4 t, st[4]; + uint4 st[4]; ((uint16 *)st)[0] = X; @@ -405,7 +1078,7 @@ uint16 chacha_small_parallel_rnd(uint16 X) CHACHA_CORE_PARALLEL(st); } - #else + #elif CHACHA_SMALL_UNROLL == 5 for(int i = 0; i < 2; ++i) { @@ -415,15 +1088,95 @@ uint16 chacha_small_parallel_rnd(uint16 X) CHACHA_CORE_PARALLEL(st); CHACHA_CORE_PARALLEL(st); } + #else + + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); + CHACHA_CORE_PARALLEL(st); #endif return(X + ((uint16 *)st)[0]); } -void neoscrypt_blkmix(uint16 *XV, bool alg) -{ +uint16 chacha_small_scalar_rnd(uint16 X) +{ + uint16 st = X; + + #if CHACHA_SMALL_UNROLL == 1 + + for(int i = 0; i < 10; ++i) + { + CHACHA_CORE(st); + } + + #elif CHACHA_SMALL_UNROLL == 2 + + for(int i = 0; i < 5; ++i) + { + CHACHA_CORE(st); + CHACHA_CORE(st); + } + + #elif CHACHA_SMALL_UNROLL == 3 + + for(int i = 0; i < 4; ++i) + { + CHACHA_CORE(st); + if(i == 3) break; + CHACHA_CORE(st); + CHACHA_CORE(st); + } + + #elif CHACHA_SMALL_UNROLL == 4 + + for(int i = 0; i < 3; ++i) + { + CHACHA_CORE(st); + CHACHA_CORE(st); + if(i == 2) break; + CHACHA_CORE(st); + CHACHA_CORE(st); + } + + #elif CHACHA_SMALL_UNROLL == 5 + + for(int i = 0; i < 2; ++i) + { + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + } + + #else + + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + CHACHA_CORE(st); + + #endif + + return(X + st); +} +void neoscrypt_blkmix_salsa(uint16 XV[4]) +{ /* NeoScrypt flow: Scrypt flow: Xa ^= Xd; M(Xa'); Ya = Xa"; Xa ^= Xb; M(Xa'); Ya = Xa"; Xb ^= Xa"; M(Xb'); Yb = Xb"; Xb ^= Xa"; M(Xb'); Yb = Xb"; @@ -431,48 +1184,135 @@ void neoscrypt_blkmix(uint16 *XV, bool alg) Xd ^= Xc"; M(Xd'); Yd = Xd"; Xb" = Yb; Xa" = Ya; Xb" = Yc; Xc" = Yb; Xd" = Yd; */ - +#if 0 + for(int i = 0; i < 4; ++i) XV[i] = (uint16)( + XV[i].s4, XV[i].s9, XV[i].se, XV[i].s3, XV[i].s8, XV[i].sd, XV[i].s2, XV[i].s7, + XV[i].sc, XV[i].s1, XV[i].s6, XV[i].sb, XV[i].s0, XV[i].s5, XV[i].sa, XV[i].sf); +#endif XV[0] ^= XV[3]; - if(!alg) - { - XV[0] = salsa_small_scalar_rnd(XV[0]); XV[1] ^= XV[0]; - XV[1] = salsa_small_scalar_rnd(XV[1]); XV[2] ^= XV[1]; - XV[2] = salsa_small_scalar_rnd(XV[2]); XV[3] ^= XV[2]; - XV[3] = salsa_small_scalar_rnd(XV[3]); - } - else - { - XV[0] = chacha_small_parallel_rnd(XV[0]); XV[1] ^= XV[0]; - XV[1] = chacha_small_parallel_rnd(XV[1]); XV[2] ^= XV[1]; - XV[2] = chacha_small_parallel_rnd(XV[2]); XV[3] ^= XV[2]; - XV[3] = chacha_small_parallel_rnd(XV[3]); - } + XV[0] = salsa_small_parallel_rnd(XV[0]); XV[1] ^= XV[0]; + XV[1] = salsa_small_parallel_rnd(XV[1]); XV[2] ^= XV[1]; + XV[2] = salsa_small_parallel_rnd(XV[2]); XV[3] ^= XV[2]; + XV[3] = salsa_small_parallel_rnd(XV[3]); + + //XV[0] = salsa_small_scalar_rnd(XV[0]); XV[1] ^= XV[0]; + //XV[1] = salsa_small_scalar_rnd(XV[1]); XV[2] ^= XV[1]; + //XV[2] = salsa_small_scalar_rnd(XV[2]); XV[3] ^= XV[2]; + //XV[3] = salsa_small_scalar_rnd(XV[3]); + + XV[1] ^= XV[2]; + XV[2] ^= XV[1]; + XV[1] ^= XV[2]; +#if 0 + XV[0] = (uint16)(XV[0].sc, XV[0].s9, XV[0].s6, XV[0].s3, XV[0].s0, XV[0].sd, XV[0].sa, XV[0].s7, XV[0].s4, XV[0].s1, XV[0].se, XV[0].sb, XV[0].s8, XV[0].s5, XV[0].s2, XV[0].sf); + XV[1] = (uint16)(XV[1].sc, XV[1].s9, XV[1].s6, XV[1].s3, XV[1].s0, XV[1].sd, XV[1].sa, XV[1].s7, XV[1].s4, XV[1].s1, XV[1].se, XV[1].sb, XV[1].s8, XV[1].s5, XV[1].s2, XV[1].sf); + XV[2] = (uint16)(XV[2].sc, XV[2].s9, XV[2].s6, XV[2].s3, XV[2].s0, XV[2].sd, XV[2].sa, XV[2].s7, XV[2].s4, XV[2].s1, XV[2].se, XV[2].sb, XV[2].s8, XV[2].s5, XV[2].s2, XV[2].sf); + XV[3] = (uint16)(XV[3].sc, XV[3].s9, XV[3].s6, XV[3].s3, XV[3].s0, XV[3].sd, XV[3].sa, XV[3].s7, XV[3].s4, XV[3].s1, XV[3].se, XV[3].sb, XV[3].s8, XV[3].s5, XV[3].s2, XV[3].sf); +#endif +} + +void neoscrypt_blkmix_chacha(uint16 XV[4]) +{ + + /* NeoScrypt flow: Scrypt flow: + Xa ^= Xd; M(Xa'); Ya = Xa"; Xa ^= Xb; M(Xa'); Ya = Xa"; + Xb ^= Xa"; M(Xb'); Yb = Xb"; Xb ^= Xa"; M(Xb'); Yb = Xb"; + Xc ^= Xb"; M(Xc'); Yc = Xc"; Xa" = Ya; + Xd ^= Xc"; M(Xd'); Yd = Xd"; Xb" = Yb; + Xa" = Ya; Xb" = Yc; + Xc" = Yb; Xd" = Yd; */ + XV[0] ^= XV[3]; + + #if 1 + + XV[0] = chacha_small_parallel_rnd(XV[0]); XV[1] ^= XV[0]; + XV[1] = chacha_small_parallel_rnd(XV[1]); XV[2] ^= XV[1]; + XV[2] = chacha_small_parallel_rnd(XV[2]); XV[3] ^= XV[2]; + XV[3] = chacha_small_parallel_rnd(XV[3]); + + #else + + XV[0] = chacha_small_scalar_rnd(XV[0]); XV[1] ^= XV[0]; + XV[1] = chacha_small_scalar_rnd(XV[1]); XV[2] ^= XV[1]; + XV[2] = chacha_small_scalar_rnd(XV[2]); XV[3] ^= XV[2]; + XV[3] = chacha_small_scalar_rnd(XV[3]); + + #endif + XV[1] ^= XV[2]; XV[2] ^= XV[1]; XV[1] ^= XV[2]; } +#ifdef WIDE_STRIPE + +void ScratchpadStore(__global void *V, void *X, uchar idx) +{ + ((__global ulong16 *)V)[mul24(idx << 1, (int)get_global_size(0))] = ((ulong16 *)X)[0]; + ((__global ulong16 *)V)[mul24((idx << 1), (int)get_global_size(0)) + 1] = ((ulong16 *)X)[1]; + //const uint idx2 = mul24(idx << 2, (int)get_global_size(0)); + //#pragma unroll + //for(int i = 0; i < 4; ++i) ((__global uint16 *)V)[idx2 + i] = ((uint16 *)X)[i]; +} + +void ScratchpadMix(void *X, const __global void *V, uchar idx) +{ + ((ulong16 *)X)[0] ^= ((__global ulong16 *)V)[mul24(idx << 1, (int)get_global_size(0))]; + ((ulong16 *)X)[1] ^= ((__global ulong16 *)V)[mul24((idx << 1), (int)get_global_size(0)) + 1]; +} + +#else + void ScratchpadStore(__global void *V, void *X, uchar idx) { - ((__global ulong16 *)V)[idx << 1] = ((ulong16 *)X)[0]; - ((__global ulong16 *)V)[(idx << 1) + 1] = ((ulong16 *)X)[1]; + ((__global ulong16 *)V)[mul24(idx << 1, (int)get_global_size(0))] = ((ulong16 *)X)[0]; + ((__global ulong16 *)V)[mul24((idx << 1) + 1, (int)get_global_size(0))] = ((ulong16 *)X)[1]; } void ScratchpadMix(void *X, const __global void *V, uchar idx) { - ((ulong16 *)X)[0] ^= ((__global ulong16 *)V)[idx << 1]; - ((ulong16 *)X)[1] ^= ((__global ulong16 *)V)[(idx << 1) + 1]; + ((ulong16 *)X)[0] ^= ((__global ulong16 *)V)[mul24(idx << 1, (int)get_global_size(0))]; + ((ulong16 *)X)[1] ^= ((__global ulong16 *)V)[mul24((idx << 1) + 1, (int)get_global_size(0))]; +} + +#endif + + + +#define SALSA_PERM (uint16)(4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11, 0, 5, 10, 15) +#define SALSA_INV_PERM (uint16)(12, 9, 6, 3, 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15) + +void SMix_Salsa(uint16 X[4], __global uint16 *V) +{ + #pragma unroll 1 + for(int i = 0; i < 128; ++i) + { + ScratchpadStore(V, X, i); + neoscrypt_blkmix_salsa(X); + } + + #pragma unroll 1 + for(int i = 0; i < 128; ++i) + { + #ifdef SHITMAIN + const uint idx = convert_uchar(((uint *)X)[60] & 0x7F); + #else + const uint idx = convert_uchar(((uint *)X)[48] & 0x7F); + #endif + ScratchpadMix(X, V, idx); + neoscrypt_blkmix_salsa(X); + } } -void SMix(uint16 *X, __global uint16 *V, bool flag) +void SMix_Chacha(uint16 X[4], __global uint16 *V) { #pragma unroll 1 for(int i = 0; i < 128; ++i) { ScratchpadStore(V, X, i); - neoscrypt_blkmix(X, flag); + neoscrypt_blkmix_chacha(X); } #pragma unroll 1 @@ -480,10 +1320,13 @@ void SMix(uint16 *X, __global uint16 *V, bool flag) { const uint idx = convert_uchar(((uint *)X)[48] & 0x7F); ScratchpadMix(X, V, idx); - neoscrypt_blkmix(X, flag); + neoscrypt_blkmix_chacha(X); } } +#define SALSA_PERM (uint16)(4, 9, 14, 3, 8, 13, 2, 7, 12, 1, 6, 11, 0, 5, 10, 15) +#define SALSA_INV_PERM (uint16)(12, 9, 6, 3, 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15) + __attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) __kernel void search(__global const uchar* restrict input, __global uint* restrict output, __global uchar *padcache, const uint target) { @@ -491,9 +1334,12 @@ __kernel void search(__global const uchar* restrict input, __global uint* restri #define CONSTANT_r 2 // X = CONSTANT_r * 2 * BLOCK_SIZE(64); Z is a copy of X for ChaCha uint16 X[4], Z[4]; - /* V = CONSTANT_N * CONSTANT_r * 2 * BLOCK_SIZE */ - __global ulong16 *V = (__global ulong16 *)(padcache + (0x8000 * (get_global_id(0) % MAX_GLOBAL_THREADS))); - uchar outbuf[32]; + #ifdef WIDE_STRIPE + __global ulong16 *V = ((__global ulong16 *)padcache) + ((get_global_id(0) % get_global_size(0)) << 1); + #else + __global ulong16 *V = ((__global ulong16 *)(padcache) + (get_global_id(0) % get_global_size(0))); + #endif + //uchar outbuf[32]; uchar data[PASSWORD_LEN]; ((ulong8 *)data)[0] = ((__global const ulong8 *)input)[0]; @@ -502,24 +1348,149 @@ __kernel void search(__global const uchar* restrict input, __global uint* restri ((uint *)data)[19] = get_global_id(0); // X = KDF(password, salt) - fastkdf(data, data, PASSWORD_LEN, (uchar *)X, 256); - + //fastkdf(data, data, PASSWORD_LEN, (uchar *)X, 256); + fastkdf1(data, (uchar *)X); + + #ifndef SHITMAIN // Process ChaCha 1st, Salsa 2nd and XOR them - run that through PBKDF2 CopyBytes128(Z, X, 2); - + #else + + #pragma unroll + for(int i = 0; i < 4; ++i) ((uint16 *)Z)[i] = shuffle(((uint16 *)X)[i], SALSA_PERM); + + #endif + // X = SMix(X); X & Z are swapped, repeat. - for(bool flag = false;; ++flag) + for(int i = 0;; ++i) { - SMix(X, V, flag); - if(flag) break; + #ifdef SWAP + if (i) SMix_Salsa(X,V); else SMix_Chacha(X,V); + if(i) break; SwapBytes128(X, Z, 256); + #else + if (i) SMix_Chacha(X,V); else SMix_Salsa(Z,V); + if(i) break; + #endif } + + #if defined(SWAP) && defined(SHITMAIN) + #pragma unroll + for(int i = 0; i < 4; ++i) ((uint16 *)Z)[i] ^= shuffle(((uint16 *)X)[i], SALSA_INV_PERM); + fastkdf2(data, (uchar *)Z, output, target); + #elif defined(SHITMAIN) + #pragma unroll + for(int i = 0; i < 4; ++i) ((uint16 *)X)[i] ^= shuffle(((uint16 *)Z)[i], SALSA_INV_PERM); + fastkdf2(data, (uchar *)X, output, target); + #else + // blkxor(X, Z) + ((ulong16 *)X)[0] ^= ((ulong16 *)Z)[0]; + ((ulong16 *)X)[1] ^= ((ulong16 *)Z)[1]; + + // output = KDF(password, X) + //fastkdf(data, (uchar *)X, FASTKDF_BUFFER_SIZE, outbuf, 32); + fastkdf2(data, (uchar *)X, output, target); + #endif +} + + +/* +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search(__global const uchar* restrict input, __global uint16 *XZOutput) +{ +#define CONSTANT_N 128 +#define CONSTANT_r 2 + // X = CONSTANT_r * 2 * BLOCK_SIZE(64); Z is a copy of X for ChaCha + uint16 X[4]; + XZOutput += (4 * 2 * get_global_id(0)); + + //uchar outbuf[32]; + uchar data[PASSWORD_LEN]; + + ((ulong8 *)data)[0] = ((__global const ulong8 *)input)[0]; + ((ulong *)data)[8] = ((__global const ulong *)input)[8]; + ((uint *)data)[18] = ((__global const uint *)input)[18]; + ((uint *)data)[19] = get_global_id(0); + // X = KDF(password, salt) + //fastkdf(data, data, PASSWORD_LEN, (uchar *)X, 256); + fastkdf1(data, (uchar *)X); + + for(int i = 0; i < 4; ++i) XZOutput[i] = X[i]; + for(int i = 0; i < 4; ++i) XZOutput[i + 4] = X[i]; + mem_fence(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search1(__global uint16 *XZOutput, __global uchar *padcache) +{ +#define CONSTANT_N 128 +#define CONSTANT_r 2 + // X = CONSTANT_r * 2 * BLOCK_SIZE(64); Z is a copy of X for ChaCha + uint16 X[4], Z[4]; + #ifdef WIDE_STRIPE + __global ulong16 *V = ((__global ulong16 *)padcache) + ((get_global_id(0) % get_global_size(0)) << 1); + #else + __global ulong16 *V = ((__global ulong16 *)(padcache) + (get_global_id(0) % get_global_size(0))); + #endif + //uchar outbuf[32]; + + XZOutput += (4 * 2 * get_global_id(0)); + + for(int i = 0; i < 4; ++i) X[i] = XZOutput[i]; + + SMix_Salsa(X,V); + + for(int i = 0; i < 4; ++i) XZOutput[i] = X[i]; + mem_fence(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search2(__global uint16 *XZOutput, __global uchar *padcache) +{ +#define CONSTANT_N 128 +#define CONSTANT_r 2 + // X = CONSTANT_r * 2 * BLOCK_SIZE(64); Z is a copy of X for ChaCha + uint16 X[4], Z[4]; + #ifdef WIDE_STRIPE + __global ulong16 *V = ((__global ulong16 *)padcache) + ((get_global_id(0) % get_global_size(0)) << 1); + #else + __global ulong16 *V = ((__global ulong16 *)(padcache) + (get_global_id(0) % get_global_size(0))); + #endif + //uchar outbuf[32]; + + XZOutput += (4 * 2 * get_global_id(0)); + + for(int i = 0; i < 4; ++i) X[i] = XZOutput[i + 4]; + + SMix_Chacha(X,V); + + for(int i = 0; i < 4; ++i) XZOutput[i + 4] = X[i]; + mem_fence(CLK_GLOBAL_MEM_FENCE); +} + +__attribute__((reqd_work_group_size(WORKSIZE, 1, 1))) +__kernel void search3(__global const uchar* restrict input, __global uint16 *XZOutput, __global uint* restrict output, const uint target) +{ + uint16 X[4], Z[4]; + uchar data[PASSWORD_LEN]; + + ((ulong8 *)data)[0] = ((__global const ulong8 *)input)[0]; + ((ulong *)data)[8] = ((__global const ulong *)input)[8]; + ((uint *)data)[18] = ((__global const uint *)input)[18]; + ((uint *)data)[19] = get_global_id(0); + + XZOutput += (4 * 2 * get_global_id(0)); + + for(int i = 0; i < 4; ++i) X[i] = XZOutput[i]; + for(int i = 0; i < 4; ++i) Z[i] = XZOutput[i + 4]; + // blkxor(X, Z) ((ulong16 *)X)[0] ^= ((ulong16 *)Z)[0]; ((ulong16 *)X)[1] ^= ((ulong16 *)Z)[1]; // output = KDF(password, X) - fastkdf(data, (uchar *)X, FASTKDF_BUFFER_SIZE, outbuf, 32); - if(((uint *)outbuf)[7] <= target) output[atomic_add(output + 0xFF, 1)] = get_global_id(0); -} \ No newline at end of file + //fastkdf(data, (uchar *)X, FASTKDF_BUFFER_SIZE, outbuf, 32); + fastkdf2(data, (uchar *)X, output, target); +} +*/ \ No newline at end of file