sgminer/kernel/neoscrypt.cl

/* NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20 */
/* Adapted and improved for 14.x drivers by Wolf9466 (Wolf`) */

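// The AMD compiler ignores "#pragma unroll" in the Salsa and ChaCha round
// functions, so their unroll factor is selected manually here (1-4 pick the
// matching loop shape; any other value runs five double-rounds per iteration).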
#define SALSA_SMALL_UNROLL 3
#define CHACHA_SMALL_UNROLL 3

// If SMALL_BLAKE2S is defined, BLAKE2S_UNROLL is interpreted
// as the unroll factor; must divide cleanly into ten.
// Usually a bad idea.
//#define SMALL_BLAKE2S
//#define BLAKE2S_UNROLL 5

// Size constants, in bytes. FASTKDF_BUFFER_SIZE is the length of the circular
// password/salt buffers used by fastkdf(); PASSWORD_LEN is the size of the
// per-work-item input block built in search() and may be overridden at build
// time (the code below still reads/writes it as exactly 80 bytes).
#define BLOCK_SIZE           64U
#define FASTKDF_BUFFER_SIZE 256U
#ifndef PASSWORD_LEN
#define PASSWORD_LEN         80U
#endif

// fastkdf() stores individual bytes at arbitrary offsets, which requires the
// byte-addressable-store extension; refuse to build without it.
#if !defined(cl_khr_byte_addressable_store)
#error "Device does not support unaligned stores"
#endif

// Exchanges the contents of A and B, one 128-byte (ulong16) chunk at a time,
// using the three-step XOR swap so no temporary vector is required.
//   len - byte count; only the whole 128-byte chunks (len >> 7) are swapped.
// A and B are restrict-qualified, i.e. they must not overlap.
void SwapBytes128(void *restrict A, void *restrict B, uint len)
{
	ulong16 *const lhs = (ulong16 *)A;
	ulong16 *const rhs = (ulong16 *)B;
	const int chunks = len >> 7;

	#pragma unroll 2
	for(int c = 0; c < chunks; ++c)
	{
		lhs[c] ^= rhs[c];
		rhs[c] ^= lhs[c];
		lhs[c] ^= rhs[c];
	}
}

// Copies len consecutive 128-byte (ulong16) chunks from src to dst.
// NOTE: len is a chunk count here (each chunk is 128 bytes), not a byte
// count as in SwapBytes128.
void CopyBytes128(void *restrict dst, const void *restrict src, uint len)
{
	ulong16 *const to = (ulong16 *)dst;
	const ulong16 *const from = (const ulong16 *)src;

	#pragma unroll 2
	for(uint chunk = 0; chunk < len; ++chunk)
		to[chunk] = from[chunk];
}

// Byte-wise copy of len bytes from src to dst; safe for any alignment.
// The regions must not overlap (both pointers are restrict-qualified).
void CopyBytes(void *restrict dst, const void *restrict src, uint len)
{
	uchar *const to = (uchar *)dst;
	const uchar *const from = (const uchar *)src;

	for(uint n = 0; n < len; ++n)
		to[n] = from[n];
}

// XORs len bytes of src into dst, byte by byte: dst[n] ^= src[n].
void XORBytesInPlace(void *restrict dst, const void *restrict src, uint len)
{
	uchar *const acc = (uchar *)dst;
	const uchar *const mask = (const uchar *)src;

	for(uint n = 0; n < len; ++n)
		acc[n] ^= mask[n];
}

// Writes the byte-wise XOR of src1 and src2 into dst: dst[n] = src1[n] ^ src2[n]
// for n in [0, len). The loop is deliberately kept rolled (unroll factor 1).
void XORBytes(void *restrict dst, const void *restrict src1, const void *restrict src2, uint len)
{
	uchar *const out = (uchar *)dst;
	const uchar *const lhs = (const uchar *)src1;
	const uchar *const rhs = (const uchar *)src2;

	#pragma unroll 1
	for(uint n = 0; n < len; ++n)
		out[n] = lhs[n] ^ rhs[n];
}

// Blake2S

// Blake2S geometry in bytes: size of one compressed block, size of the digest
// written by Blake2S(), and size of the key that fastkdf() feeds to it.
#define BLAKE2S_BLOCK_SIZE    64U
#define BLAKE2S_OUT_SIZE      32U
#define BLAKE2S_KEY_SIZE      32U

// BLAKE2s initialisation vector: eight 32-bit words, loaded by Blake2S() into
// both halves of its 16-word working state.
static const __constant uint BLAKE2S_IV[8] =
{
	0x6A09E667, 0xBB67AE85,
	0x3C6EF372, 0xA54FF53A,
	0x510E527F, 0x9B05688C,
	0x1F83D9AB, 0x5BE0CD19
};

// BLAKE2s message schedule: row r lists the order in which the 16 message
// words are consumed in round r (indexed by BLAKE_G as [round][position]).
static const __constant uchar BLAKE2S_SIGMA[10][16] =
{
	/* round 0 */ {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
	/* round 1 */ { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
	/* round 2 */ { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
	/* round 3 */ {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
	/* round 4 */ {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
	/* round 5 */ {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
	/* round 6 */ { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
	/* round 7 */ { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
	/* round 8 */ {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
	/* round 9 */ { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
};

// One BLAKE2s G mixing step on the four state words va, vb, vc, vd.
//   row      - round number, selects the row of BLAKE2S_SIGMA
//   col      - position in that row; the words at col and col + 1 are used
//   msg      - array of 16 message words indexed through BLAKE2S_SIGMA
// rotate() is a left rotation, so the amounts 16/20/24/25 are the BLAKE2s
// right rotations by 16/12/8/7 on 32-bit words.
#define BLAKE_G(row, col, va, vb, vc, vd, msg)	do { \
	(va) += (vb) + (msg)[BLAKE2S_SIGMA[row][col]]; \
	(vd) = rotate((vd) ^ (va), 16U); \
	(vc) += (vd); \
	(vb) = rotate((vb) ^ (vc), 20U); \
	(va) += (vb) + (msg)[BLAKE2S_SIGMA[row][(col) + 1]]; \
	(vd) = rotate((vd) ^ (va), 24U); \
	(vc) += (vd); \
	(vb) = rotate((vb) ^ (vc), 25U); \
} while(0)

// One full BLAKE2s round: four column G steps followed by four diagonal
// G steps over the 16-word state v, taking message words from m.
#define BLAKE2S_ROUND(r, v, m)	do { \
	BLAKE_G(r, 0x00, v.s0, v.s4, v.s8, v.sc, m); \
	BLAKE_G(r, 0x02, v.s1, v.s5, v.s9, v.sd, m); \
	BLAKE_G(r, 0x04, v.s2, v.s6, v.sa, v.se, m); \
	BLAKE_G(r, 0x06, v.s3, v.s7, v.sb, v.sf, m); \
	BLAKE_G(r, 0x08, v.s0, v.s5, v.sa, v.sf, m); \
	BLAKE_G(r, 0x0A, v.s1, v.s6, v.sb, v.sc, m); \
	BLAKE_G(r, 0x0C, v.s2, v.s7, v.s8, v.sd, m); \
	BLAKE_G(r, 0x0E, v.s3, v.s4, v.s9, v.se, m); \
} while(0)

// Keyed BLAKE2s specialised for exactly two compressions: the key block
// first, then one 64-byte message block, producing a 32-byte digest.
//   inout - in:  16 words (64 bytes) of message
//           out: the first 8 words are overwritten with the digest
//   inkey - 16 words compressed as the first block (fastkdf() passes the
//           32-byte key zero-padded to 64 bytes)
void Blake2S(uint *restrict inout, const uint *restrict inkey)
{
	uint16 state;
	uint8 chain;

	// Chaining value (state.lo) and working constants (state.hi) both start as the IV
	state.lo = state.hi = vload8(0U, BLAKE2S_IV);

	// Fold in the parameter word 0x01012020
	// (digest length 0x20, key length 0x20, fanout 1, depth 1)
	state.s0 ^= 0x01012020;

	// Remember the chaining value going into the first compression
	chain = state.lo;

	// Byte counter after the key block; the high counter word stays zero
	state.sc ^= BLAKE2S_BLOCK_SIZE;

	// First compression: message words come from the key block
	#ifdef SMALL_BLAKE2S
	#pragma unroll BLAKE2S_UNROLL
	#else
	#pragma unroll
	#endif
	for(int rnd = 0; rnd < 10; ++rnd)
	{
		BLAKE2S_ROUND(rnd, state, inkey);
	}

	// New chaining value = old chaining value ^ low half ^ high half
	state.lo ^= state.hi ^ chain;

	// Reset the working constants and save the chaining value again
	state.hi = vload8(0U, BLAKE2S_IV);
	chain = state.lo;

	// Byte counter is now two blocks
	state.sc ^= BLAKE2S_BLOCK_SIZE << 1;

	// Mark this compression as the last one
	state.se ^= 0xFFFFFFFFU;

	// Second compression: message words come from the input block
	#ifdef SMALL_BLAKE2S
	#pragma unroll BLAKE2S_UNROLL
	#else
	#pragma unroll
	#endif
	for(int rnd = 0; rnd < 10; ++rnd)
	{
		BLAKE2S_ROUND(rnd, state, inout);
	}

	// Final chaining value is the digest
	state.lo ^= state.hi ^ chain;

	// Overwrite the first 32 bytes of the input/output buffer with it
	vstore8(state.lo, 0, inout);
}

/* FastKDF, a fast buffered key derivation function built on keyed Blake2S.
 *
 *   password   - password bytes; exactly 80 bytes are read (the "% 10" ulong
 *                indexing below hard-codes that length)
 *   salt       - salt bytes
 *   salt_len   - FASTKDF_BUFFER_SIZE selects the 256-byte salt path; any other
 *                value is treated as an 80-byte salt
 *   output     - receives output_len derived bytes (written byte-wise)
 *   output_len - number of bytes to produce
 *
 * FASTKDF_BUFFER_SIZE must be a power of 2 (and, because bufidx is a uchar
 * that wraps instead of being masked, it must be exactly 256 here);
 * password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE;
 * prf_output_size must be <= prf_key_size.
 *
 * password and salt are read through ulong / uint8 / uint16 / ulong16 casts,
 * so the caller must pass buffers that are suitably aligned for those types. */
void fastkdf(const uchar *restrict password, const uchar *restrict salt, const uint salt_len, uchar *restrict output, uint output_len)
{

	/*                    WARNING!
	 * This algorithm uses byte-wise addressing for memory blocks.
	 * Or in other words, trying to copy an unaligned memory region
	 * will significantly slow down the algorithm, when copying uses
	 * words or bigger entities. It even may corrupt the data, when
	 * the device does not support it properly.
	 * Therefore use byte copying, which will not be the fastest but at
	 * least get reliable results. */

	// BLOCK_SIZE            64U
	// FASTKDF_BUFFER_SIZE  256U
	// BLAKE2S_BLOCK_SIZE    64U
	// BLAKE2S_KEY_SIZE      32U
	// BLAKE2S_OUT_SIZE      32U
	uchar bufidx = 0;

	// A is read in 64-byte windows starting anywhere in [0, 255], so it needs
	// FASTKDF_BUFFER_SIZE + BLAKE2S_BLOCK_SIZE = 320 bytes = 10 x uint8.
	// B is read/written in 32-byte windows, so it needs
	// FASTKDF_BUFFER_SIZE + BLAKE2S_KEY_SIZE = 288 bytes = 9 x uint8.
	// (A was previously only 288 bytes, which the tail store below overran.)
	uint8 Abuffer[10], Bbuffer[9] = { (uint8)(0) };
	uchar *A = (uchar *)Abuffer, *B = (uchar *)Bbuffer;

	// Initialize the password buffer: the 80-byte password repeated over the
	// first 256 bytes...
	#pragma unroll 1
	for(int i = 0; i < (FASTKDF_BUFFER_SIZE >> 3); ++i) ((ulong *)A)[i] = ((const ulong *)password)[i % 10];

	// ...and its first 64 bytes mirrored behind the end, so a window that
	// starts near the end can be read without wrapping.
	((uint16 *)(A + FASTKDF_BUFFER_SIZE))[0] = ((const uint16 *)password)[0];

	// Initialize the salt buffer
	if(salt_len == FASTKDF_BUFFER_SIZE)
	{
		// The salt fills the circular part exactly
		((ulong16 *)B)[0] = ((const ulong16 *)salt)[0];
		((ulong16 *)B)[1] = ((const ulong16 *)salt)[1];

		// Mirror only the first BLAKE2S_KEY_SIZE bytes into the tail; B has
		// no room for more (the old code stored 256 bytes here, running
		// 224 bytes past the end of the buffer).
		((uint8 *)(B + FASTKDF_BUFFER_SIZE))[0] = ((const uint8 *)salt)[0];
	}
	else
	{
		// salt_len is 80 bytes here
		#pragma unroll 1
		for(int i = 0; i < (FASTKDF_BUFFER_SIZE >> 3); ++i) ((ulong *)B)[i] = ((const ulong *)salt)[i % 10];

		// Mirror the first BLAKE2S_KEY_SIZE bytes into the tail (the old
		// code copied all 80 bytes, overrunning the 32-byte tail).
		#pragma unroll 1
		for(int i = 0; i < (BLAKE2S_KEY_SIZE >> 3); ++i) ((ulong *)(B + FASTKDF_BUFFER_SIZE))[i] = ((const ulong *)salt)[i];
	}

	// The primary iteration
	#pragma unroll 1
	for(int i = 0; i < 32; ++i)
	{
		// Make the key buffer twice the size of the key so it fits a Blake2S block
		// This way, we don't need a temp buffer in the Blake2S function.
		uchar input[BLAKE2S_BLOCK_SIZE], key[BLAKE2S_BLOCK_SIZE] = { 0 };

		// Copy input and key to their buffers
		CopyBytes(input, A + bufidx, BLAKE2S_BLOCK_SIZE);
		CopyBytes(key, B + bufidx, BLAKE2S_KEY_SIZE);

		// PRF: the digest replaces the first BLAKE2S_OUT_SIZE bytes of input
		Blake2S((uint *)input, (uint *)key);

		// Calculate the next buffer pointer: sum of the digest bytes
		bufidx = 0;

		for(int x = 0; x < BLAKE2S_OUT_SIZE; ++x)
			bufidx += input[x];

		// bufidx is a uchar, so the sum is already reduced modulo 256
		// (== FASTKDF_BUFFER_SIZE) and needs no explicit mask.
		//bufidx &= (FASTKDF_BUFFER_SIZE - 1);

		// Modify the salt buffer
		XORBytesInPlace(B + bufidx, input, BLAKE2S_OUT_SIZE);

		if(bufidx < BLAKE2S_KEY_SIZE)
		{
			// Head modified, tail updated: re-mirror the changed head bytes
			CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, BLAKE2S_KEY_SIZE - bufidx);
		}
		else if((FASTKDF_BUFFER_SIZE - bufidx) < BLAKE2S_OUT_SIZE)
		{
			// Tail modified, head updated: copy the spilled bytes back to the front
			CopyBytes(B, B + FASTKDF_BUFFER_SIZE, BLAKE2S_OUT_SIZE - (FASTKDF_BUFFER_SIZE - bufidx));
		}
	}

	// Modify and copy into the output buffer:
	// output[i] = B[(bufidx + i) mod 256] ^ A[i], done as at most two
	// contiguous runs (the straightforward single loop crashed the AMD
	// compiler).

	//for(uint i = 0; i < output_len; ++i, ++bufidx)
	//	output[i] = B[bufidx] ^ A[i];

	// Bytes available in B before the index wraps
	uint left = FASTKDF_BUFFER_SIZE - bufidx;

	if(left < output_len)
	{
		XORBytes(output, B + bufidx, A, left);
		XORBytes(output + left, B, A + left, output_len - left);
	}
	else
	{
		XORBytes(output, B + bufidx, A, output_len);
	}
}

// One Salsa20 quarter-round; updates b, c, d and finally a, in that order.
// rotate() is a left rotation.
#define SALSA_QUARTER(a, b, c, d) \
	b ^= rotate(a + d, 7U); \
	c ^= rotate(b + a, 9U); \
	d ^= rotate(c + b, 13U); \
	a ^= rotate(d + c, 18U)

// One Salsa20 double round on a uint16 state: four quarter-rounds down the
// columns followed by four along the rows.
#define SALSA_CORE(state)	do { \
	SALSA_QUARTER(state.s0, state.s4, state.s8, state.sc); \
	SALSA_QUARTER(state.s5, state.s9, state.sd, state.s1); \
	SALSA_QUARTER(state.sa, state.se, state.s2, state.s6); \
	SALSA_QUARTER(state.sf, state.s3, state.s7, state.sb); \
	SALSA_QUARTER(state.s0, state.s1, state.s2, state.s3); \
	SALSA_QUARTER(state.s5, state.s6, state.s7, state.s4); \
	SALSA_QUARTER(state.sa, state.sb, state.s8, state.s9); \
	SALSA_QUARTER(state.sf, state.sc, state.sd, state.se); \
} while(0)

// Applies ten Salsa20 double rounds (SALSA_CORE) to a copy of X and returns
// X plus the mixed copy (component-wise). SALSA_SMALL_UNROLL only chooses how
// those ten applications are grouped into loop iterations; every variant
// executes SALSA_CORE exactly ten times.
uint16 salsa_small_scalar_rnd(uint16 X)
{
	uint16 work = X;

	#if SALSA_SMALL_UNROLL == 1

	// 10 x 1
	for(int pass = 0; pass < 10; ++pass)
	{
		SALSA_CORE(work);
	}

	#elif SALSA_SMALL_UNROLL == 2

	// 5 x 2
	for(int pass = 0; pass < 5; ++pass)
	{
		SALSA_CORE(work);
		SALSA_CORE(work);
	}

	#elif SALSA_SMALL_UNROLL == 3

	// 3 x 3, plus a single application on the fourth (partial) pass
	for(int pass = 0; pass < 4; ++pass)
	{
		SALSA_CORE(work);
		if(pass == 3) break;
		SALSA_CORE(work);
		SALSA_CORE(work);
	}

	#elif SALSA_SMALL_UNROLL == 4

	// 2 x 4, plus two applications on the third (partial) pass
	for(int pass = 0; pass < 3; ++pass)
	{
		SALSA_CORE(work);
		SALSA_CORE(work);
		if(pass == 2) break;
		SALSA_CORE(work);
		SALSA_CORE(work);
	}

	#else

	// 2 x 5
	for(int pass = 0; pass < 2; ++pass)
	{
		SALSA_CORE(work);
		SALSA_CORE(work);
		SALSA_CORE(work);
		SALSA_CORE(work);
		SALSA_CORE(work);
	}

	#endif

	// Feed-forward: add the original input to the mixed state
	return(X + work);
}

// Four ChaCha quarter-rounds at once: a, b, c and d are uint4 lvalues whose
// lanes are processed in parallel. rotate() is a left rotation.
#define CHACHA_QUARTER_X4(a, b, c, d) \
	a += b; d = rotate(d ^ a, (uint4)(16U, 16U, 16U, 16U)); \
	c += d; b = rotate(b ^ c, (uint4)(12U, 12U, 12U, 12U)); \
	a += b; d = rotate(d ^ a, (uint4)(8U, 8U, 8U, 8U)); \
	c += d; b = rotate(b ^ c, (uint4)(7U, 7U, 7U, 7U))

// One ChaCha double round on a state held as four uint4 rows: first the
// column round on the rows as they are, then the diagonal round, where rows
// 1, 2 and 3 are addressed through lane-rotated swizzles instead of being
// physically shuffled.
#define CHACHA_CORE_PARALLEL(state)	do { \
	CHACHA_QUARTER_X4(state[0], state[1], state[2], state[3]); \
	\
	CHACHA_QUARTER_X4(state[0], state[1].yzwx, state[2].zwxy, state[3].wxyz); \
} while(0)

// Applies ten ChaCha double rounds (CHACHA_CORE_PARALLEL) to a copy of X held
// as four uint4 rows and returns X plus the mixed state (component-wise).
// CHACHA_SMALL_UNROLL only chooses how the ten applications are grouped into
// loop iterations; every variant executes the core exactly ten times.
uint16 chacha_small_parallel_rnd(uint16 X)
{
	uint4 st[4];

	// Split the 16 words into rows with swizzles rather than by punning the
	// uint4 array as a uint16: the array is only guaranteed uint4 alignment.
	st[0] = X.s0123;
	st[1] = X.s4567;
	st[2] = X.s89ab;
	st[3] = X.scdef;

	#if CHACHA_SMALL_UNROLL == 1

	for(int i = 0; i < 10; ++i)
	{
		CHACHA_CORE_PARALLEL(st);
	}

	#elif CHACHA_SMALL_UNROLL == 2

	for(int i = 0; i < 5; ++i)
	{
		CHACHA_CORE_PARALLEL(st);
		CHACHA_CORE_PARALLEL(st);
	}

	#elif CHACHA_SMALL_UNROLL == 3

	// 3 + 3 + 3 + 1 applications
	for(int i = 0; i < 4; ++i)
	{
		CHACHA_CORE_PARALLEL(st);
		if(i == 3) break;
		CHACHA_CORE_PARALLEL(st);
		CHACHA_CORE_PARALLEL(st);
	}

	#elif CHACHA_SMALL_UNROLL == 4

	// 4 + 4 + 2 applications
	for(int i = 0; i < 3; ++i)
	{
		CHACHA_CORE_PARALLEL(st);
		CHACHA_CORE_PARALLEL(st);
		if(i == 2) break;
		CHACHA_CORE_PARALLEL(st);
		CHACHA_CORE_PARALLEL(st);
	}

	#else

	for(int i = 0; i < 2; ++i)
	{
		CHACHA_CORE_PARALLEL(st);
		CHACHA_CORE_PARALLEL(st);
		CHACHA_CORE_PARALLEL(st);
		CHACHA_CORE_PARALLEL(st);
		CHACHA_CORE_PARALLEL(st);
	}

	#endif

	// Feed-forward: reassemble the rows and add the original input
	return(X + (uint16)(st[0], st[1], st[2], st[3]));
}

// NeoScrypt block mix over four 64-byte sub-blocks XV[0..3], in place.
//   alg - false: mix with Salsa20 (salsa_small_scalar_rnd)
//         true:  mix with ChaCha20 (chacha_small_parallel_rnd)
//
// Each sub-block is XORed with the previously mixed one (XV[0] with XV[3]),
// then mixed; finally XV[1] and XV[2] trade places:
//
//     Xa ^= Xd;  M(Xa'); Ya = Xa";
//     Xb ^= Xa"; M(Xb'); Yb = Xb";
//     Xc ^= Xb"; M(Xc'); Yc = Xc";
//     Xd ^= Xc"; M(Xd'); Yd = Xd";
//     Xa" = Ya; Xb" = Yc;
//     Xc" = Yb; Xd" = Yd;
void neoscrypt_blkmix(uint16 *XV, bool alg)
{
	XV[0] ^= XV[3];

	if(alg)
	{
		XV[0] = chacha_small_parallel_rnd(XV[0]);
		XV[1] ^= XV[0];
		XV[1] = chacha_small_parallel_rnd(XV[1]);
		XV[2] ^= XV[1];
		XV[2] = chacha_small_parallel_rnd(XV[2]);
		XV[3] ^= XV[2];
		XV[3] = chacha_small_parallel_rnd(XV[3]);
	}
	else
	{
		XV[0] = salsa_small_scalar_rnd(XV[0]);
		XV[1] ^= XV[0];
		XV[1] = salsa_small_scalar_rnd(XV[1]);
		XV[2] ^= XV[1];
		XV[2] = salsa_small_scalar_rnd(XV[2]);
		XV[3] ^= XV[2];
		XV[3] = salsa_small_scalar_rnd(XV[3]);
	}

	// Exchange the two middle sub-blocks
	const uint16 middle = XV[1];
	XV[1] = XV[2];
	XV[2] = middle;
}

// Writes the 256-byte block at X into scratchpad slot idx of V, as two
// 128-byte (ulong16) stores at elements 2*idx and 2*idx + 1.
void ScratchpadStore(__global void *V, void *X, uchar idx)
{
	__global ulong16 *const slot = ((__global ulong16 *)V) + (idx << 1);
	const ulong16 *const block = (const ulong16 *)X;

	slot[0] = block[0];
	slot[1] = block[1];
}

// XORs scratchpad slot idx of V (256 bytes, read as two ulong16 elements at
// 2*idx and 2*idx + 1) into the 256-byte block at X.
void ScratchpadMix(void *X, const __global void *V, uchar idx)
{
	const __global ulong16 *const slot = ((const __global ulong16 *)V) + (idx << 1);
	ulong16 *const block = (ulong16 *)X;

	block[0] ^= slot[0];
	block[1] ^= slot[1];
}

// Sequential memory-hard mix over the 256-byte block X using the 128-slot
// scratchpad V (256 bytes per slot).
//   flag - forwarded to neoscrypt_blkmix(): false = Salsa20, true = ChaCha20
void SMix(uint16 *X, __global uint16 *V, bool flag)
{
	// Fill phase: record the current block in slot n, then mix it
	#pragma unroll 1
	for(int n = 0; n < 128; ++n)
	{
		ScratchpadStore(V, X, n);
		neoscrypt_blkmix(X, flag);
	}

	// Read phase: the low 7 bits of the first word of the last sub-block
	// (word 48 of the 64) pick which slot is XORed in before each mix
	#pragma unroll 1
	for(int n = 0; n < 128; ++n)
	{
		const uint pick = convert_uchar(X[3].s0 & 0x7F);
		ScratchpadMix(X, V, pick);
		neoscrypt_blkmix(X, flag);
	}
}

// NeoScrypt search kernel. Each work-item builds an 80-byte block from the
// first 76 bytes of input plus its own global id (bytes 76..79), hashes it,
// and records its global id in output when the last 32-bit word of the
// 32-byte hash is <= target.
//   input    - at least 76 readable bytes, shared by all work-items
//   output   - result array; element 0xFF is an atomically incremented
//              counter whose previous value selects the slot to write
//   padcache - scratchpad memory, 0x8000 bytes per slot; a work-item uses
//              slot (global id % MAX_GLOBAL_THREADS)
//   target   - upper bound for the last hash word
// WORKSIZE and MAX_GLOBAL_THREADS are not defined in this file; presumably
// they are supplied as build options by the host - TODO confirm.
__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
__kernel void search(__global const uchar* restrict input, __global uint* restrict output, __global uchar *padcache, const uint target)
{
#define CONSTANT_N 128
#define CONSTANT_r 2
	// X = CONSTANT_r * 2 * BLOCK_SIZE(64); Z is a copy of X for the second pass
	uint16 X[4], Z[4];
	/* V = CONSTANT_N * CONSTANT_r * 2 * BLOCK_SIZE */
	// Typed as uint16 to match the parameter type of SMix()
	__global uint16 *V = (__global uint16 *)(padcache + (0x8000 * (get_global_id(0) % MAX_GLOBAL_THREADS)));
	// outbuf is read back as uint and data is accessed as ulong8 here and as
	// ulong / uint16 inside fastkdf(), so give both the alignment those
	// casts assume instead of relying on uchar-array placement.
	uchar outbuf[32] __attribute__((aligned(4)));
	uchar data[PASSWORD_LEN] __attribute__((aligned(64)));

	// Bytes 0..75 come from the input, bytes 76..79 are this work-item's id
	((ulong8 *)data)[0] = ((__global const ulong8 *)input)[0];
	((ulong *)data)[8] = ((__global const ulong *)input)[8];
	((uint *)data)[18] = ((__global const uint *)input)[18];
	((uint *)data)[19] = (uint)get_global_id(0);

	// X = KDF(password, salt), with the block serving as both
	fastkdf(data, data, PASSWORD_LEN, (uchar *)X, 256);

	// Keep an untouched copy of X (2 x 128 bytes) for the second pass
	CopyBytes128(Z, X, 2);

	// Pass 1: Salsa20-based SMix on X
	SMix(X, V, false);

	// Exchange X and Z: Z now holds the Salsa result, X the untouched copy
	SwapBytes128(X, Z, 256);

	// Pass 2: ChaCha20-based SMix on the copy, reusing the same scratchpad
	SMix(X, V, true);

	// blkxor(X, Z): combine the two results
	((ulong16 *)X)[0] ^= ((ulong16 *)Z)[0];
	((ulong16 *)X)[1] ^= ((ulong16 *)Z)[1];

	// output = KDF(password, X)
	fastkdf(data, (uchar *)X, FASTKDF_BUFFER_SIZE, outbuf, 32);
	if(((uint *)outbuf)[7] <= target) output[atomic_add(output + 0xFF, 1)] = get_global_id(0);
}
Added Neoscrypt with Wolf9466 improvements. 10 years ago			`/* NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20 */`
			/* Adapted and improved for 14.x drivers by Wolf9466 (Wolf`) */

			`// Stupid AMD compiler ignores the unroll pragma in these two`
			`#define SALSA_SMALL_UNROLL 3`
			`#define CHACHA_SMALL_UNROLL 3`

			`// If SMALL_BLAKE2S is defined, BLAKE2S_UNROLL is interpreted`
			`// as the unroll factor; must divide cleanly into ten.`
			`// Usually a bad idea.`
			`//#define SMALL_BLAKE2S`
			`//#define BLAKE2S_UNROLL 5`

			`#define BLOCK_SIZE 64U`
			`#define FASTKDF_BUFFER_SIZE 256U`
			`#ifndef PASSWORD_LEN`
			`#define PASSWORD_LEN 80U`
			`#endif`

			`#if !defined(cl_khr_byte_addressable_store)`
			`#error "Device does not support unaligned stores"`
			`#endif`

			`// Swaps 128 bytes at a time without using temp vars`
			`void SwapBytes128(void restrict A, void restrict B, uint len)`
			`{`
			`#pragma unroll 2`
			`for(int i = 0; i < (len >> 7); ++i)`
			`{`
			`((ulong16 )A)[i] ^= ((ulong16 )B)[i];`
			`((ulong16 )B)[i] ^= ((ulong16 )A)[i];`
			`((ulong16 )A)[i] ^= ((ulong16 )B)[i];`
			`}`
			`}`

			`void CopyBytes128(void restrict dst, const void restrict src, uint len)`
			`{`
			`#pragma unroll 2`
			`for(int i = 0; i < len; ++i)`
			`((ulong16 )dst)[i] = ((ulong16 )src)[i];`
			`}`

			`void CopyBytes(void restrict dst, const void restrict src, uint len)`
			`{`
			`for(int i = 0; i < len; ++i)`
			`((uchar )dst)[i] = ((uchar )src)[i];`
			`}`

			`void XORBytesInPlace(void restrict dst, const void restrict src, uint len)`
			`{`
			`for(int i = 0; i < len; ++i)`
			`((uchar )dst)[i] ^= ((uchar )src)[i];`
			`}`

			`void XORBytes(void restrict dst, const void restrict src1, const void *restrict src2, uint len)`
			`{`
			`#pragma unroll 1`
			`for(int i = 0; i < len; ++i)`
			`((uchar )dst)[i] = ((uchar )src1)[i] ^ ((uchar *)src2)[i];`
			`}`

			`// Blake2S`

			`#define BLAKE2S_BLOCK_SIZE 64U`
			`#define BLAKE2S_OUT_SIZE 32U`
			`#define BLAKE2S_KEY_SIZE 32U`

			`static const __constant uint BLAKE2S_IV[8] =`
			`{`
			`0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,`
			`0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19`
			`};`

			`static const __constant uchar BLAKE2S_SIGMA[10][16] =`
			`{`
			`{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,`
			`{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,`
			`{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,`
			`{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,`
			`{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,`
			`{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,`
			`{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,`
			`{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,`
			`{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,`
			`{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13 , 0 } ,`
			`};`

			`#define BLAKE_G(idx0, idx1, a, b, c, d, key) do { \`
			`a += b + key[BLAKE2S_SIGMA[idx0][idx1]]; \`
			`d = rotate(d ^ a, 16U); \`
			`c += d; \`
			`b = rotate(b ^ c, 20U); \`
			`a += b + key[BLAKE2S_SIGMA[idx0][idx1 + 1]]; \`
			`d = rotate(d ^ a, 24U); \`
			`c += d; \`
			`b = rotate(b ^ c, 25U); \`
			`} while(0)`

			`void Blake2S(uint restrict inout, const uint restrict inkey)`
			`{`
			`uint16 V;`
			`uint8 tmpblock;`

			`// Load first block (IV into V.lo) and constants (IV into V.hi)`
			`V.lo = V.hi = vload8(0U, BLAKE2S_IV);`

			`// XOR with initial constant`
			`V.s0 ^= 0x01012020;`

			`// Copy input block for later`
			`tmpblock = V.lo;`

			`// XOR length of message so far (including this block)`
			`// There are two uints for this field, but high uint is zero`
			`V.sc ^= BLAKE2S_BLOCK_SIZE;`

			`// Compress state, using the key as the key`
			`#ifdef SMALL_BLAKE2S`
			`#pragma unroll BLAKE2S_UNROLL`
			`#else`
			`#pragma unroll`
			`#endif`
			`for(int x = 0; x < 10; ++x)`
			`{`
			`BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inkey);`
			`BLAKE_G(x, 0x02, V.s1, V.s5, V.s9, V.sd, inkey);`
			`BLAKE_G(x, 0x04, V.s2, V.s6, V.sa, V.se, inkey);`
			`BLAKE_G(x, 0x06, V.s3, V.s7, V.sb, V.sf, inkey);`
			`BLAKE_G(x, 0x08, V.s0, V.s5, V.sa, V.sf, inkey);`
			`BLAKE_G(x, 0x0A, V.s1, V.s6, V.sb, V.sc, inkey);`
			`BLAKE_G(x, 0x0C, V.s2, V.s7, V.s8, V.sd, inkey);`
			`BLAKE_G(x, 0x0E, V.s3, V.s4, V.s9, V.se, inkey);`
			`}`

			`// XOR low part of state with the high part,`
			`// then with the original input block.`
			`V.lo ^= V.hi ^ tmpblock;`

			`// Load constants (IV into V.hi)`
			`V.hi = vload8(0U, BLAKE2S_IV);`

			`// Copy input block for later`
			`tmpblock = V.lo;`

			`// XOR length of message into block again`
			`V.sc ^= BLAKE2S_BLOCK_SIZE << 1;`

			`// Last block compression - XOR final constant into state`
			`V.se ^= 0xFFFFFFFFU;`

			`// Compress block, using the input as the key`
			`#ifdef SMALL_BLAKE2S`
			`#pragma unroll BLAKE2S_UNROLL`
			`#else`
			`#pragma unroll`
			`#endif`
			`for(int x = 0; x < 10; ++x)`
			`{`
			`BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inout);`
			`BLAKE_G(x, 0x02, V.s1, V.s5, V.s9, V.sd, inout);`
			`BLAKE_G(x, 0x04, V.s2, V.s6, V.sa, V.se, inout);`
			`BLAKE_G(x, 0x06, V.s3, V.s7, V.sb, V.sf, inout);`
			`BLAKE_G(x, 0x08, V.s0, V.s5, V.sa, V.sf, inout);`
			`BLAKE_G(x, 0x0A, V.s1, V.s6, V.sb, V.sc, inout);`
			`BLAKE_G(x, 0x0C, V.s2, V.s7, V.s8, V.sd, inout);`
			`BLAKE_G(x, 0x0E, V.s3, V.s4, V.s9, V.se, inout);`
			`}`

			`// XOR low part of state with high part, then with input block`
			`V.lo ^= V.hi ^ tmpblock;`

			`// Store result in input/output buffer`
			`vstore8(V.lo, 0, inout);`
			`}`

			`/* FastKDF, a fast buffered key derivation function:`
			`* FASTKDF_BUFFER_SIZE must be a power of 2;`
			`* password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE;`
			`* prf_output_size must be <= prf_key_size; */`
			`void fastkdf(const uchar restrict password, const uchar restrict salt, const uint salt_len, uchar *restrict output, uint output_len)`
			`{`

			`/* WARNING!`
			`* This algorithm uses byte-wise addressing for memory blocks.`
			`* Or in other words, trying to copy an unaligned memory region`
			`* will significantly slow down the algorithm, when copying uses`
			`* words or bigger entities. It even may corrupt the data, when`
			`* the device does not support it properly.`
			`* Therefore use byte copying, which will not the fastest but at`
			`* least get reliable results. */`

			`// BLOCK_SIZE 64U`
			`// FASTKDF_BUFFER_SIZE 256U`
			`// BLAKE2S_BLOCK_SIZE 64U`
			`// BLAKE2S_KEY_SIZE 32U`
			`// BLAKE2S_OUT_SIZE 32U`
			`uchar bufidx = 0;`
			`uint8 Abuffer[9], Bbuffer[9] = { (uint8)(0) };`
			`uchar A = (uchar )Abuffer, B = (uchar )Bbuffer;`

			`// Initialize the password buffer`
			`#pragma unroll 1`
			`for(int i = 0; i < (FASTKDF_BUFFER_SIZE >> 3); ++i) ((ulong )A)[i] = ((ulong )password)[i % 10];`

			`((uint16 )(A + FASTKDF_BUFFER_SIZE))[0] = ((uint16 )password)[0];`

			`// Initialize the salt buffer`
			`if(salt_len == FASTKDF_BUFFER_SIZE)`
			`{`
			`((ulong16 )B)[0] = ((ulong16 )B)[2] = ((ulong16 *)salt)[0];`
			`((ulong16 )B)[1] = ((ulong16 )B)[3] = ((ulong16 *)salt)[1];`
			`}`
			`else`
			`{`
			`// salt_len is 80 bytes here`
			`#pragma unroll 1`
			`for(int i = 0; i < (FASTKDF_BUFFER_SIZE >> 3); ++i) ((ulong )B)[i] = ((ulong )salt)[i % 10];`

			`// Initialized the rest to zero earlier`
			`#pragma unroll 1`
			`for(int i = 0; i < 10; ++i) ((ulong )(B + FASTKDF_BUFFER_SIZE))[i] = ((ulong )salt)[i];`
			`}`

			`// The primary iteration`
			`#pragma unroll 1`
			`for(int i = 0; i < 32; ++i)`
			`{`
			`// Make the key buffer twice the size of the key so it fits a Blake2S block`
			`// This way, we don't need a temp buffer in the Blake2S function.`
			`uchar input[BLAKE2S_BLOCK_SIZE], key[BLAKE2S_BLOCK_SIZE] = { 0 };`

			`// Copy input and key to their buffers`
			`CopyBytes(input, A + bufidx, BLAKE2S_BLOCK_SIZE);`
			`CopyBytes(key, B + bufidx, BLAKE2S_KEY_SIZE);`

			`// PRF`
			`Blake2S((uint )input, (uint )key);`

			`// Calculate the next buffer pointer`
			`bufidx = 0;`

			`for(int x = 0; x < BLAKE2S_OUT_SIZE; ++x)`
			`bufidx += input[x];`

			`// bufidx a uchar now - always mod 255`
			`//bufidx &= (FASTKDF_BUFFER_SIZE - 1);`

			`// Modify the salt buffer`
			`XORBytesInPlace(B + bufidx, input, BLAKE2S_OUT_SIZE);`

			`if(bufidx < BLAKE2S_KEY_SIZE)`
			`{`
			`// Head modified, tail updated`
			`// this was made off the original code... wtf`
			`//CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, min(BLAKE2S_OUT_SIZE, BLAKE2S_KEY_SIZE - bufidx));`
			`CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, BLAKE2S_KEY_SIZE - bufidx);`
			`}`
			`else if((FASTKDF_BUFFER_SIZE - bufidx) < BLAKE2S_OUT_SIZE)`
			`{`
			`// Tail modified, head updated`
			`CopyBytes(B, B + FASTKDF_BUFFER_SIZE, BLAKE2S_OUT_SIZE - (FASTKDF_BUFFER_SIZE - bufidx));`
			`}`
			`}`

			`// Modify and copy into the output buffer`

			`// Damned compiler crashes`
			`// Fuck you, AMD`

			`//for(uint i = 0; i < output_len; ++i, ++bufidx)`
			`// output[i] = B[bufidx] ^ A[i];`

			`uint left = FASTKDF_BUFFER_SIZE - bufidx;`
			`//uint left = (~bufidx) + 1`

			`if(left < output_len)`
			`{`
			`XORBytes(output, B + bufidx, A, left);`
			`XORBytes(output + left, B, A + left, output_len - left);`
			`}`
			`else`
			`{`
			`XORBytes(output, B + bufidx, A, output_len);`
			`}`
			`}`

			`#define SALSA_CORE(state) do { \`
			`state.s4 ^= rotate(state.s0 + state.sc, 7U); state.s8 ^= rotate(state.s4 + state.s0, 9U); state.sc ^= rotate(state.s8 + state.s4, 13U); state.s0 ^= rotate(state.sc + state.s8, 18U); \`
			`state.s9 ^= rotate(state.s5 + state.s1, 7U); state.sd ^= rotate(state.s9 + state.s5, 9U); state.s1 ^= rotate(state.sd + state.s9, 13U); state.s5 ^= rotate(state.s1 + state.sd, 18U); \`
			`state.se ^= rotate(state.sa + state.s6, 7U); state.s2 ^= rotate(state.se + state.sa, 9U); state.s6 ^= rotate(state.s2 + state.se, 13U); state.sa ^= rotate(state.s6 + state.s2, 18U); \`
			`state.s3 ^= rotate(state.sf + state.sb, 7U); state.s7 ^= rotate(state.s3 + state.sf, 9U); state.sb ^= rotate(state.s7 + state.s3, 13U); state.sf ^= rotate(state.sb + state.s7, 18U); \`
			`state.s1 ^= rotate(state.s0 + state.s3, 7U); state.s2 ^= rotate(state.s1 + state.s0, 9U); state.s3 ^= rotate(state.s2 + state.s1, 13U); state.s0 ^= rotate(state.s3 + state.s2, 18U); \`
			`state.s6 ^= rotate(state.s5 + state.s4, 7U); state.s7 ^= rotate(state.s6 + state.s5, 9U); state.s4 ^= rotate(state.s7 + state.s6, 13U); state.s5 ^= rotate(state.s4 + state.s7, 18U); \`
			`state.sb ^= rotate(state.sa + state.s9, 7U); state.s8 ^= rotate(state.sb + state.sa, 9U); state.s9 ^= rotate(state.s8 + state.sb, 13U); state.sa ^= rotate(state.s9 + state.s8, 18U); \`
			`state.sc ^= rotate(state.sf + state.se, 7U); state.sd ^= rotate(state.sc + state.sf, 9U); state.se ^= rotate(state.sd + state.sc, 13U); state.sf ^= rotate(state.se + state.sd, 18U); \`
			`} while(0)`

			`uint16 salsa_small_scalar_rnd(uint16 X)`
			`{`
			`uint16 st = X;`

			`#if SALSA_SMALL_UNROLL == 1`

			`for(int i = 0; i < 10; ++i)`
			`{`
			`SALSA_CORE(st);`
			`}`

			`#elif SALSA_SMALL_UNROLL == 2`

			`for(int i = 0; i < 5; ++i)`
			`{`
			`SALSA_CORE(st);`
			`SALSA_CORE(st);`
			`}`

			`#elif SALSA_SMALL_UNROLL == 3`

			`for(int i = 0; i < 4; ++i)`
			`{`
			`SALSA_CORE(st);`
			`if(i == 3) break;`
			`SALSA_CORE(st);`
			`SALSA_CORE(st);`
			`}`

			`#elif SALSA_SMALL_UNROLL == 4`

			`for(int i = 0; i < 3; ++i)`
			`{`
			`SALSA_CORE(st);`
			`SALSA_CORE(st);`
			`if(i == 2) break;`
			`SALSA_CORE(st);`
			`SALSA_CORE(st);`
			`}`

			`#else`

			`for(int i = 0; i < 2; ++i)`
			`{`
			`SALSA_CORE(st);`
			`SALSA_CORE(st);`
			`SALSA_CORE(st);`
			`SALSA_CORE(st);`
			`SALSA_CORE(st);`
			`}`

			`#endif`

			`return(X + st);`
			`}`

			`#define CHACHA_CORE_PARALLEL(state) do { \`
			`state[0] += state[1]; state[3] = rotate(state[3] ^ state[0], (uint4)(16U, 16U, 16U, 16U)); \`
			`state[2] += state[3]; state[1] = rotate(state[1] ^ state[2], (uint4)(12U, 12U, 12U, 12U)); \`
			`state[0] += state[1]; state[3] = rotate(state[3] ^ state[0], (uint4)(8U, 8U, 8U, 8U)); \`
			`state[2] += state[3]; state[1] = rotate(state[1] ^ state[2], (uint4)(7U, 7U, 7U, 7U)); \`
			`\`
			`state[0] += state[1].yzwx; state[3].wxyz = rotate(state[3].wxyz ^ state[0], (uint4)(16U, 16U, 16U, 16U)); \`
			`state[2].zwxy += state[3].wxyz; state[1].yzwx = rotate(state[1].yzwx ^ state[2].zwxy, (uint4)(12U, 12U, 12U, 12U)); \`
			`state[0] += state[1].yzwx; state[3].wxyz = rotate(state[3].wxyz ^ state[0], (uint4)(8U, 8U, 8U, 8U)); \`
			`state[2].zwxy += state[3].wxyz; state[1].yzwx = rotate(state[1].yzwx ^ state[2].zwxy, (uint4)(7U, 7U, 7U, 7U)); \`
			`} while(0)`

			`uint16 chacha_small_parallel_rnd(uint16 X)`
			`{`
			`uint4 t, st[4];`

			`((uint16 *)st)[0] = X;`

			`#if CHACHA_SMALL_UNROLL == 1`

			`for(int i = 0; i < 10; ++i)`
			`{`
			`CHACHA_CORE_PARALLEL(st);`
			`}`

			`#elif CHACHA_SMALL_UNROLL == 2`

			`for(int i = 0; i < 5; ++i)`
			`{`
			`CHACHA_CORE_PARALLEL(st);`
			`CHACHA_CORE_PARALLEL(st);`
			`}`

			`#elif CHACHA_SMALL_UNROLL == 3`

			`for(int i = 0; i < 4; ++i)`
			`{`
			`CHACHA_CORE_PARALLEL(st);`
			`if(i == 3) break;`
			`CHACHA_CORE_PARALLEL(st);`
			`CHACHA_CORE_PARALLEL(st);`
			`}`

			`#elif CHACHA_SMALL_UNROLL == 4`

			`for(int i = 0; i < 3; ++i)`
			`{`
			`CHACHA_CORE_PARALLEL(st);`
			`CHACHA_CORE_PARALLEL(st);`
			`if(i == 2) break;`
			`CHACHA_CORE_PARALLEL(st);`
			`CHACHA_CORE_PARALLEL(st);`
			`}`

			`#else`

			`for(int i = 0; i < 2; ++i)`
			`{`
			`CHACHA_CORE_PARALLEL(st);`
			`CHACHA_CORE_PARALLEL(st);`
			`CHACHA_CORE_PARALLEL(st);`
			`CHACHA_CORE_PARALLEL(st);`
			`CHACHA_CORE_PARALLEL(st);`
			`}`

			`#endif`

			`return(X + ((uint16 *)st)[0]);`
			`}`

			`void neoscrypt_blkmix(uint16 *XV, bool alg)`
			`{`

			`/* NeoScrypt flow: Scrypt flow:`
			`Xa ^= Xd; M(Xa'); Ya = Xa"; Xa ^= Xb; M(Xa'); Ya = Xa";`
			`Xb ^= Xa"; M(Xb'); Yb = Xb"; Xb ^= Xa"; M(Xb'); Yb = Xb";`
			`Xc ^= Xb"; M(Xc'); Yc = Xc"; Xa" = Ya;`
			`Xd ^= Xc"; M(Xd'); Yd = Xd"; Xb" = Yb;`
			`Xa" = Ya; Xb" = Yc;`
			`Xc" = Yb; Xd" = Yd; */`

			`XV[0] ^= XV[3];`

			`if(!alg)`
			`{`
			`XV[0] = salsa_small_scalar_rnd(XV[0]); XV[1] ^= XV[0];`
			`XV[1] = salsa_small_scalar_rnd(XV[1]); XV[2] ^= XV[1];`
			`XV[2] = salsa_small_scalar_rnd(XV[2]); XV[3] ^= XV[2];`
			`XV[3] = salsa_small_scalar_rnd(XV[3]);`
			`}`
			`else`
			`{`
			`XV[0] = chacha_small_parallel_rnd(XV[0]); XV[1] ^= XV[0];`
			`XV[1] = chacha_small_parallel_rnd(XV[1]); XV[2] ^= XV[1];`
			`XV[2] = chacha_small_parallel_rnd(XV[2]); XV[3] ^= XV[2];`
			`XV[3] = chacha_small_parallel_rnd(XV[3]);`
			`}`

			`XV[1] ^= XV[2];`
			`XV[2] ^= XV[1];`
			`XV[1] ^= XV[2];`
			`}`

			`void ScratchpadStore(__global void V, void X, uchar idx)`
			`{`
			`((__global ulong16 )V)[idx << 1] = ((ulong16 )X)[0];`
			`((__global ulong16 )V)[(idx << 1) + 1] = ((ulong16 )X)[1];`
			`}`

			`void ScratchpadMix(void X, const __global void V, uchar idx)`
			`{`
			`((ulong16 )X)[0] ^= ((__global ulong16 )V)[idx << 1];`
			`((ulong16 )X)[1] ^= ((__global ulong16 )V)[(idx << 1) + 1];`
			`}`

			`void SMix(uint16 X, __global uint16 V, bool flag)`
			`{`
			`#pragma unroll 1`
			`for(int i = 0; i < 128; ++i)`
			`{`
			`ScratchpadStore(V, X, i);`
			`neoscrypt_blkmix(X, flag);`
			`}`

			`#pragma unroll 1`
			`for(int i = 0; i < 128; ++i)`
			`{`
			`const uint idx = convert_uchar(((uint *)X)[48] & 0x7F);`
			`ScratchpadMix(X, V, idx);`
			`neoscrypt_blkmix(X, flag);`
			`}`
			`}`

			`__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))`
			`__kernel void search(__global const uchar* restrict input, __global uint* restrict output, __global uchar *padcache, const uint target)`
			`{`
			`#define CONSTANT_N 128`
			`#define CONSTANT_r 2`
			`// X = CONSTANT_r * 2 * BLOCK_SIZE(64); Z is a copy of X for ChaCha`
			`uint16 X[4], Z[4];`
			`/* V = CONSTANT_N * CONSTANT_r * 2 * BLOCK_SIZE */`
			`__global ulong16 V = (__global ulong16 )(padcache + (0x8000 * (get_global_id(0) % MAX_GLOBAL_THREADS)));`
			`uchar outbuf[32];`
			`uchar data[PASSWORD_LEN];`

			`((ulong8 )data)[0] = ((__global const ulong8 )input)[0];`
			`((ulong )data)[8] = ((__global const ulong )input)[8];`
			`((uint )data)[18] = ((__global const uint )input)[18];`
			`((uint *)data)[19] = get_global_id(0);`

			`// X = KDF(password, salt)`
			`fastkdf(data, data, PASSWORD_LEN, (uchar *)X, 256);`

			`// Process ChaCha 1st, Salsa 2nd and XOR them - run that through PBKDF2`
			`CopyBytes128(Z, X, 2);`

			`// X = SMix(X); X & Z are swapped, repeat.`
			`for(bool flag = false;; ++flag)`
			`{`
			`SMix(X, V, flag);`
			`if(flag) break;`
			`SwapBytes128(X, Z, 256);`
			`}`

			`// blkxor(X, Z)`
			`((ulong16 )X)[0] ^= ((ulong16 )Z)[0];`
			`((ulong16 )X)[1] ^= ((ulong16 )Z)[1];`

			`// output = KDF(password, X)`
			`fastkdf(data, (uchar *)X, FASTKDF_BUFFER_SIZE, outbuf, 32);`
			`if(((uint *)outbuf)[7] <= target) output[atomic_add(output + 0xFF, 1)] = get_global_id(0);`
			`}`