Added Neoscrypt with Wolf9466 improvements.

2025-03-13 06:01:03 +00:00 · 2014-11-19 12:06:16 +01:00 · 2014-11-19 12:06:16 +01:00 · 42737acf66
commit 42737acf66
parent 0557017ca8
11 changed files with 2286 additions and 89 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -65,6 +65,7 @@ sgminer_SOURCES += algorithm/talkcoin.c algorithm/talkcoin.h
 sgminer_SOURCES += algorithm/bitblock.c algorithm/bitblock.h
 sgminer_SOURCES += algorithm/x14.c algorithm/x14.h
 sgminer_SOURCES += algorithm/fresh.c algorithm/fresh.h
+sgminer_SOURCES += algorithm/neoscrypt.c algorithm/neoscrypt.h

 bin_SCRIPTS	= $(top_srcdir)/kernel/*.cl

--- a/algorithm.c
+++ b/algorithm.c
@ -29,6 +29,7 @@
 #include "algorithm/bitblock.h"
 #include "algorithm/x14.h"
 #include "algorithm/fresh.h"
+#include "algorithm/neoscrypt.h"

 #include "compat.h"

@ -92,6 +93,17 @@ static void append_scrypt_compiler_options(struct _build_kernel_data *data, stru
  strcat(data->binary_filename, buf);
 }

+static void append_neoscrypt_compiler_options(struct _build_kernel_data *data, struct cgpu_info *cgpu, struct _algorithm_t *algorithm)
+{
+  char buf[255];
+  sprintf(buf, " -D MAX_GLOBAL_THREADS=%u", 
+    (unsigned int)cgpu->thread_concurrency);
+  strcat(data->compiler_options, buf);
+  
+  sprintf(buf, "tc%u", (unsigned int)cgpu->thread_concurrency);
+  strcat(data->binary_filename, buf);
+}
+
 static void append_x11_compiler_options(struct _build_kernel_data *data, struct cgpu_info *cgpu, struct _algorithm_t *algorithm)
 {
  char buf[255];
@ -140,6 +152,30 @@ static cl_int queue_scrypt_kernel(struct __clState *clState, struct _dev_blk_ctx
  return status;
 }

+static cl_int queue_neoscrypt_kernel(_clState *clState, dev_blk_ctx *blk, __maybe_unused cl_uint threads)
+{
+  cl_kernel *kernel = &clState->kernel;
+  unsigned int num = 0;
+  cl_uint le_target;
+  cl_int status = 0;
+  
+  /* This looks like a unnecessary double cast, but to make sure, that
+   * the target's most significant entry is adressed as a 32-bit value
+   * and not accidently by something else the double cast seems wise.
+   * The compiler will get rid of it anyway. 
+   */
+  le_target = (cl_uint)le32toh(((uint32_t *)blk->work->/*device_*/target)[7]);
+  memcpy(clState->cldata, blk->work->data, 80);
+  status = clEnqueueWriteBuffer(clState->commandQueue, clState->CLbuffer0, true, 0, 80, clState->cldata, 0, NULL, NULL);
+  
+  CL_SET_ARG(clState->CLbuffer0);
+  CL_SET_ARG(clState->outputBuffer);
+  CL_SET_ARG(clState->padbuffer8);
+  CL_SET_ARG(le_target);
+  
+  return status;
+}
+
 static cl_int queue_maxcoin_kernel(struct __clState *clState, struct _dev_blk_ctx *blk, __maybe_unused cl_uint threads)
 {
  cl_kernel *kernel = &clState->kernel;
@ -597,6 +633,11 @@ static algorithm_settings_t algos[] = {
  A_SCRYPT( "zuikkis" ),
 #undef A_SCRYPT

+#define A_NEOSCRYPT(a) \
+  { a, ALGO_NEOSCRYPT, "", 1, 65536, 65536, 0, 0, 0xFF, 0xFFFF000000000000ULL, 0x0000ffffUL, 0, -1, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, neoscrypt_regenhash, queue_neoscrypt_kernel, gen_hash, append_neoscrypt_compiler_options}
+  A_NEOSCRYPT("neoscrypt"),
+#undef A_NEOSCRYPT
+
  // kernels starting from this will have difficulty calculated by using quarkcoin algorithm
 #define A_QUARK(a, b) \
  { a, ALGO_QUARK, "", 256, 256, 256, 0, 0, 0xFF, 0xFFFFFFULL, 0x0000ffffUL, 0, 0, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, b, queue_sph_kernel, gen_hash, append_x11_compiler_options }
--- a/algorithm.h
+++ b/algorithm.h
@ -23,7 +23,8 @@ typedef enum {
  ALGO_TWE,
  ALGO_FUGUE,
  ALGO_NIST,
-  ALGO_FRESH
+  ALGO_FRESH,
+  ALGO_NEOSCRYPT
 } algorithm_type_t;

 extern const char *algorithm_type_str[];
--- a/algorithm/neoscrypt.c
+++ b/algorithm/neoscrypt.c
--- a/algorithm/neoscrypt.h
+++ b/algorithm/neoscrypt.h
@ -0,0 +1,13 @@
+#ifndef NEOSCRYPT_H
+#define NEOSCRYPT_H
+  
+#include "miner.h"
+
+/* The neoscrypt scratch buffer needs 32kBytes memory. */
+#define NEOSCRYPT_SCRATCHBUF_SIZE (32 * 1024)
+
+/* These routines are always available. */
+extern void neoscrypt_regenhash(struct work *work);
+extern void neoscrypt(const unsigned char *input, unsigned char *output, unsigned int profile);
+
+#endif /* NEOSCRYPT_H */
--- a/kernel/neoscrypt.cl
+++ b/kernel/neoscrypt.cl
@ -0,0 +1,525 @@
+/* NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20 */
+/* Adapted and improved for 14.x drivers by Wolf9466 (Wolf`) */
+
+// Stupid AMD compiler ignores the unroll pragma in these two
+#define SALSA_SMALL_UNROLL 3
+#define CHACHA_SMALL_UNROLL 3
+
+// If SMALL_BLAKE2S is defined, BLAKE2S_UNROLL is interpreted
+// as the unroll factor; must divide cleanly into ten.
+// Usually a bad idea.
+//#define SMALL_BLAKE2S
+//#define BLAKE2S_UNROLL 5
+
+#define BLOCK_SIZE           64U
+#define FASTKDF_BUFFER_SIZE 256U
+#ifndef PASSWORD_LEN
+#define PASSWORD_LEN         80U
+#endif
+
+#if !defined(cl_khr_byte_addressable_store)
+#error "Device does not support unaligned stores"
+#endif
+
+// Swaps 128 bytes at a time without using temp vars
+void SwapBytes128(void *restrict A, void *restrict B, uint len)
+{
+	#pragma unroll 2
+	for(int i = 0; i < (len >> 7); ++i)
+	{
+		((ulong16 *)A)[i] ^= ((ulong16 *)B)[i];
+		((ulong16 *)B)[i] ^= ((ulong16 *)A)[i];
+		((ulong16 *)A)[i] ^= ((ulong16 *)B)[i];
+	}
+}
+
+void CopyBytes128(void *restrict dst, const void *restrict src, uint len)
+{
+	#pragma unroll 2
+    for(int i = 0; i < len; ++i)
+		((ulong16 *)dst)[i] = ((ulong16 *)src)[i];
+}
+
+void CopyBytes(void *restrict dst, const void *restrict src, uint len)
+{
+    for(int i = 0; i < len; ++i)
+		((uchar *)dst)[i] = ((uchar *)src)[i];
+}
+
+void XORBytesInPlace(void *restrict dst, const void *restrict src, uint len)
+{
+	for(int i = 0; i < len; ++i)
+		((uchar *)dst)[i] ^= ((uchar *)src)[i];
+}
+
+void XORBytes(void *restrict dst, const void *restrict src1, const void *restrict src2, uint len)
+{
+	#pragma unroll 1
+	for(int i = 0; i < len; ++i)
+		((uchar *)dst)[i] = ((uchar *)src1)[i] ^ ((uchar *)src2)[i];
+}
+
+// Blake2S
+
+#define BLAKE2S_BLOCK_SIZE    64U
+#define BLAKE2S_OUT_SIZE      32U
+#define BLAKE2S_KEY_SIZE      32U
+
+static const __constant uint BLAKE2S_IV[8] =
+{
+    0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
+    0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
+};
+
+static const __constant uchar BLAKE2S_SIGMA[10][16] =
+{
+    {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
+    { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 } ,
+    { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 } ,
+    {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 } ,
+    {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 } ,
+    {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 } ,
+    { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 } ,
+    { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 } ,
+    {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 } ,
+    { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0 } ,
+};
+
+#define BLAKE_G(idx0, idx1, a, b, c, d, key)	do { \
+	a += b + key[BLAKE2S_SIGMA[idx0][idx1]]; \
+	d = rotate(d ^ a, 16U); \
+	c += d; \
+	b = rotate(b ^ c, 20U); \
+	a += b + key[BLAKE2S_SIGMA[idx0][idx1 + 1]]; \
+	d = rotate(d ^ a, 24U); \
+	c += d; \
+	b = rotate(b ^ c, 25U); \
+} while(0)
+
+void Blake2S(uint *restrict inout, const uint *restrict inkey)
+{
+	uint16 V;
+	uint8 tmpblock;
+
+	// Load first block (IV into V.lo) and constants (IV into V.hi)
+	V.lo = V.hi = vload8(0U, BLAKE2S_IV);
+
+	// XOR with initial constant
+	V.s0 ^= 0x01012020;
+
+	// Copy input block for later
+	tmpblock = V.lo;
+
+	// XOR length of message so far (including this block)
+	// There are two uints for this field, but high uint is zero
+	V.sc ^= BLAKE2S_BLOCK_SIZE;
+
+	// Compress state, using the key as the key
+	#ifdef SMALL_BLAKE2S
+	#pragma unroll BLAKE2S_UNROLL
+	#else
+	#pragma unroll
+	#endif
+	for(int x = 0; x < 10; ++x)
+	{
+		BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inkey);
+		BLAKE_G(x, 0x02, V.s1, V.s5, V.s9, V.sd, inkey);
+		BLAKE_G(x, 0x04, V.s2, V.s6, V.sa, V.se, inkey);
+		BLAKE_G(x, 0x06, V.s3, V.s7, V.sb, V.sf, inkey);
+		BLAKE_G(x, 0x08, V.s0, V.s5, V.sa, V.sf, inkey);
+		BLAKE_G(x, 0x0A, V.s1, V.s6, V.sb, V.sc, inkey);
+		BLAKE_G(x, 0x0C, V.s2, V.s7, V.s8, V.sd, inkey);
+		BLAKE_G(x, 0x0E, V.s3, V.s4, V.s9, V.se, inkey);
+	}
+
+	// XOR low part of state with the high part,
+	// then with the original input block.
+	V.lo ^= V.hi ^ tmpblock;
+
+	// Load constants (IV into V.hi)
+	V.hi = vload8(0U, BLAKE2S_IV);
+
+	// Copy input block for later
+	tmpblock = V.lo;
+
+	// XOR length of message into block again
+	V.sc ^= BLAKE2S_BLOCK_SIZE << 1;
+
+	// Last block compression - XOR final constant into state
+	V.se ^= 0xFFFFFFFFU;
+
+	// Compress block, using the input as the key
+	#ifdef SMALL_BLAKE2S
+	#pragma unroll BLAKE2S_UNROLL
+	#else
+	#pragma unroll
+	#endif
+	for(int x = 0; x < 10; ++x)
+	{
+		BLAKE_G(x, 0x00, V.s0, V.s4, V.s8, V.sc, inout);
+		BLAKE_G(x, 0x02, V.s1, V.s5, V.s9, V.sd, inout);
+		BLAKE_G(x, 0x04, V.s2, V.s6, V.sa, V.se, inout);
+		BLAKE_G(x, 0x06, V.s3, V.s7, V.sb, V.sf, inout);
+		BLAKE_G(x, 0x08, V.s0, V.s5, V.sa, V.sf, inout);
+		BLAKE_G(x, 0x0A, V.s1, V.s6, V.sb, V.sc, inout);
+		BLAKE_G(x, 0x0C, V.s2, V.s7, V.s8, V.sd, inout);
+		BLAKE_G(x, 0x0E, V.s3, V.s4, V.s9, V.se, inout);
+	}
+
+	// XOR low part of state with high part, then with input block
+	V.lo ^= V.hi ^ tmpblock;
+
+	// Store result in input/output buffer
+	vstore8(V.lo, 0, inout);
+}
+
+/* FastKDF, a fast buffered key derivation function:
+ * FASTKDF_BUFFER_SIZE must be a power of 2;
+ * password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE;
+ * prf_output_size must be <= prf_key_size; */
+void fastkdf(const uchar *restrict password, const uchar *restrict salt, const uint salt_len, uchar *restrict output, uint output_len)
+{
+
+	/*                    WARNING!
+	 * This algorithm uses byte-wise addressing for memory blocks.
+	 * Or in other words, trying to copy an unaligned memory region
+	 * will significantly slow down the algorithm, when copying uses
+	 * words or bigger entities. It even may corrupt the data, when
+	 * the device does not support it properly.
+	 * Therefore use byte copying, which will not the fastest but at
+	 * least get reliable results. */
+
+	// BLOCK_SIZE            64U
+	// FASTKDF_BUFFER_SIZE  256U
+	// BLAKE2S_BLOCK_SIZE    64U
+	// BLAKE2S_KEY_SIZE      32U
+	// BLAKE2S_OUT_SIZE      32U
+	uchar bufidx = 0;
+	uint8 Abuffer[9], Bbuffer[9] = { (uint8)(0) };
+	uchar *A = (uchar *)Abuffer, *B = (uchar *)Bbuffer;
+
+	// Initialize the password buffer
+	#pragma unroll 1
+	for(int i = 0; i < (FASTKDF_BUFFER_SIZE >> 3); ++i) ((ulong *)A)[i] = ((ulong *)password)[i % 10];
+
+	((uint16 *)(A + FASTKDF_BUFFER_SIZE))[0] = ((uint16 *)password)[0];
+
+	// Initialize the salt buffer
+	if(salt_len == FASTKDF_BUFFER_SIZE)
+	{
+		((ulong16 *)B)[0] = ((ulong16 *)B)[2] = ((ulong16 *)salt)[0];
+		((ulong16 *)B)[1] = ((ulong16 *)B)[3] = ((ulong16 *)salt)[1];
+	}
+	else
+	{
+		// salt_len is 80 bytes here
+		#pragma unroll 1
+		for(int i = 0; i < (FASTKDF_BUFFER_SIZE >> 3); ++i) ((ulong *)B)[i] = ((ulong *)salt)[i % 10];
+
+		// Initialized the rest to zero earlier
+		#pragma unroll 1
+		for(int i = 0; i < 10; ++i) ((ulong *)(B + FASTKDF_BUFFER_SIZE))[i] = ((ulong *)salt)[i];
+	}
+
+    // The primary iteration
+    #pragma unroll 1
+    for(int i = 0; i < 32; ++i)
+    {
+		// Make the key buffer twice the size of the key so it fits a Blake2S block
+		// This way, we don't need a temp buffer in the Blake2S function.
+		uchar input[BLAKE2S_BLOCK_SIZE], key[BLAKE2S_BLOCK_SIZE] = { 0 };
+
+		// Copy input and key to their buffers
+		CopyBytes(input, A + bufidx, BLAKE2S_BLOCK_SIZE);
+		CopyBytes(key, B + bufidx, BLAKE2S_KEY_SIZE);
+
+        // PRF
+        Blake2S((uint *)input, (uint *)key);
+
+        // Calculate the next buffer pointer
+		bufidx = 0;
+
+		for(int x = 0; x < BLAKE2S_OUT_SIZE; ++x)
+			bufidx += input[x];
+
+		// bufidx a uchar now - always mod 255
+		//bufidx &= (FASTKDF_BUFFER_SIZE - 1);
+
+        // Modify the salt buffer
+		XORBytesInPlace(B + bufidx, input, BLAKE2S_OUT_SIZE);
+
+		if(bufidx < BLAKE2S_KEY_SIZE)
+		{
+			// Head modified, tail updated
+			// this was made off the original code... wtf
+			//CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, min(BLAKE2S_OUT_SIZE, BLAKE2S_KEY_SIZE - bufidx));
+			CopyBytes(B + FASTKDF_BUFFER_SIZE + bufidx, B + bufidx, BLAKE2S_KEY_SIZE - bufidx);
+		}
+		else if((FASTKDF_BUFFER_SIZE - bufidx) < BLAKE2S_OUT_SIZE)
+		{
+			// Tail modified, head updated
+			CopyBytes(B, B + FASTKDF_BUFFER_SIZE, BLAKE2S_OUT_SIZE - (FASTKDF_BUFFER_SIZE - bufidx));
+		}
+    }
+
+    // Modify and copy into the output buffer
+
+    // Damned compiler crashes
+    // Fuck you, AMD
+
+	//for(uint i = 0; i < output_len; ++i, ++bufidx)
+	//	output[i] = B[bufidx] ^ A[i];
+
+    uint left = FASTKDF_BUFFER_SIZE - bufidx;
+	//uint left = (~bufidx) + 1
+
+	if(left < output_len)
+	{
+		XORBytes(output, B + bufidx, A, left);
+		XORBytes(output + left, B, A + left, output_len - left);
+	}
+	else
+	{
+		XORBytes(output, B + bufidx, A, output_len);
+	}
+}
+
+#define SALSA_CORE(state)	do { \
+	state.s4 ^= rotate(state.s0 + state.sc, 7U); state.s8 ^= rotate(state.s4 + state.s0, 9U); state.sc ^= rotate(state.s8 + state.s4, 13U); state.s0 ^= rotate(state.sc + state.s8, 18U); \
+	state.s9 ^= rotate(state.s5 + state.s1, 7U); state.sd ^= rotate(state.s9 + state.s5, 9U); state.s1 ^= rotate(state.sd + state.s9, 13U); state.s5 ^= rotate(state.s1 + state.sd, 18U); \
+	state.se ^= rotate(state.sa + state.s6, 7U); state.s2 ^= rotate(state.se + state.sa, 9U); state.s6 ^= rotate(state.s2 + state.se, 13U); state.sa ^= rotate(state.s6 + state.s2, 18U); \
+	state.s3 ^= rotate(state.sf + state.sb, 7U); state.s7 ^= rotate(state.s3 + state.sf, 9U); state.sb ^= rotate(state.s7 + state.s3, 13U); state.sf ^= rotate(state.sb + state.s7, 18U); \
+	state.s1 ^= rotate(state.s0 + state.s3, 7U); state.s2 ^= rotate(state.s1 + state.s0, 9U); state.s3 ^= rotate(state.s2 + state.s1, 13U); state.s0 ^= rotate(state.s3 + state.s2, 18U); \
+	state.s6 ^= rotate(state.s5 + state.s4, 7U); state.s7 ^= rotate(state.s6 + state.s5, 9U); state.s4 ^= rotate(state.s7 + state.s6, 13U); state.s5 ^= rotate(state.s4 + state.s7, 18U); \
+	state.sb ^= rotate(state.sa + state.s9, 7U); state.s8 ^= rotate(state.sb + state.sa, 9U); state.s9 ^= rotate(state.s8 + state.sb, 13U); state.sa ^= rotate(state.s9 + state.s8, 18U); \
+	state.sc ^= rotate(state.sf + state.se, 7U); state.sd ^= rotate(state.sc + state.sf, 9U); state.se ^= rotate(state.sd + state.sc, 13U); state.sf ^= rotate(state.se + state.sd, 18U); \
+} while(0)
+
+uint16 salsa_small_scalar_rnd(uint16 X)
+{
+	uint16 st = X;
+
+	#if SALSA_SMALL_UNROLL == 1
+
+	for(int i = 0; i < 10; ++i)
+	{
+		SALSA_CORE(st);
+	}
+
+	#elif SALSA_SMALL_UNROLL == 2
+
+	for(int i = 0; i < 5; ++i)
+	{
+		SALSA_CORE(st);
+		SALSA_CORE(st);
+	}
+
+	#elif SALSA_SMALL_UNROLL == 3
+
+	for(int i = 0; i < 4; ++i)
+	{
+		SALSA_CORE(st);
+		if(i == 3) break;
+		SALSA_CORE(st);
+		SALSA_CORE(st);
+	}
+
+	#elif SALSA_SMALL_UNROLL == 4
+
+	for(int i = 0; i < 3; ++i)
+	{
+		SALSA_CORE(st);
+		SALSA_CORE(st);
+		if(i == 2) break;
+		SALSA_CORE(st);
+		SALSA_CORE(st);
+	}
+
+	#else
+
+	for(int i = 0; i < 2; ++i)
+	{
+		SALSA_CORE(st);
+		SALSA_CORE(st);
+		SALSA_CORE(st);
+		SALSA_CORE(st);
+		SALSA_CORE(st);
+	}
+
+	#endif
+
+	return(X + st);
+}
+
+#define CHACHA_CORE_PARALLEL(state)	do { \
+	state[0] += state[1]; state[3] = rotate(state[3] ^ state[0], (uint4)(16U, 16U, 16U, 16U)); \
+	state[2] += state[3]; state[1] = rotate(state[1] ^ state[2], (uint4)(12U, 12U, 12U, 12U)); \
+	state[0] += state[1]; state[3] = rotate(state[3] ^ state[0], (uint4)(8U, 8U, 8U, 8U)); \
+	state[2] += state[3]; state[1] = rotate(state[1] ^ state[2], (uint4)(7U, 7U, 7U, 7U)); \
+	\
+	state[0] += state[1].yzwx; state[3].wxyz = rotate(state[3].wxyz ^ state[0], (uint4)(16U, 16U, 16U, 16U)); \
+	state[2].zwxy += state[3].wxyz; state[1].yzwx = rotate(state[1].yzwx ^ state[2].zwxy, (uint4)(12U, 12U, 12U, 12U)); \
+	state[0] += state[1].yzwx; state[3].wxyz = rotate(state[3].wxyz ^ state[0], (uint4)(8U, 8U, 8U, 8U)); \
+	state[2].zwxy += state[3].wxyz; state[1].yzwx = rotate(state[1].yzwx ^ state[2].zwxy, (uint4)(7U, 7U, 7U, 7U)); \
+} while(0)
+
+uint16 chacha_small_parallel_rnd(uint16 X)
+{
+	uint4 t, st[4];
+
+	((uint16 *)st)[0] = X;
+
+	#if CHACHA_SMALL_UNROLL == 1
+
+	for(int i = 0; i < 10; ++i)
+	{
+		CHACHA_CORE_PARALLEL(st);
+	}
+
+	#elif CHACHA_SMALL_UNROLL == 2
+
+	for(int i = 0; i < 5; ++i)
+	{
+		CHACHA_CORE_PARALLEL(st);
+		CHACHA_CORE_PARALLEL(st);
+	}
+
+	#elif CHACHA_SMALL_UNROLL == 3
+
+	for(int i = 0; i < 4; ++i)
+	{
+		CHACHA_CORE_PARALLEL(st);
+		if(i == 3) break;
+		CHACHA_CORE_PARALLEL(st);
+		CHACHA_CORE_PARALLEL(st);
+	}
+
+	#elif CHACHA_SMALL_UNROLL == 4
+
+	for(int i = 0; i < 3; ++i)
+	{
+		CHACHA_CORE_PARALLEL(st);
+		CHACHA_CORE_PARALLEL(st);
+		if(i == 2) break;
+		CHACHA_CORE_PARALLEL(st);
+		CHACHA_CORE_PARALLEL(st);
+	}
+
+	#else
+
+	for(int i = 0; i < 2; ++i)
+	{
+		CHACHA_CORE_PARALLEL(st);
+		CHACHA_CORE_PARALLEL(st);
+		CHACHA_CORE_PARALLEL(st);
+		CHACHA_CORE_PARALLEL(st);
+		CHACHA_CORE_PARALLEL(st);
+	}
+
+	#endif
+
+	return(X + ((uint16 *)st)[0]);
+}
+
+void neoscrypt_blkmix(uint16 *XV, bool alg)
+{
+
+    /* NeoScrypt flow:                   Scrypt flow:
+         Xa ^= Xd;  M(Xa'); Ya = Xa";      Xa ^= Xb;  M(Xa'); Ya = Xa";
+         Xb ^= Xa"; M(Xb'); Yb = Xb";      Xb ^= Xa"; M(Xb'); Yb = Xb";
+         Xc ^= Xb"; M(Xc'); Yc = Xc";      Xa" = Ya;
+         Xd ^= Xc"; M(Xd'); Yd = Xd";      Xb" = Yb;
+         Xa" = Ya; Xb" = Yc;
+         Xc" = Yb; Xd" = Yd; */
+
+	XV[0] ^= XV[3];
+
+	if(!alg)
+	{
+		XV[0] = salsa_small_scalar_rnd(XV[0]); XV[1] ^= XV[0];
+		XV[1] = salsa_small_scalar_rnd(XV[1]); XV[2] ^= XV[1];
+		XV[2] = salsa_small_scalar_rnd(XV[2]); XV[3] ^= XV[2];
+		XV[3] = salsa_small_scalar_rnd(XV[3]);
+	}
+	else
+	{
+		XV[0] = chacha_small_parallel_rnd(XV[0]); XV[1] ^= XV[0];
+		XV[1] = chacha_small_parallel_rnd(XV[1]); XV[2] ^= XV[1];
+		XV[2] = chacha_small_parallel_rnd(XV[2]); XV[3] ^= XV[2];
+		XV[3] = chacha_small_parallel_rnd(XV[3]);
+	}
+
+	XV[1] ^= XV[2];
+	XV[2] ^= XV[1];
+	XV[1] ^= XV[2];
+}
+
+void ScratchpadStore(__global void *V, void *X, uchar idx)
+{
+	((__global ulong16 *)V)[idx << 1] = ((ulong16 *)X)[0];
+	((__global ulong16 *)V)[(idx << 1) + 1] = ((ulong16 *)X)[1];
+}
+
+void ScratchpadMix(void *X, const __global void *V, uchar idx)
+{
+	((ulong16 *)X)[0] ^= ((__global ulong16 *)V)[idx << 1];
+	((ulong16 *)X)[1] ^= ((__global ulong16 *)V)[(idx << 1) + 1];
+}
+
+void SMix(uint16 *X, __global uint16 *V, bool flag)
+{
+	#pragma unroll 1
+	for(int i = 0; i < 128; ++i)
+	{
+		ScratchpadStore(V, X, i);
+		neoscrypt_blkmix(X, flag);
+	}
+
+	#pragma unroll 1
+	for(int i = 0; i < 128; ++i)
+	{
+		const uint idx = convert_uchar(((uint *)X)[48] & 0x7F);
+		ScratchpadMix(X, V, idx);
+		neoscrypt_blkmix(X, flag);
+	}
+}
+
+__attribute__((reqd_work_group_size(WORKSIZE, 1, 1)))
+__kernel void search(__global const uchar* restrict input, __global uint* restrict output, __global uchar *padcache, const uint target)
+{
+#define CONSTANT_N 128
+#define CONSTANT_r 2
+	// X = CONSTANT_r * 2 * BLOCK_SIZE(64); Z is a copy of X for ChaCha
+	uint16 X[4], Z[4];
+	/* V = CONSTANT_N * CONSTANT_r * 2 * BLOCK_SIZE */
+	__global ulong16 *V = (__global ulong16 *)(padcache + (0x8000 * (get_global_id(0) % MAX_GLOBAL_THREADS)));
+	uchar outbuf[32];
+	uchar data[PASSWORD_LEN];
+
+	((ulong8 *)data)[0] = ((__global const ulong8 *)input)[0];
+	((ulong *)data)[8] = ((__global const ulong *)input)[8];
+	((uint *)data)[18] = ((__global const uint *)input)[18];
+	((uint *)data)[19] = get_global_id(0);
+
+    // X = KDF(password, salt)
+	fastkdf(data, data, PASSWORD_LEN, (uchar *)X, 256);
+
+    // Process ChaCha 1st, Salsa 2nd and XOR them - run that through PBKDF2
+    CopyBytes128(Z, X, 2);
+
+    // X = SMix(X); X & Z are swapped, repeat.
+    for(bool flag = false;; ++flag)
+    {
+		SMix(X, V, flag);
+		if(flag) break;
+		SwapBytes128(X, Z, 256);
+	}
+
+	// blkxor(X, Z)
+	((ulong16 *)X)[0] ^= ((ulong16 *)Z)[0];
+	((ulong16 *)X)[1] ^= ((ulong16 *)Z)[1];
+
+	// output = KDF(password, X)
+	fastkdf(data, (uchar *)X, FASTKDF_BUFFER_SIZE, outbuf, 32);
+	if(((uint *)outbuf)[7] <= target) output[atomic_add(output + 0xFF, 1)] = get_global_id(0);
+}
--- a/miner.h
+++ b/miner.h
@ -1100,6 +1100,7 @@ extern pthread_cond_t restart_cond;
 extern void clear_stratum_shares(struct pool *pool);
 extern void clear_pool_work(struct pool *pool);
 extern void set_target(unsigned char *dest_target, double diff, double diff_multiplier2);
+extern void set_target_neoscrypt(unsigned char *target, double diff);

 extern void kill_work(void);

--- a/ocl.c
+++ b/ocl.c
@ -34,6 +34,7 @@
 #include "ocl.h"
 #include "ocl/build_kernel.h"
 #include "ocl/binary_kernel.h"
+#include "algorithm/neoscrypt.h"

 /* FIXME: only here for global config vars, replace with configuration.h
 * or similar as soon as config is in a struct instead of littered all
@ -344,19 +345,55 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
    cgpu->lookup_gap = 2;
  }

-  if (!cgpu->opt_tc) {
+  // neoscrypt calculates TC differently
+  if (!safe_cmp(cgpu->algorithm.name, "neoscrypt")) {
+    int max_int = ((cgpu->dynamic) ? MAX_INTENSITY : cgpu->intensity);
+    size_t glob_thread_count = 1UL << max_int;
+
+    // if TC is entered by user, use that value... otherwise use default
+    cgpu->thread_concurrency = ((cgpu->opt_tc) ? cgpu->opt_tc : ((glob_thread_count < cgpu->work_size) ? cgpu->work_size : glob_thread_count));
+
+    // if TC * scratchbuf size is too big for memory... reduce to max
+    if (((uint64_t)cgpu->thread_concurrency * NEOSCRYPT_SCRATCHBUF_SIZE) >(uint64_t)cgpu->max_alloc) {
+      /* Selected intensity will not run on this GPU. Not enough memory.
+      * Adapt the memory setting. */
+      glob_thread_count = cgpu->max_alloc / NEOSCRYPT_SCRATCHBUF_SIZE;
+
+      /* Find highest significant bit in glob_thread_count, which gives
+      * the intensity. */
+      while (max_int && ((1U << max_int) & glob_thread_count) == 0) {
+        --max_int;
+      }
+
+      /* Check if max_intensity is >0. */
+      if (max_int < MIN_INTENSITY) {
+        applog(LOG_ERR, "GPU %d: Max intensity is below minimum.", gpu);
+        max_int = MIN_INTENSITY;
+      }
+
+      cgpu->intensity = max_int;
+      cgpu->thread_concurrency = 1U << max_int;
+    }
+
+    applog(LOG_DEBUG, "GPU %d: computing max. global thread count to %u", gpu, (unsigned)(cgpu->thread_concurrency));
+
+  }
+  else if (!cgpu->opt_tc) {
    unsigned int sixtyfours;

    sixtyfours =  cgpu->max_alloc / 131072 / 64 / (algorithm->n/1024) - 1;
    cgpu->thread_concurrency = sixtyfours * 64;
    if (cgpu->shaders && cgpu->thread_concurrency > cgpu->shaders) {
      cgpu->thread_concurrency -= cgpu->thread_concurrency % cgpu->shaders;
-      if (cgpu->thread_concurrency > cgpu->shaders * 5)
+      if (cgpu->thread_concurrency > cgpu->shaders * 5) {
        cgpu->thread_concurrency = cgpu->shaders * 5;
+      }
    }
    applog(LOG_DEBUG, "GPU %d: selecting thread concurrency of %d", gpu, (int)(cgpu->thread_concurrency));
-  } else
+  }
+  else {
    cgpu->thread_concurrency = cgpu->opt_tc;
+  }


  cl_uint slot, cpnd;
@ -445,17 +482,36 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
  }

  size_t bufsize;
+  size_t readbufsize = 128;

  if (algorithm->rw_buffer_size < 0) {
-    size_t ipt = (algorithm->n / cgpu->lookup_gap +
-            (algorithm->n % cgpu->lookup_gap > 0));
-    bufsize = 128 * ipt * cgpu->thread_concurrency;
-  } else
-    bufsize = (size_t) algorithm->rw_buffer_size;
+    // calc buffer size for neoscrypt
+    if (!safe_cmp(algorithm->name, "neoscrypt")) {
+      /* The scratch/pad-buffer needs 32kBytes memory per thread. */
+      bufsize = NEOSCRYPT_SCRATCHBUF_SIZE * cgpu->thread_concurrency;
+
+      /* This is the input buffer. For neoscrypt this is guaranteed to be
+      * 80 bytes only. */
+      readbufsize = 80;
+
+      applog(LOG_DEBUG, "Neoscrypt buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize);
+      // scrypt/n-scrypt
+    }
+    else {
+      size_t ipt = (algorithm->n / cgpu->lookup_gap + (algorithm->n % cgpu->lookup_gap > 0));
+      bufsize = 128 * ipt * cgpu->thread_concurrency;
+      applog(LOG_DEBUG, "Scrypt buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize);
+    }
+  }
+  else {
+    bufsize = (size_t)algorithm->rw_buffer_size;
+    applog(LOG_DEBUG, "Buffer sizes: %lu RW, %lu R", (unsigned long)bufsize, (unsigned long)readbufsize);
+  }

  clState->padbuffer8 = NULL;

  if (bufsize > 0) {
+    applog(LOG_DEBUG, "Creating read/write buffer sized %lu", (unsigned long)bufsize);
    /* Use the max alloc value which has been rounded to a power of
     * 2 greater >= required amount earlier */
    if (bufsize > cgpu->max_alloc) {
@ -463,7 +519,6 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
           gpu, (unsigned long)(cgpu->max_alloc));
      applog(LOG_WARNING, "Your settings come to %lu", (unsigned long)bufsize);
    }
-    applog(LOG_DEBUG, "Creating buffer sized %lu", (unsigned long)bufsize);

    /* This buffer is weird and might work to some degree even if
     * the create buffer call has apparently failed, so check if we
@ -475,11 +530,14 @@ _clState *initCl(unsigned int gpu, char *name, size_t nameSize, algorithm_t *alg
    }
  }

-  clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, 128, NULL, &status);
+  applog(LOG_DEBUG, "Using read buffer sized %lu", (unsigned long)readbufsize);
+  clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, readbufsize, NULL, &status);
  if (status != CL_SUCCESS) {
    applog(LOG_ERR, "Error %d: clCreateBuffer (CLbuffer0)", status);
    return NULL;
  }
+  
+  applog(LOG_DEBUG, "Using output buffer sized %lu", BUFFERSIZE);
  clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status);

  if (status != CL_SUCCESS) {
--- a/sgminer.c
+++ b/sgminer.c
@ -2019,11 +2019,26 @@ static void update_gbt(struct pool *pool)
 /* Return the work coin/network difficulty */
 static double get_work_blockdiff(const struct work *work)
 {
-  uint8_t pow = work->data[72];
-  int powdiff = (8 * (0x1d - 3)) - (8 * (pow - 3));
-  uint32_t diff32 = be32toh(*((uint32_t *)(work->data + 72))) & 0x00FFFFFF;
-  double numerator = work->pool->algorithm.diff_numerator << powdiff;
-  return numerator / (double)diff32;
+  uint64_t diff64;
+  double numerator;
+
+  // Neoscrypt has the data reversed
+  if (!safe_cmp(work->pool->algorithm.name, "neoscrypt")) {
+    diff64 = bswap_64(((uint64_t)(be32toh(*((uint32_t *)(work->data + 72))) & 0xFFFFFF00)) << 8);
+    numerator = (double)work->pool->algorithm.diff_numerator;
+  }
+  else {
+    uint8_t pow = work->data[72];
+    int powdiff = (8 * (0x1d - 3)) - (8 * (pow - 3));;
+    diff64 = be32toh(*((uint32_t *)(work->data + 72))) & 0x0000000000FFFFFF;
+    numerator = work->pool->algorithm.diff_numerator << powdiff;
+  }
+
+  if (unlikely(!diff64)) {
+    diff64 = 1;
+  }
+
+  return numerator / (double)diff64;
 }

 static void gen_gbt_work(struct pool *pool, struct work *work)
@ -2073,7 +2088,10 @@ static void gen_gbt_work(struct pool *pool, struct work *work)
    free(header);
  }

-  calc_midstate(work);
+  // Neoscrypt doesn't calc_midstate()
+  if (safe_cmp(pool->algorithm.name, "neoscrypt")) {
+    calc_midstate(work);
+  }
  local_work++;
  work->pool = pool;
  work->gbt = true;
@ -2189,10 +2207,15 @@ static bool getwork_decode(json_t *res_val, struct work *work)
    return false;
  }

-  if (!jobj_binary(res_val, "midstate", work->midstate, sizeof(work->midstate), false)) {
-    // Calculate it ourselves
-    applog(LOG_DEBUG, "%s: Calculating midstate locally", isnull(get_pool_name(work->pool), ""));
-    calc_midstate(work);
+  // Neoscrypt doesn't calc midstate
+  if (safe_cmp(work->pool->algorithm.name, "neoscrypt")) {
+    if (!jobj_binary(res_val, "midstate", work->midstate, sizeof(work->midstate), false)) {
+      // Calculate it ourselves
+      if (opt_morenotices) {
+        applog(LOG_DEBUG, "%s: Calculating midstate locally", isnull(get_pool_name(work->pool), ""));
+      }
+      calc_midstate(work);
+    }
  }

  if (unlikely(!jobj_binary(res_val, "target", work->target, sizeof(work->target), true))) {
@ -2936,8 +2959,8 @@ static bool submit_upstream_work(struct work *work, CURL *curl, char *curl_err_s

  endian_flip128(work->data, work->data);

-  /* build hex string */
-  hexstr = bin2hex(work->data, sizeof(work->data));
+  /* build hex string - Make sure to restrict to 80 bytes for Neoscrypt */
+  hexstr = bin2hex(work->data, ((!safe_cmp(work->pool->algorithm.name, "neoscrypt")) ? 80 : sizeof(work->data)));

  /* build JSON-RPC request */
  if (work->gbt) {
@ -3304,11 +3327,19 @@ static void calc_diff(struct work *work, double known)

    d64 = work->pool->algorithm.diff_multiplier2 * truediffone;

-    dcut64 = le256todouble(work->target);
+    applog(LOG_DEBUG, "calc_diff() algorithm = %s", work->pool->algorithm.name);
+    // Neoscrypt
+    if (!safe_cmp(work->pool->algorithm.name, "neoscrypt")) {
+      dcut64 = (double)*((uint64_t *)(work->target + 22));
+    }
+    else {
+      dcut64 = le256todouble(work->target);  
+    }
    if (unlikely(!dcut64))
      dcut64 = 1;
    work->work_difficulty = d64 / dcut64;
  }
+
  difficulty = work->work_difficulty;

  pool_stats->last_diff = difficulty;
@ -5465,8 +5496,21 @@ static void *stratum_sthread(void *userdata)
    sshare->sshare_time = time(NULL);
    /* This work item is freed in parse_stratum_response */
    sshare->work = work;
-    nonce = *((uint32_t *)(work->data + 76));
+
+    applog(LOG_DEBUG, "stratum_sthread() algorithm = %s", pool->algorithm.name);
+
+    // Neoscrypt is little endian
+    if (!safe_cmp(pool->algorithm.name, "neoscrypt")) {
+      nonce = htobe32(*((uint32_t *)(work->data + 76)));
+      //*((uint32_t *)nonce2) = htole32(work->nonce2);
+    }
+    else {
+      nonce = *((uint32_t *)(work->data + 76));
+    }
    __bin2hex(noncehex, (const unsigned char *)&nonce, 4);
+
+    *((uint64_t *)nonce2) = htole64(work->nonce2);
+    __bin2hex(nonce2hex, nonce2, work->nonce2_len);
    memset(s, 0, 1024);

    mutex_lock(&sshare_lock);
@ -5474,10 +5518,6 @@ static void *stratum_sthread(void *userdata)
    sshare->id = swork_id++;
    mutex_unlock(&sshare_lock);

-    nonce2_64 = (uint64_t *)nonce2;
-    *nonce2_64 = htole64(work->nonce2);
-    __bin2hex(nonce2hex, nonce2, work->nonce2_len);
-
    snprintf(s, sizeof(s),
      "{\"params\": [\"%s\", \"%s\", \"%s\", \"%s\", \"%s\"], \"id\": %d, \"method\": \"mining.submit\"}",
      pool->rpc_user, work->job_id, nonce2hex, work->ntime, noncehex, sshare->id);
@ -5885,6 +5925,50 @@ void set_target(unsigned char *dest_target, double diff, double diff_multiplier2
  memcpy(dest_target, target, 32);
 }

+/*****************************************************
+* Special set_target() function for Neoscrypt
+****************************************************/
+void set_target_neoscrypt(unsigned char *target, double diff)
+{
+  uint64_t m;
+  int k;
+
+  diff /= 65536.0;
+  for (k = 6; k > 0 && diff > 1.0; --k) {
+    diff /= 4294967296.0;
+  }
+
+  m = 4294901760.0 / diff;
+
+  if (m == 0 && k == 6) {
+    memset(target, 0xff, 32);
+  }
+  else {
+    memset(target, 0, 32);
+    ((uint32_t *)target)[k] = (uint32_t)m;
+    ((uint32_t *)target)[k + 1] = (uint32_t)(m >> 32);
+  }
+
+  if (opt_debug) {
+    /* The target is computed in this systems endianess and stored
+    * in its endianess on a uint32-level. But because the target are
+    * eight uint32s, they are stored in mixed mode, i.e., each uint32
+    * is stored in the local endianess, but the least significant bit
+    * is stored in target[0] bit 0.
+    *
+    * To print this large number in a native human readable form the
+    * order of the array entries is swapped, i.e., target[7] <-> target[0]
+    * and each array entry is byte swapped to have the least significant
+    * bit to the right. */
+    uint32_t swaped[8];
+    swab256(swaped, target);
+    char *htarget = bin2hex((unsigned char *)swaped, 32);
+
+    applog(LOG_DEBUG, "Generated neoscrypt target 0x%s", htarget);
+    free(htarget);
+  }
+}
+
 /* Generates stratum based work based on the most recent notify information
 * from the pool. This will keep generating work while a pool is down so we use
 * other means to detect when the pool has died in stratum_thread */
@ -5893,12 +5977,12 @@ static void gen_stratum_work(struct pool *pool, struct work *work)
  unsigned char merkle_root[32], merkle_sha[64];
  uint32_t *data32, *swap32;
  uint64_t nonce2le;
-  int i;
+  int i, j;

  cg_wlock(&pool->data_lock);

  /* Update coinbase. Always use an LE encoded nonce2 to fill in values
-   * from left to right and prevent overflow errors with small n2sizes */
+  * from left to right and prevent overflow errors with small n2sizes */
  nonce2le = htole64(pool->nonce2);
  memcpy(pool->coinbase + pool->nonce2_offset, &nonce2le, pool->n2size);
  work->nonce2 = pool->nonce2++;
@ -5915,16 +5999,50 @@ static void gen_stratum_work(struct pool *pool, struct work *work)
    gen_hash(merkle_sha, 64, merkle_root);
    memcpy(merkle_sha, merkle_root, 32);
  }
-  data32 = (uint32_t *)merkle_sha;
-  swap32 = (uint32_t *)merkle_root;
-  flip32(swap32, data32);

-  /* Copy the data template from header_bin */
-  memcpy(work->data, pool->header_bin, 128);
-  memcpy(work->data + pool->merkle_offset, merkle_root, 32);
+  applog(LOG_DEBUG, "gen_stratum_work() - algorithm = %s", pool->algorithm.name);
+
+  // Different for Neoscrypt because of Little Endian
+  if (!safe_cmp(pool->algorithm.name, "neoscrypt")) {
+    /* Incoming data is in little endian. */
+    memcpy(merkle_root, merkle_sha, 32);
+
+    uint32_t temp = pool->merkle_offset / sizeof(uint32_t), i;
+    /* Put version (4 byte) + prev_hash (4 byte* 8) but big endian encoded
+    * into work. */
+    for (i = 0; i < temp; ++i) {
+      ((uint32_t *)work->data)[i] = be32toh(((uint32_t *)pool->header_bin)[i]);
+    }
+
+    /* Now add the merkle_root (4 byte* 8), but it is encoded in little endian. */
+    temp += 8;
+
+    for (j = 0; i < temp; ++i, ++j) {
+      ((uint32_t *)work->data)[i] = le32toh(((uint32_t *)merkle_root)[j]);
+    }
+
+    /* Add the time encoded in big endianess. */
+    hex2bin((unsigned char *)&temp, pool->swork.ntime, 4);
+
+    /* Add the nbits (big endianess). */
+    ((uint32_t *)work->data)[17] = be32toh(temp);
+    hex2bin((unsigned char *)&temp, pool->swork.nbit, 4);
+    ((uint32_t *)work->data)[18] = be32toh(temp);
+    ((uint32_t *)work->data)[20] = 0x80000000;
+    ((uint32_t *)work->data)[31] = 0x00000280;
+  }
+  else {
+    data32 = (uint32_t *)merkle_sha;
+    swap32 = (uint32_t *)merkle_root;
+    flip32(swap32, data32);
+
+    /* Copy the data template from header_bin */
+    memcpy(work->data, pool->header_bin, 128);
+    memcpy(work->data + pool->merkle_offset, merkle_root, 32);
+  }

  /* Store the stratum work diff to check it still matches the pool's
-   * stratum diff when submitting shares */
+  * stratum diff when submitting shares */
  work->sdiff = pool->swork.diff;

  /* Copy parameters required for share submission */
@ -5941,13 +6059,19 @@ static void gen_stratum_work(struct pool *pool, struct work *work)
    applog(LOG_DEBUG, "Generated stratum merkle %s", merkle_hash);
    applog(LOG_DEBUG, "Generated stratum header %s", header);
    applog(LOG_DEBUG, "Work job_id %s nonce2 %"PRIu64" ntime %s", work->job_id,
-           work->nonce2, work->ntime);
+      work->nonce2, work->ntime);
    free(header);
    free(merkle_hash);
  }

-  calc_midstate(work);
-  set_target(work->target, work->sdiff, pool->algorithm.diff_multiplier2);
+  // For Neoscrypt use set_target_neoscrypt() function
+  if (!safe_cmp(pool->algorithm.name, "neoscrypt")) {
+    set_target_neoscrypt(work->target, work->sdiff);
+  }
+  else {
+    calc_midstate(work);
+    set_target(work->target, work->sdiff, pool->algorithm.diff_multiplier2);
+  }

  local_work++;
  work->pool = pool;
@ -6124,15 +6248,17 @@ static unsigned long compare_pool_settings(struct pool *pool1, struct pool *pool
  unsigned int options = 0;
  const char *opt1, *opt2;

-  if(!pool1 || !pool2)
+  applog(LOG_DEBUG, "compare_pool_settings()");
+
+  if (!pool1 || !pool2)
    return 0;

  //compare pool devices
-  opt1 = get_pool_setting(pool1->devices, ((!empty_string(default_profile.devices))?default_profile.devices:"all"));
-  opt2 = get_pool_setting(pool2->devices, ((!empty_string(default_profile.devices))?default_profile.devices:"all"));
+  opt1 = get_pool_setting(pool1->devices, ((!empty_string(default_profile.devices)) ? default_profile.devices : "all"));
+  opt2 = get_pool_setting(pool2->devices, ((!empty_string(default_profile.devices)) ? default_profile.devices : "all"));

  //changing devices means a hard reset of mining threads
-  if(strcasecmp(opt1, opt2) != 0)
+  if (strcasecmp(opt1, opt2) != 0)
    options |= (SWITCHER_APPLY_DEVICE | SWITCHER_HARD_RESET);

  //compare gpu threads
@ -6140,11 +6266,11 @@ static unsigned long compare_pool_settings(struct pool *pool1, struct pool *pool
  opt2 = get_pool_setting(pool2->gpu_threads, default_profile.gpu_threads);

  //changing gpu threads means a hard reset of mining threads
-  if(strcasecmp(opt1, opt2) != 0)
+  if (strcasecmp(opt1, opt2) != 0)
    options |= (SWITCHER_APPLY_GT | SWITCHER_HARD_RESET);

  //compare algorithm
-  if(!cmp_algorithm(&pool1->algorithm, &pool2->algorithm))
+  if (!cmp_algorithm(&pool1->algorithm, &pool2->algorithm))
    options |= (SWITCHER_APPLY_ALGO | SWITCHER_SOFT_RESET);

  //lookup gap
@ -6152,46 +6278,46 @@ static unsigned long compare_pool_settings(struct pool *pool1, struct pool *pool
  opt2 = get_pool_setting(pool2->lookup_gap, default_profile.lookup_gap);

  //lookup gap means soft reset but only if hard reset isnt set
-  if(strcasecmp(opt1, opt2) != 0)
+  if (strcasecmp(opt1, opt2) != 0)
    options |= (SWITCHER_APPLY_LG | SWITCHER_SOFT_RESET);

  //intensities
  opt1 = get_pool_setting(pool1->rawintensity, default_profile.rawintensity);
  opt2 = get_pool_setting(pool2->rawintensity, default_profile.rawintensity);

-  if(strcasecmp(opt1, opt2) != 0)
+  if (strcasecmp(opt1, opt2) != 0)
  {
    //intensity is soft reset
-    if(!empty_string(opt2))
+    if (!empty_string(opt2))
      options |= (SWITCHER_APPLY_RAWINT | SWITCHER_SOFT_RESET);
  }

  //xintensity -- only if raw intensity not set
-  if(!opt_isset(options, SWITCHER_APPLY_RAWINT))
+  if (!opt_isset(options, SWITCHER_APPLY_RAWINT))
  {
    opt1 = get_pool_setting(pool1->xintensity, default_profile.xintensity);
    opt2 = get_pool_setting(pool2->xintensity, default_profile.xintensity);

    //if different...
-    if(strcasecmp(opt1, opt2) != 0)
+    if (strcasecmp(opt1, opt2) != 0)
    {
      //intensity is soft reset
-      if(!empty_string(opt2))
+      if (!empty_string(opt2))
        options |= (SWITCHER_APPLY_XINT | SWITCHER_SOFT_RESET);
    }
  }

  //intensity -- only if raw intensity and xintensity not set
-  if(!opt_isset(options, SWITCHER_APPLY_RAWINT) && !opt_isset(options, SWITCHER_APPLY_XINT))
+  if (!opt_isset(options, SWITCHER_APPLY_RAWINT) && !opt_isset(options, SWITCHER_APPLY_XINT))
  {
    opt1 = get_pool_setting(pool1->intensity, default_profile.intensity);
    opt2 = get_pool_setting(pool2->intensity, default_profile.intensity);

    //if different...
-    if(strcasecmp(opt1, opt2) != 0)
+    if (strcasecmp(opt1, opt2) != 0)
    {
      //intensity is soft reset
-      if(!empty_string(opt2))
+      if (!empty_string(opt2))
        options |= (SWITCHER_APPLY_INT | SWITCHER_SOFT_RESET);
      //if blank, set default profile to intensity 8 and apply
      else
@ -6203,10 +6329,10 @@ static unsigned long compare_pool_settings(struct pool *pool1, struct pool *pool
  opt1 = get_pool_setting(pool1->shaders, default_profile.shaders);
  opt2 = get_pool_setting(pool2->shaders, default_profile.shaders);

-  if(strcasecmp(opt1, opt2) != 0)
+  if (strcasecmp(opt1, opt2) != 0)
  {
    //shaders is soft reset
-    if(!empty_string(opt2))
+    if (!empty_string(opt2))
      options |= (SWITCHER_APPLY_SHADER | SWITCHER_SOFT_RESET);
  }

@ -6215,7 +6341,7 @@ static unsigned long compare_pool_settings(struct pool *pool1, struct pool *pool
  opt2 = get_pool_setting(pool2->thread_concurrency, default_profile.thread_concurrency);

  //thread-concurrency is soft reset
-  if(strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
+  if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
    options |= (SWITCHER_APPLY_TC | SWITCHER_SOFT_RESET);

  //worksize
@ -6223,45 +6349,45 @@ static unsigned long compare_pool_settings(struct pool *pool1, struct pool *pool
  opt2 = get_pool_setting(pool2->worksize, default_profile.worksize);

  //worksize is soft reset
-  if(strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
-      options |= (SWITCHER_APPLY_WORKSIZE | SWITCHER_SOFT_RESET);
+  if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
+    options |= (SWITCHER_APPLY_WORKSIZE | SWITCHER_SOFT_RESET);

-  #ifdef HAVE_ADL
-    //gpu-engine
-    opt1 = get_pool_setting(pool1->gpu_engine, default_profile.gpu_engine);
-    opt2 = get_pool_setting(pool2->gpu_engine, default_profile.gpu_engine);
+#ifdef HAVE_ADL
+  //gpu-engine
+  opt1 = get_pool_setting(pool1->gpu_engine, default_profile.gpu_engine);
+  opt2 = get_pool_setting(pool2->gpu_engine, default_profile.gpu_engine);

-    if(strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
-      options |= SWITCHER_APPLY_GPU_ENGINE;
+  if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
+    options |= SWITCHER_APPLY_GPU_ENGINE;

-    //gpu-memclock
-    opt1 = get_pool_setting(pool1->gpu_memclock, default_profile.gpu_memclock);
-    opt2 = get_pool_setting(pool2->gpu_memclock, default_profile.gpu_memclock);
+  //gpu-memclock
+  opt1 = get_pool_setting(pool1->gpu_memclock, default_profile.gpu_memclock);
+  opt2 = get_pool_setting(pool2->gpu_memclock, default_profile.gpu_memclock);

-    if(strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
-      options |= SWITCHER_APPLY_GPU_MEMCLOCK;
+  if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
+    options |= SWITCHER_APPLY_GPU_MEMCLOCK;

-    //GPU fans
-    opt1 = get_pool_setting(pool1->gpu_fan, default_profile.gpu_fan);
-    opt2 = get_pool_setting(pool2->gpu_fan, default_profile.gpu_fan);
+  //GPU fans
+  opt1 = get_pool_setting(pool1->gpu_fan, default_profile.gpu_fan);
+  opt2 = get_pool_setting(pool2->gpu_fan, default_profile.gpu_fan);

-    if(strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
-      options |= SWITCHER_APPLY_GPU_FAN;
+  if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
+    options |= SWITCHER_APPLY_GPU_FAN;

-    //GPU powertune
-    opt1 = get_pool_setting(pool1->gpu_powertune, default_profile.gpu_powertune);
-    opt2 = get_pool_setting(pool2->gpu_powertune, default_profile.gpu_powertune);
+  //GPU powertune
+  opt1 = get_pool_setting(pool1->gpu_powertune, default_profile.gpu_powertune);
+  opt2 = get_pool_setting(pool2->gpu_powertune, default_profile.gpu_powertune);

-    if(strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
-      options |= SWITCHER_APPLY_GPU_POWERTUNE;
+  if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
+    options |= SWITCHER_APPLY_GPU_POWERTUNE;

-    //GPU vddc
-    opt1 = get_pool_setting(pool1->gpu_vddc, default_profile.gpu_vddc);
-    opt2 = get_pool_setting(pool2->gpu_vddc, default_profile.gpu_vddc);
+  //GPU vddc
+  opt1 = get_pool_setting(pool1->gpu_vddc, default_profile.gpu_vddc);
+  opt2 = get_pool_setting(pool2->gpu_vddc, default_profile.gpu_vddc);

-    if(strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
-      options |= SWITCHER_APPLY_GPU_VDDC;
-  #endif
+  if (strcasecmp(opt1, opt2) != 0 && !empty_string(opt2))
+    options |= SWITCHER_APPLY_GPU_VDDC;
+#endif

  // Remove soft reset if hard reset is set
  if (opt_isset(options, SWITCHER_HARD_RESET) &&
@ -6281,6 +6407,8 @@ static void get_work_prepare_thread(struct thr_info *mythr, struct work *work)
 {
  int i;

+  applog(LOG_DEBUG, "get_work_prepare_thread()");
+
  //if switcher is disabled
  if(opt_switchmode == SWITCH_OFF)
    return;
@ -6608,6 +6736,7 @@ struct work *get_work(struct thr_info *thr, const int thr_id)
    }
  }

+  applog(LOG_DEBUG, "preparing thread...");
  get_work_prepare_thread(thr, work);

  diff_t = time(NULL) - diff_t;
@ -6700,7 +6829,16 @@ bool test_nonce(struct work *work, uint32_t nonce)
  uint32_t diff1targ;

  rebuild_nonce(work, nonce);
-  diff1targ = work->pool->algorithm.diff1targ;
+
+  applog(LOG_DEBUG, "test_nonce() algorithm = %s", work->pool->algorithm.name);
+
+  // for Neoscrypt, the diff1targe value is in work->target
+  if ((work->pool->algorithm.name, "neoscrypt")) {
+    diff1targ = ((uint32_t *)work->target)[7];
+  }
+  else {
+    diff1targ = work->pool->algorithm.diff1targ;
+  }

  return (le32toh(*hash_32) <= diff1targ);
 }
--- a/winbuild/sgminer.vcxproj
+++ b/winbuild/sgminer.vcxproj
@ -263,6 +263,7 @@
    <ClCompile Include="..\algorithm.c" />
    <ClCompile Include="..\algorithm\animecoin.c" />
    <ClCompile Include="..\algorithm\bitblock.c" />
+    <ClCompile Include="..\algorithm\neoscrypt.c" />
    <ClCompile Include="..\algorithm\talkcoin.c" />
    <ClCompile Include="..\algorithm\x14.c" />
    <ClCompile Include="..\algorithm\fresh.c" />
@ -321,6 +322,7 @@
    <ClInclude Include="..\algorithm.h" />
    <ClInclude Include="..\algorithm\animecoin.h" />
    <ClInclude Include="..\algorithm\bitblock.h" />
+    <ClInclude Include="..\algorithm\neoscrypt.h" />
    <ClInclude Include="..\algorithm\talkcoin.h" />
    <ClInclude Include="..\algorithm\x14.h" />
    <ClInclude Include="..\algorithm\fresh.h" />
--- a/winbuild/sgminer.vcxproj.filters
+++ b/winbuild/sgminer.vcxproj.filters
@ -197,6 +197,9 @@
    <ClCompile Include="..\algorithm.c">
      <Filter>Source Files</Filter>
    </ClCompile>
+    <ClCompile Include="..\algorithm\neoscrypt.c">
+      <Filter>Source Files\algorithm</Filter>
+    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="..\adl.h">
@ -373,6 +376,9 @@
    <ClInclude Include="..\sph\sph_shabal.h">
      <Filter>Header Files\sph</Filter>
    </ClInclude>
+    <ClInclude Include="..\algorithm\neoscrypt.h">
+      <Filter>Header Files\algorithm</Filter>
+    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <None Include="README.txt" />