add neoscrypt based on djm34 work

indent, link --intensity, and some clean up Tested speed on linux ~= 160kH/s on a 750Ti (Black Edition) To be continued...
10 years ago · 65c1d787e7
11 changed files with 2852 additions and 2 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -41,6 +41,7 @@ ccminer_SOURCES	= elist.h miner.h compat.h \
 			  quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu \
 			  quark/quarkcoin.cu quark/animecoin.cu \
 			  quark/cuda_quark_compactionTest.cu  \
 			  neoscrypt.cu neoscrypt/neoscrypt.c neoscrypt/cuda_neoscrypt.cu \
 			  cuda_nist5.cu pentablake.cu skein.cu skein2.cu zr5.cu \
 			  sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \
 			  sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
--- a/README.txt
+++ b/README.txt
@ -1,5 +1,5 @@
-ccMiner release 1.6.2-tpruvot (Apr 2015) - "Scrypt/N/Jane algos"
+ccMiner release 1.6.3-tpruvot (May 2015) - "Neoscrypt"
 ---------------------------------------------------------------
 ***************************************************************
@ -37,6 +37,7 @@ Deep, Doom and Qubit
 Keccak (Maxcoin)
 Pentablake (Blake 512 x5)
 1Coin Triple S
 Neoscrypt (FeatherCoin)
 Scrypt and Scrypt:N
 Scrypt-Jane (Chacha)
 Skein (Skein + SHA)
@ -76,6 +77,7 @@ its command line interface and options.
                          lyra2       use to mine Vertcoin
                          mjollnir    use to mine Mjollnircoin
                          myr-gr      use to mine Myriad-Groest
                          neoscrypt   use to mine FeatherCoin
                          nist5       use to mine TalkCoin
                          penta       use to mine Joincoin / Pentablake
                          pluck       use to mine Supcoin
@ -211,6 +213,9 @@ features.
 >>> RELEASE HISTORY <<<
  Not released!!  v1.6.3
                  Import Neoscrypt from djm34 work
  Apr. 21th 2015  v1.6.2
                  Import Scrypt, Scrypt:N and Scrypt-jane from Cudaminer
                  Add the --time-limit command line parameter
--- a/ccminer.cpp
+++ b/ccminer.cpp
@ -98,6 +98,7 @@ enum sha_algos {
 	ALGO_LYRA2,
 	ALGO_MJOLLNIR,		/* Hefty hash */
 	ALGO_MYR_GR,
 	ALGO_NEOSCRYPT,
 	ALGO_NIST5,
 	ALGO_PENTABLAKE,
 	ALGO_PLUCK,
@ -135,6 +136,7 @@ static const char *algo_names[] = {
 	"lyra2",
 	"mjollnir",
 	"myr-gr",
 	"neoscrypt",
 	"nist5",
 	"penta",
 	"pluck",
@ -273,6 +275,7 @@ Options:\n\
 			lyra2       VertCoin\n\
 			mjollnir    Mjollnircoin\n\
 			myr-gr      Myriad-Groestl\n\
 			neoscrypt   use to mine FeatherCoin\n\
 			nist5       NIST5 (TalkCoin)\n\
 			penta       Pentablake hash (5x Blake 512)\n\
 			pluck       SupCoin\n\
@ -537,7 +540,7 @@ static bool work_decode(const json_t *val, struct work *work)
 	int adata_sz = ARRAY_SIZE(work->data), atarget_sz = ARRAY_SIZE(work->target);
 	int i;
-	if (opt_algo == ALGO_ZR5) {
+	if (opt_algo == ALGO_NEOSCRYPT || opt_algo == ALGO_ZR5) {
 		data_size = 80; adata_sz = 20;
 	}
@ -1241,6 +1244,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 	switch (opt_algo) {
 		case ALGO_JACKPOT:
 		case ALGO_NEOSCRYPT:
 		case ALGO_PLUCK:
 		case ALGO_SCRYPT:
 		case ALGO_SCRYPT_JANE:
@ -1472,6 +1476,7 @@ static void *miner_thread(void *userdata)
 				minmax = 0x400000;
 				break;
 			case ALGO_LYRA2:
 			case ALGO_NEOSCRYPT:
 			case ALGO_SCRYPT:
 			case ALGO_SCRYPT_JANE:
 				minmax = 0x100000;
@ -1599,6 +1604,11 @@ static void *miner_thread(void *userdata)
 			                      max_nonce, &hashes_done);
 			break;
 		case ALGO_NEOSCRYPT:
 			rc = scanhash_neoscrypt(thr_id, work.data, work.target,
 			                      max_nonce, &hashes_done);
 			break;
 		case ALGO_NIST5:
 			rc = scanhash_nist5(thr_id, work.data, work.target,
 			                      max_nonce, &hashes_done);
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -265,6 +265,7 @@
    <ClCompile Include="myriadgroestl.cpp" />
    <ClCompile Include="lyra2\Lyra2.c" />
    <ClCompile Include="lyra2\Sponge.c" />
    <ClCompile Include="neoscrypt\neoscrypt.c" />
    <ClCompile Include="sph\aes_helper.c" />
    <ClCompile Include="sph\blake.c" />
    <ClCompile Include="sph\bmw.c" />
@ -435,6 +436,8 @@
      <AdditionalOptions Condition="'$(Configuration)'=='Release'">-Xptxas "-abi=yes" %(AdditionalOptions)</AdditionalOptions>
      <AdditionalOptions Condition="'$(Configuration)'=='Debug'">-Xptxas "-abi=yes" %(AdditionalOptions)</AdditionalOptions>
    </CudaCompile>
    <CudaCompile Include="neoscrypt.cu" />
    <CudaCompile Include="neoscrypt\cuda_neoscrypt.cu" />
    <CudaCompile Include="pluck\pluck.cu" />
    <CudaCompile Include="pluck\cuda_pluck.cu" />
    <CudaCompile Include="quark\cuda_quark_groestl512.cu">
--- a/miner.h
+++ b/miner.h
@ -324,6 +324,9 @@ extern int scanhash_lyra2(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
 extern int scanhash_neoscrypt(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_nist5(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
@ -689,6 +692,7 @@ unsigned int jackpothash(void *state, const void *input);
 void groestlhash(void *state, const void *input);
 void lyra2_hash(void *state, const void *input);
 void myriadhash(void *state, const void *input);
 void neoscrypt(const uchar *password, uchar *output, uint profile);
 void nist5hash(void *state, const void *input);
 void pentablakehash(void *output, const void *input);
 void pluckhash(uint32_t *hash, const uint32_t *data, uchar *hashbuffer, const int N);
--- a/neoscrypt.cu
+++ b/neoscrypt.cu
@ -0,0 +1,83 @@
 extern "C" {
 #include "neoscrypt/neoscrypt.h"
 }
 #include "cuda_helper.h"
 #include "miner.h"
 static uint32_t *d_hash[MAX_GPUS] ;
 extern void neoscrypt_setBlockTarget(uint32_t * data, const void *ptarget);
 extern void neoscrypt_cpu_init(int thr_id, uint32_t threads, uint32_t* hash);
 extern uint32_t neoscrypt_cpu_hash_k4(int stratum, int thr_id, uint32_t threads, uint32_t startNounce, int order);
 #define SHIFT 130
 int scanhash_neoscrypt(int thr_id, uint32_t *pdata, const uint32_t *ptarget, uint32_t max_nonce, unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];
 	const int stratum = have_stratum;
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;
 	int intensity = is_windows() ? 18 : 19;
 	uint32_t throughput = device_intensity(thr_id, __func__, 1U << intensity);
 	throughput = throughput / 32; /* set for max intensity ~= 20 */
 	throughput = min(throughput, max_nonce - first_nonce + 1);
 	static bool init[MAX_GPUS] = { 0 };
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
 		cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 32 * SHIFT * sizeof(uint64_t) * throughput));
 		neoscrypt_cpu_init(thr_id, throughput, d_hash[thr_id]);
 		applog(LOG_INFO, "Using %d cuda threads", throughput);
 		init[thr_id] = true;
 	}
 	uint32_t endiandata[20];
 	if (stratum) {
 		for (int k = 0; k < 20; k++)
 			be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
 	} else {
 		for (int k = 0; k < 20; k++)
 			endiandata[k] = pdata[k];
 	}
 	neoscrypt_setBlockTarget(endiandata,ptarget);
 	do {
 		uint32_t foundNonce = neoscrypt_cpu_hash_k4(stratum, thr_id, throughput, pdata[19], 0);
 		if (foundNonce != UINT32_MAX)
 		{
 			uint32_t _ALIGN(64) vhash64[8];
 			*hashes_done = pdata[19] - first_nonce + 1;
 			if (stratum) {
 				be32enc(&endiandata[19], foundNonce);
 			} else {
 				endiandata[19] = foundNonce;
 			}
 			neoscrypt((uchar*) endiandata, (uchar*)vhash64, 0x80000620);
 			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
 				pdata[19] = foundNonce;
 				return 1;
 			} else {
 				applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", device_map[thr_id], foundNonce);
 			}
 		}
 		pdata[19] += throughput;
 	} while (!work_restart[thr_id].restart && ((uint64_t)max_nonce > ((uint64_t)(pdata[19]) + (uint64_t)throughput)));
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
--- a/neoscrypt/cuda_neoscrypt.cu
+++ b/neoscrypt/cuda_neoscrypt.cu
@ -0,0 +1,607 @@
 #include <stdio.h>
 #include <memory.h>
 #include "cuda_helper.h"
 #include "cuda_vectors.h"
 __device__  uint4 *  W;
 uint32_t *d_NNonce[MAX_GPUS];
 uint32_t *d_nnounce[MAX_GPUS];
 __constant__  uint32_t pTarget[8];
 __constant__  uint32_t key_init[16];
 __constant__  uint32_t input_init[16];
 __constant__  uint32_t  c_data[80];
 #define SALSA_SMALL_UNROLL 1
 #define CHACHA_SMALL_UNROLL 1
 #define BLAKE2S_BLOCK_SIZE    64U
 #define BLAKE2S_OUT_SIZE      32U
 #define BLAKE2S_KEY_SIZE      32U
 #define BLOCK_SIZE            64U
 #define FASTKDF_BUFFER_SIZE  256U
 #define PASSWORD_LEN          80U
 /// constants ///
 static const __constant__  uint8 BLAKE2S_IV_Vec =
 	{
 		0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
 		0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
 	};
 static const  uint8 BLAKE2S_IV_Vechost =
 {
 	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
 	0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
 };
 static const uint32_t BLAKE2S_SIGMA_host[10][16] =
 {
 	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
 	{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
 	{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
 	{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
 	{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
 	{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
 	{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
 	{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
 	{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
 	{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
 };
 __constant__ uint32_t BLAKE2S_SIGMA[10][16];
 // Blake2S
 #define BLAKE2S_BLOCK_SIZE    64U
 #define BLAKE2S_OUT_SIZE      32U
 #define BLAKE2S_KEY_SIZE      32U
 #if __CUDA_ARCH__ >= 500
 #define BLAKE_G(idx0, idx1, a, b, c, d, key) { \
 	idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \
 	a += b; d = __byte_perm(d^a,0, 0x1032); \
 	c += d; b = rotateR(b^c, 12); \
 	idx = BLAKE2S_SIGMA[idx0][idx1+1]; a += key[idx]; \
 	a += b; d = __byte_perm(d^a,0, 0x0321); \
 	c += d; b = rotateR(b^c, 7); \
 }
 #else
 #define BLAKE_G(idx0, idx1, a, b, c, d, key) { \
 	idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \
 	a += b; d = rotate(d^a,16); \
 	c += d; b = rotateR(b^c, 12); \
 	idx = BLAKE2S_SIGMA[idx0][idx1+1]; a += key[idx]; \
 	a += b; d = rotateR(d^a,8); \
 	c += d; b = rotateR(b^c, 7); \
 }
 #endif
 #define ROTL32(x, n) ((x) << (n)) | ((x) >> (32 - (n)))
 #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
 #define BLAKE_Ghost(idx0, idx1, a, b, c, d, key) { \
 	idx = BLAKE2S_SIGMA_host[idx0][idx1]; a += key[idx]; \
 	a += b; d = ROTR32(d^a,16); \
 	c += d; b = ROTR32(b^c, 12); \
 	idx = BLAKE2S_SIGMA_host[idx0][idx1+1]; a += key[idx]; \
 	a += b; d = ROTR32(d^a,8); \
 	c += d; b = ROTR32(b^c, 7); \
 }
 static __forceinline__ __device__ void Blake2S(uint32_t * inout, const uint32_t * TheKey)
 {
 	uint16 V;
 	uint32_t idx;
 	uint8 tmpblock;
 	V.hi = BLAKE2S_IV_Vec;
 	V.lo = BLAKE2S_IV_Vec;
 	V.lo.s0 ^= 0x01012020;
 	// Copy input block for later
 	tmpblock = V.lo;
 	V.hi.s4 ^= BLAKE2S_BLOCK_SIZE;
 	for (int x = 0; x < 10; ++x)
 	{
 		BLAKE_G(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey);
 		BLAKE_G(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey);
 		BLAKE_G(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey);
 		BLAKE_G(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey);
 		BLAKE_G(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey);
 		BLAKE_G(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey);
 		BLAKE_G(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey);
 		BLAKE_G(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey);
 	}
 	V.lo ^= V.hi;
 	V.lo ^= tmpblock;
 	V.hi = BLAKE2S_IV_Vec;
 	tmpblock = V.lo;
 	V.hi.s4 ^= 128;
 	V.hi.s6 = ~V.hi.s6;
 	for (int x = 0; x < 10; ++x)
 	{
 		BLAKE_G(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout);
 		BLAKE_G(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout);
 		BLAKE_G(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout);
 		BLAKE_G(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout);
 		BLAKE_G(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout);
 		BLAKE_G(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout);
 		BLAKE_G(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout);
 		BLAKE_G(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout);
 	}
 	V.lo ^= V.hi ^ tmpblock;
 	((uint8*)inout)[0]=V.lo;
 }
 static __forceinline__ __host__ void Blake2Shost(uint32_t * inout, const uint32_t * inkey)
 {
 	uint16 V;
 	uint32_t idx;
 	uint8 tmpblock;
 	V.hi = BLAKE2S_IV_Vechost;
 	V.lo = BLAKE2S_IV_Vechost;
 	V.lo.s0 ^= 0x01012020;
 	// Copy input block for later
 	tmpblock = V.lo;
 	V.hi.s4 ^= BLAKE2S_BLOCK_SIZE;
 	for (int x = 0; x < 10; ++x)
 	{
 		BLAKE_Ghost(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inkey);
 		BLAKE_Ghost(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inkey);
 		BLAKE_Ghost(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inkey);
 		BLAKE_Ghost(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inkey);
 		BLAKE_Ghost(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inkey);
 		BLAKE_Ghost(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inkey);
 		BLAKE_Ghost(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inkey);
 		BLAKE_Ghost(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inkey);
 	}
 	V.lo ^= V.hi;
 	V.lo ^= tmpblock;
 	V.hi = BLAKE2S_IV_Vechost;
 	tmpblock = V.lo;
 	V.hi.s4 ^= 128;
 	V.hi.s6 = ~V.hi.s6;
 	for (int x = 0; x < 10; ++x)
 	{
 		BLAKE_Ghost(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout);
 		BLAKE_Ghost(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout);
 		BLAKE_Ghost(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout);
 		BLAKE_Ghost(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout);
 		BLAKE_Ghost(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout);
 		BLAKE_Ghost(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout);
 		BLAKE_Ghost(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout);
 		BLAKE_Ghost(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout);
 	}
 	V.lo ^= V.hi ^ tmpblock;
 	((uint8*)inout)[0] = V.lo;
 }
 static __forceinline__ __device__ void fastkdf256(const uint32_t* password, uint8_t* output)
 {
 	uint8_t bufidx = 0;
 	uchar4 bufhelper;
 	uint8_t A[320],B[288];
 	((uintx64*)A)[0] = ((uintx64*)password)[0];
 	((uint816 *)A)[4] =  ((uint816 *)password)[0];
 	((uintx64*)B)[0] = ((uintx64*)password)[0];
 	((uint48 *)B)[8] = ((uint48 *)password)[0];
 	uint32_t input[BLAKE2S_BLOCK_SIZE/4]; uint32_t key[BLAKE2S_BLOCK_SIZE / 4] = { 0 };
 	((uint816*)input)[0] = ((uint816*)input_init)[0];
 	((uint48*)key)[0] = ((uint48*)key_init)[0];
 	for (int i = 0; i < 32; ++i)
 	{
 		bufhelper = ((uchar4*)input)[0];
 		for (int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x)
 			bufhelper += ((uchar4*)input)[x];
 		bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w;
 		int qbuf = bufidx/4;
 		int rbuf = bufidx&3;
 		int bitbuf = rbuf << 3;
 		 uint32_t shifted[9];
 		shift256R2(shifted, ((uint8*)input)[0], bitbuf);
 		for (int k = 0; k < 9; ++k) {
 			((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k];
 		}
 		if (bufidx < BLAKE2S_KEY_SIZE)                          {((uint8*)B)[8] = ((uint8*)B)[0];}
 		else if (bufidx > FASTKDF_BUFFER_SIZE-BLAKE2S_OUT_SIZE) {((uint8*)B)[0] = ((uint8*)B)[8];}
 		if (i<31) {
 			for (int k = 0; k <BLAKE2S_BLOCK_SIZE / 4; k++) {
 				((uchar4*)(input))[k] = make_uchar4(
 					(A + bufidx)[4 * k], (A + bufidx)[4 * k + 1],
 					(A + bufidx)[4 * k + 2], (A + bufidx)[4 * k + 3]
 				);
 			}
 			for (int k = 0; k <BLAKE2S_KEY_SIZE / 4; k++) {
 				((uchar4*)(key))[k] = make_uchar4(
 					(B + bufidx)[4 * k], (B + bufidx)[4 * k + 1],
 					(B + bufidx)[4 * k + 2], (B + bufidx)[4 * k + 3]
 				);
 			}
 			Blake2S((uint32_t*)input, key);
 		}
 	}
 	int left = FASTKDF_BUFFER_SIZE - bufidx;
 	int qleft =left/4;
 	int rleft =left&3;
 	for (int k = 0; k < qleft; ++k) {
 		((uchar4*)output)[k] = make_uchar4(
 			(B + bufidx)[4 * k], (B + bufidx)[4 * k + 1],
 			(B + bufidx)[4 * k + 2], (B + bufidx)[4 * k + 3]
 		) ^ ((uchar4*)A)[k];
 	}
 	for (int i = 4*qleft; i < 4*qleft+rleft; ++i) {
 		output[i] = (B + bufidx)[i] ^ A[i];
 	}
 	for (int i = qleft*4+rleft; i < (qleft+1)*4; ++i) {
 		((uint8_t *)output)[i] = ((uint8_t *)B)[i - left] ^ ((uint8_t *)A)[i];
 	}
 	for (int i = qleft+1; i < FASTKDF_BUFFER_SIZE/4; ++i) {
 		((uchar4 *)output)[i] = make_uchar4(B[4*i - left],B[4*i+1-left],
 		 B[4*i+2-left],B[4*i+3-left]) ^ ((uchar4 *)A)[i];
 	}
 }
 static __forceinline__ __device__ void fastkdf32( const uint32_t * password, const uint32_t * salt, uint32_t * output)
 {
 	uint8_t bufidx = 0;
 	uchar4 bufhelper;
 	uint8_t A[320];
 	uint8_t B[288];
 	// Initialize the password buffer
 	((uintx64*)A)[0] = ((uintx64*)password)[0];
 	((uint816*)A)[4] = ((uint816*)password)[0];
 	((uintx64*)B)[0] = ((uintx64*)salt)[0];
 	((uintx64*)B)[1] = ((uintx64*)salt)[0];
 	uint32_t input[BLAKE2S_BLOCK_SIZE/4];
 	uint32_t key[BLAKE2S_BLOCK_SIZE/4] = { 0 };
 	((uint816*)input)[0] = ((uint816*)password)[0];
 	((uint48*)key)[0] = ((uint48*)salt)[0];
 	for (int i = 0; i < 32; ++i)
 	{
 		Blake2S((uint32_t*)input, key);
 		bufidx = 0;
 		bufhelper = ((uchar4*)input)[0];
 		for (int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) { bufhelper += ((uchar4*)input)[x]; }
 		bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w;
 		int qbuf = bufidx / 4;
 		int rbuf = bufidx & 3;
 		int bitbuf = rbuf << 3;
 		uint32_t shifted[9];
 		shift256R2(shifted, ((uint8*)input)[0], bitbuf);
 		for (int k = 0; k < 9; ++k) {
 			((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k];
 		}
 		if (i<31){
 		if (bufidx < BLAKE2S_KEY_SIZE)                            {((uint8*)B)[8] = ((uint8*)B)[0];}
 		else if (bufidx > FASTKDF_BUFFER_SIZE - BLAKE2S_OUT_SIZE) {((uint8*)B)[0] = ((uint8*)B)[8];}
 //		MyUnion Test;
 		for (uint8_t k = 0; k <BLAKE2S_BLOCK_SIZE/4 ; k++) {
 	((uchar4*)(input))[k] =
 	make_uchar4((A + bufidx)[4 * k], (A + bufidx)[4 * k + 1], (A + bufidx)[4 * k + 2], (A + bufidx)[4 * k + 3]);
 		}
 		for (uint8_t k = 0; k <BLAKE2S_KEY_SIZE / 4; k++) {
 	((uchar4*)(key))[k] =
 	make_uchar4((B + bufidx)[4 * k], (B + bufidx)[4 * k + 1], (B + bufidx)[4 * k + 2], (B + bufidx)[4 * k + 3]);
 		}
 		}
 	}
 	uchar4 unfucked[1];
 	unfucked[0] = make_uchar4(B[28 + bufidx], B[29 + bufidx],B[30 + bufidx], B[31 + bufidx]);
 	((uint32_t*)output)[7] = ((uint32_t*)unfucked)[0] ^ ((uint32_t*)A)[7];
 }
 #define SALSA(a,b,c,d) { \
    t =a+d; b^=rotate(t,  7);    \
    t =b+a; c^=rotate(t,  9);    \
    t =c+b; d^=rotate(t, 13);    \
    t =d+c; a^=rotate(t, 18);     \
 }
 #define SALSA_CORE(state) { \
    SALSA(state.s0,state.s4,state.s8,state.sc); \
    SALSA(state.s5,state.s9,state.sd,state.s1); \
    SALSA(state.sa,state.se,state.s2,state.s6); \
    SALSA(state.sf,state.s3,state.s7,state.sb); \
    SALSA(state.s0,state.s1,state.s2,state.s3); \
    SALSA(state.s5,state.s6,state.s7,state.s4); \
    SALSA(state.sa,state.sb,state.s8,state.s9); \
    SALSA(state.sf,state.sc,state.sd,state.se); \
 }
 #if __CUDA_ARCH__ >=500
 #define CHACHA_STEP(a,b,c,d) { \
    a += b; d = __byte_perm(d^a,0,0x1032); \
    c += d; b = rotate(b^c, 12); \
    a += b; d = __byte_perm(d^a,0,0x2103); \
    c += d; b = rotate(b^c, 7); \
 }
 #else
 #define CHACHA_STEP(a,b,c,d) { \
    a += b; d = rotate(d^a,16); \
    c += d; b = rotate(b^c, 12); \
    a += b; d = rotate(d^a,8); \
    c += d; b = rotate(b^c, 7); \
 }
 #endif
 #define CHACHA_CORE_PARALLEL(state) { \
    CHACHA_STEP(state.lo.s0, state.lo.s4, state.hi.s0, state.hi.s4); \
    CHACHA_STEP(state.lo.s1, state.lo.s5, state.hi.s1, state.hi.s5); \
    CHACHA_STEP(state.lo.s2, state.lo.s6, state.hi.s2, state.hi.s6); \
    CHACHA_STEP(state.lo.s3, state.lo.s7, state.hi.s3, state.hi.s7); \
    CHACHA_STEP(state.lo.s0, state.lo.s5, state.hi.s2, state.hi.s7); \
    CHACHA_STEP(state.lo.s1, state.lo.s6, state.hi.s3, state.hi.s4); \
    CHACHA_STEP(state.lo.s2, state.lo.s7, state.hi.s0, state.hi.s5); \
    CHACHA_STEP(state.lo.s3, state.lo.s4, state.hi.s1, state.hi.s6); \
 }
 static __forceinline__ __device__ uint16 salsa_small_scalar_rnd(const uint16 &X)
 {
 	uint16 state = X;
 	uint32_t t;
 	for (int i = 0; i < 10; ++i) { SALSA_CORE(state);}
 	return(X + state);
 }
 static __device__ __forceinline__ uint16 chacha_small_parallel_rnd(const uint16 &X)
 {
 	uint16 st = X;
 	for (int i = 0; i < 10; ++i) {CHACHA_CORE_PARALLEL(st);}
 	return(X + st);
 }
 static __device__ __forceinline__ void neoscrypt_chacha(uint16 *XV)
 {
 	XV[0] ^= XV[3];
 	uint16 temp;
 	XV[0] = chacha_small_parallel_rnd(XV[0]); XV[1] ^= XV[0];
 	 temp = chacha_small_parallel_rnd(XV[1]); XV[2] ^= temp;
 	XV[1] = chacha_small_parallel_rnd(XV[2]); XV[3] ^= XV[1];
 	XV[3] = chacha_small_parallel_rnd(XV[3]);
 	XV[2] = temp;
 }
 static __device__ __forceinline__ void neoscrypt_salsa(uint16 *XV)
 {
 	XV[0] ^= XV[3];
 	uint16 temp;
 	XV[0] = salsa_small_scalar_rnd(XV[0]); XV[1] ^= XV[0];
 	 temp = salsa_small_scalar_rnd(XV[1]); XV[2] ^= temp;
 	XV[1] = salsa_small_scalar_rnd(XV[2]); XV[3] ^= XV[1];
 	XV[3] = salsa_small_scalar_rnd(XV[3]);
 	XV[2] = temp;
 }
 #define SHIFT 130
 __global__ __launch_bounds__(128, 1)
 void neoscrypt_gpu_hash_k0(int stratum, uint32_t threads, uint32_t startNonce)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	uint32_t shift = SHIFT * 16 * thread;
 //	if (thread < threads)
 	{
 		uint32_t data[80];
 		uint16 X[4];
 		const uint32_t nonce = startNonce + thread;
 		for (int i = 0; i<20; i++) {
 			((uint4*)data)[i] = ((uint4 *)c_data)[i];
 		}  //ld.local.v4
 		data[19] = (stratum) ? cuda_swab32(nonce) : nonce; //freaking morons !!!
 		data[39] = data[19];
 		data[59] = data[19];
 		fastkdf256(data, (uint8_t*)X);
 		((uintx64 *)(W + shift))[0] = ((uintx64 *)X)[0];
 //		((ulonglong16 *)(W + shift))[0] = ((ulonglong16 *)X)[0];
 	}
 }
 __global__ __launch_bounds__(128, 1)
 void neoscrypt_gpu_hash_k01(uint32_t threads, uint32_t startNonce)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	uint32_t shift = SHIFT * 16 * thread;
 //	if (thread < threads)
 	{
 		uint16 X[4];
 		((uintx64 *)X)[0]= __ldg32(&(W + shift)[0]);
 		//#pragma unroll
 		for (int i = 0; i < 128; ++i)
 		{
 			neoscrypt_chacha(X);
 			((ulonglong16 *)(W + shift))[i+1] = ((ulonglong16 *)X)[0];
 //			((uintx64 *)(W + shift))[i + 1] = ((uintx64 *)X)[0];
 		}
 	}
 }
 __global__ __launch_bounds__(128, 1)
 void neoscrypt_gpu_hash_k2(uint32_t threads, uint32_t startNonce)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	uint32_t shift = SHIFT * 16 * thread;
 //	if (thread < threads)
 	{
 		uint16 X[4];
 		((uintx64 *)X)[0] = __ldg32(&(W + shift)[2048]);
 		for (int t = 0; t < 128; t++)
 		{
 			int idx = X[3].lo.s0 & 0x7F;
 			((uintx64 *)X)[0] ^= __ldg32(&(W + shift)[idx << 4]);
 			neoscrypt_chacha(X);
 		}
 		((uintx64 *)(W + shift))[129] = ((uintx64*)X)[0];  // best checked
 	}
 }
 __global__ __launch_bounds__(128, 1)
 void neoscrypt_gpu_hash_k3(uint32_t threads, uint32_t startNonce)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 //	if (thread < threads)
 	{
 		uint32_t shift = SHIFT * 16 * thread;
 		uint16 Z[4];
 		((uintx64*)Z)[0] = __ldg32(&(W + shift)[0]);
 		//#pragma unroll
 		for (int i = 0; i < 128; ++i) {
 			neoscrypt_salsa(Z);
 			((ulonglong16 *)(W + shift))[i+1] = ((ulonglong16 *)Z)[0];
 //			((uintx64 *)(W + shift))[i + 1] = ((uintx64 *)Z)[0];
 		}
 	}
 }
 __global__ __launch_bounds__(128, 1)
 void neoscrypt_gpu_hash_k4(int stratum, uint32_t threads, uint32_t startNonce, uint32_t *nonceVector)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 //	if (thread < threads)
 	{
 		const uint32_t nonce = startNonce + thread;
 		uint32_t shift = SHIFT * 16 * thread;
 		uint16 Z[4];
 		uint32_t outbuf[8];
 		uint32_t data[80];
 		for (int i=0; i<20; i++) {
 			((uint4*)data)[i] = ((uint4 *)c_data)[i];
 		}
 		data[19] = (stratum) ? cuda_swab32(nonce) : nonce;
 		data[39] = data[19];
 		data[59] = data[19];
 		((uintx64 *)Z)[0] = __ldg32(&(W + shift)[2048]);
 		for (int t = 0; t < 128; t++)
 		{
 			int idx = Z[3].lo.s0 & 0x7F;
 			((uintx64 *)Z)[0] ^= __ldg32(&(W + shift)[idx << 4]);
 			neoscrypt_salsa(Z);
 		}
 		((uintx64 *)Z)[0] ^= __ldg32(&(W + shift)[2064]);
 		fastkdf32(data, (uint32_t*)Z, outbuf);
 		if (outbuf[7] <= pTarget[7]) {
 			uint32_t tmp = atomicExch(&nonceVector[0], nonce);
 		}
 	}
 }
 void neoscrypt_cpu_init(int thr_id, uint32_t threads, uint32_t *hash)
 {
 	cudaMemcpyToSymbol(BLAKE2S_SIGMA, BLAKE2S_SIGMA_host, sizeof(BLAKE2S_SIGMA_host), 0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(W, &hash, sizeof(hash), 0, cudaMemcpyHostToDevice);
 	cudaMalloc(&d_NNonce[thr_id], sizeof(uint32_t));
 }
 __host__
 uint32_t neoscrypt_cpu_hash_k4(int stratum, int thr_id, uint32_t threads, uint32_t startNounce, int order)
 {
 	uint32_t result[MAX_GPUS] = { 0xffffffff };
 	cudaMemset(d_NNonce[thr_id], 0xff, sizeof(uint32_t));
 	const uint32_t threadsperblock = 128;
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 //	neoscrypt_gpu_hash_orig << <grid, block >> >(threads, startNounce, d_NNonce[thr_id]);
 	neoscrypt_gpu_hash_k0  << <grid, block >> >(stratum,threads, startNounce);
 	neoscrypt_gpu_hash_k01 << <grid, block >> >(threads, startNounce);
 	neoscrypt_gpu_hash_k2  << <grid, block >> >(threads, startNounce);
 	neoscrypt_gpu_hash_k3  << <grid, block >> >(threads, startNounce);
 	neoscrypt_gpu_hash_k4  << <grid, block >> >(stratum,threads, startNounce, d_NNonce[thr_id]);
 	MyStreamSynchronize(NULL, order, thr_id);
 	cudaMemcpy(&result[thr_id], d_NNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 	return result[thr_id];
 }
 __host__
 void neoscrypt_setBlockTarget(uint32_t* pdata, const void *target)
 {
 	unsigned char PaddedMessage[80*4]; //bring balance to the force
 	uint32_t input[16], key[16] = { 0 };
 	memcpy(PaddedMessage,     pdata, 80);
 	memcpy(PaddedMessage + 80, pdata, 80);
 	memcpy(PaddedMessage + 160, pdata, 80);
 	memcpy(PaddedMessage + 240, pdata, 80);
 	((uint16*)input)[0] = ((uint16*)pdata)[0];
 	((uint8*)key)[0] = ((uint8*)pdata)[0];
 //	for (int i = 0; i<10; i++) { printf(" pdata/input %d %08x %08x \n",i,pdata[2*i],pdata[2*i+1]); }
 	Blake2Shost(input,key);
 	cudaMemcpyToSymbol(pTarget, target, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(input_init, input, 16 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(key_init, key, 16 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(c_data, PaddedMessage, 40 * sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
 }
--- a/neoscrypt/cuda_vectors.h
+++ b/neoscrypt/cuda_vectors.h
--- a/neoscrypt/neoscrypt.c
+++ b/neoscrypt/neoscrypt.c
@ -0,0 +1,992 @@
 /*
 * Copyright (c) 2009 Colin Percival, 2011 ArtForz
 * Copyright (c) 2012 Andrew Moon (floodyberry)
 * Copyright (c) 2012 Samuel Neves <sneves@dei.uc.pt>
 * Copyright (c) 2014 John Doering <ghostlander@phoenixcoin.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
 #include <stdlib.h>
 #include <stdint.h>
 #include <string.h>
 #include "neoscrypt.h"
 #if (WINDOWS)
 /* sizeof(unsigned long) = 4 for MinGW64 */
 typedef unsigned long long ulong;
 #else
 typedef unsigned long ulong;
 #endif
 typedef unsigned int  uint;
 typedef unsigned char uchar;
 typedef unsigned int  bool;
 #define MIN(a, b) ((a) < (b) ? a : b)
 #define MAX(a, b) ((a) > (b) ? a : b)
 /* SHA-256 */
 static const uint32_t sha256_constants[64] = {
 	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
 	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
 	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
 	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
 	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
 	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
 	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
 	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
 };
 #define Ch(x,y,z)  (z ^ (x & (y ^ z)))
 #define Maj(x,y,z) (((x | y) & z) | (x & y))
 #define S0(x)      (ROTR32(x,  2) ^ ROTR32(x, 13) ^ ROTR32(x, 22))
 #define S1(x)      (ROTR32(x,  6) ^ ROTR32(x, 11) ^ ROTR32(x, 25))
 #define G0(x)      (ROTR32(x,  7) ^ ROTR32(x, 18) ^ (x >>  3))
 #define G1(x)      (ROTR32(x, 17) ^ ROTR32(x, 19) ^ (x >> 10))
 #define W0(in,i)   (U8TO32_BE(&in[i * 4]))
 #define W1(i)      (G1(w[i - 2]) + w[i - 7] + G0(w[i - 15]) + w[i - 16])
 #define STEP(i) \
 	t1 = S0(r[0]) + Maj(r[0], r[1], r[2]); \
 	t0 = r[7] + S1(r[4]) + Ch(r[4], r[5], r[6]) + sha256_constants[i] + w[i]; \
 	r[7] = r[6]; \
 	r[6] = r[5]; \
 	r[5] = r[4]; \
 	r[4] = r[3] + t0; \
 	r[3] = r[2]; \
 	r[2] = r[1]; \
 	r[1] = r[0]; \
 	r[0] = t0 + t1;
 typedef struct sha256_hash_state_t {
 	uint32_t H[8];
 	uint64_t T;
 	uint32_t leftover;
 	uint8_t buffer[SCRYPT_HASH_BLOCK_SIZE];
 } sha256_hash_state;
 static void sha256_blocks(sha256_hash_state *S, const uint8_t *in, size_t blocks)
 {
 	uint32_t r[8], w[64], t0, t1;
 	size_t i;
 	for (i = 0; i < 8; i++)
 		r[i] = S->H[i];
 	while (blocks--) {
 		for (i = 0U; i < 16; i++) {
 			w[i] = W0(in, i);
 		}
 		for (i = 16; i < 64; i++) {
 			w[i] = W1(i);
 		}
 		for (i = 0U; i < 64; i++) {
 			STEP(i);
 		}
 		for (i = 0U; i < 8U; i++) {
 			r[i] += S->H[i];
 			S->H[i] = r[i];
 		}
 		S->T += SCRYPT_HASH_BLOCK_SIZE * 8;
 		in += SCRYPT_HASH_BLOCK_SIZE;
 	}
 }
 static void neoscrypt_hash_init_sha256(sha256_hash_state *S)
 {
 	S->H[0] = 0x6a09e667;
 	S->H[1] = 0xbb67ae85;
 	S->H[2] = 0x3c6ef372;
 	S->H[3] = 0xa54ff53a;
 	S->H[4] = 0x510e527f;
 	S->H[5] = 0x9b05688c;
 	S->H[6] = 0x1f83d9ab;
 	S->H[7] = 0x5be0cd19;
 	S->T = 0;
 	S->leftover = 0;
 }
 static void neoscrypt_hash_update_sha256(sha256_hash_state *S, const uint8_t *in, size_t inlen)
 {
 	size_t blocks, want;
 	/* handle the previous data */
 	if (S->leftover) {
 		want = (SCRYPT_HASH_BLOCK_SIZE - S->leftover);
 		want = (want < inlen) ? want : inlen;
 		memcpy(S->buffer + S->leftover, in, want);
 		S->leftover += (uint32_t)want;
 		if (S->leftover < SCRYPT_HASH_BLOCK_SIZE)
 			return;
 		in += want;
 		inlen -= want;
 		sha256_blocks(S, S->buffer, 1);
 	}
 	/* handle the current data */
 	blocks = (inlen & ~(SCRYPT_HASH_BLOCK_SIZE - 1));
 	S->leftover = (uint32_t)(inlen - blocks);
 	if (blocks) {
 		sha256_blocks(S, in, blocks / SCRYPT_HASH_BLOCK_SIZE);
 		in += blocks;
 	}
 	/* handle leftover data */
 	if (S->leftover)
 		memcpy(S->buffer, in, S->leftover);
 }
 static void neoscrypt_hash_finish_sha256(sha256_hash_state *S, uint8_t *hash)
 {
 	uint64_t t = S->T + (S->leftover * 8);
 	S->buffer[S->leftover] = 0x80;
 	if (S->leftover <= 55) {
 		memset(S->buffer + S->leftover + 1, 0, 55 - S->leftover);
 	} else {
 		memset(S->buffer + S->leftover + 1, 0, 63 - S->leftover);
 		sha256_blocks(S, S->buffer, 1);
 		memset(S->buffer, 0, 56);
 	}
 	U64TO8_BE(S->buffer + 56, t);
 	sha256_blocks(S, S->buffer, 1);
 	U32TO8_BE(&hash[ 0], S->H[0]);
 	U32TO8_BE(&hash[ 4], S->H[1]);
 	U32TO8_BE(&hash[ 8], S->H[2]);
 	U32TO8_BE(&hash[12], S->H[3]);
 	U32TO8_BE(&hash[16], S->H[4]);
 	U32TO8_BE(&hash[20], S->H[5]);
 	U32TO8_BE(&hash[24], S->H[6]);
 	U32TO8_BE(&hash[28], S->H[7]);
 }
 static void neoscrypt_hash_sha256(hash_digest hash, const uint8_t *m, size_t mlen)
 {
 	sha256_hash_state st;
 	neoscrypt_hash_init_sha256(&st);
 	neoscrypt_hash_update_sha256(&st, m, mlen);
 	neoscrypt_hash_finish_sha256(&st, hash);
 }
 /* HMAC for SHA-256 */
 typedef struct sha256_hmac_state_t {
 	sha256_hash_state inner, outer;
 } sha256_hmac_state;
 static void neoscrypt_hmac_init_sha256(sha256_hmac_state *st, const uint8_t *key, size_t keylen)
 {
 	uint8_t pad[SCRYPT_HASH_BLOCK_SIZE] = {0};
 	size_t i;
 	neoscrypt_hash_init_sha256(&st->inner);
 	neoscrypt_hash_init_sha256(&st->outer);
 	if (keylen <= SCRYPT_HASH_BLOCK_SIZE) {
 		/* use the key directly if it's <= blocksize bytes */
 		memcpy(pad, key, keylen);
 	} else {
 		/* if it's > blocksize bytes, hash it */
 		neoscrypt_hash_sha256(pad, key, keylen);
 	}
 	/* inner = (key ^ 0x36) */
 	/* h(inner || ...) */
 	for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++)
 		pad[i] ^= 0x36;
 	neoscrypt_hash_update_sha256(&st->inner, pad, SCRYPT_HASH_BLOCK_SIZE);
 	/* outer = (key ^ 0x5c) */
 	/* h(outer || ...) */
 	for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE; i++)
 		pad[i] ^= (0x5c ^ 0x36);
 	neoscrypt_hash_update_sha256(&st->outer, pad, SCRYPT_HASH_BLOCK_SIZE);
 }
 static void neoscrypt_hmac_update_sha256(sha256_hmac_state *st, const uint8_t *m, size_t mlen)
 {
 	/* h(inner || m...) */
 	neoscrypt_hash_update_sha256(&st->inner, m, mlen);
 }
 static void neoscrypt_hmac_finish_sha256(sha256_hmac_state *st, hash_digest mac)
 {
 	/* h(inner || m) */
 	hash_digest innerhash;
 	neoscrypt_hash_finish_sha256(&st->inner, innerhash);
 	/* h(outer || h(inner || m)) */
 	neoscrypt_hash_update_sha256(&st->outer, innerhash, sizeof(innerhash));
 	neoscrypt_hash_finish_sha256(&st->outer, mac);
 }
 /* PBKDF2 for SHA-256 */
 static void neoscrypt_pbkdf2_sha256(const uint8_t *password, size_t password_len,
 	const uint8_t *salt, size_t salt_len, uint64_t N, uint8_t *output, size_t output_len)
 {
 	sha256_hmac_state hmac_pw, hmac_pw_salt, work;
 	hash_digest ti, u;
 	uint8_t be[4];
 	uint32_t i, j, k, blocks;
 	/* bytes must be <= (0xffffffff - (SCRYPT_HASH_DIGEST_SIZE - 1)), which they will always be under scrypt */
 	/* hmac(password, ...) */
 	neoscrypt_hmac_init_sha256(&hmac_pw, password, password_len);
 	/* hmac(password, salt...) */
 	hmac_pw_salt = hmac_pw;
 	neoscrypt_hmac_update_sha256(&hmac_pw_salt, salt, salt_len);
 	blocks = ((uint32_t)output_len + (SCRYPT_HASH_DIGEST_SIZE - 1)) / SCRYPT_HASH_DIGEST_SIZE;
 	for(i = 1; i <= blocks; i++) {
 		/* U1 = hmac(password, salt || be(i)) */
 		U32TO8_BE(be, i);
 		work = hmac_pw_salt;
 		neoscrypt_hmac_update_sha256(&work, be, 4);
 		neoscrypt_hmac_finish_sha256(&work, ti);
 		memcpy(u, ti, sizeof(u));
 		/* T[i] = U1 ^ U2 ^ U3... */
 		for(j = 0; j < N - 1; j++) {
 			/* UX = hmac(password, U{X-1}) */
 			work = hmac_pw;
 			neoscrypt_hmac_update_sha256(&work, u, SCRYPT_HASH_DIGEST_SIZE);
 			neoscrypt_hmac_finish_sha256(&work, u);
 			/* T[i] ^= UX */
 			for(k = 0; k < sizeof(u); k++)
 				ti[k] ^= u[k];
 		}
 		memcpy(output, ti, (output_len > SCRYPT_HASH_DIGEST_SIZE) ? SCRYPT_HASH_DIGEST_SIZE : output_len);
 		output += SCRYPT_HASH_DIGEST_SIZE;
 		output_len -= SCRYPT_HASH_DIGEST_SIZE;
 	}
 }
 /* NeoScrypt */
 #if defined(ASM)
 extern void neoscrypt_salsa(uint *X, uint rounds);
 extern void neoscrypt_salsa_tangle(uint *X, uint count);
 extern void neoscrypt_chacha(uint *X, uint rounds);
 extern void neoscrypt_blkcpy(void *dstp, const void *srcp, uint len);
 extern void neoscrypt_blkswp(void *blkAp, void *blkBp, uint len);
 extern void neoscrypt_blkxor(void *dstp, const void *srcp, uint len);
 #else
 /* Salsa20, rounds must be a multiple of 2 */
 static void neoscrypt_salsa(uint *X, uint rounds)
 {
 	uint x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, t;
 	x0 = X[0];   x1 = X[1];   x2 = X[2];   x3 = X[3];
 	x4 = X[4];   x5 = X[5];   x6 = X[6];   x7 = X[7];
 	x8 = X[8];   x9 = X[9];  x10 = X[10]; x11 = X[11];
   x12 = X[12]; x13 = X[13]; x14 = X[14]; x15 = X[15];
 #define quarter(a, b, c, d) \
 	t = a + d; t = ROTL32(t,  7); b ^= t; \
 	t = b + a; t = ROTL32(t,  9); c ^= t; \
 	t = c + b; t = ROTL32(t, 13); d ^= t; \
 	t = d + c; t = ROTL32(t, 18); a ^= t;
 	for(; rounds; rounds -= 2) {
 		quarter( x0,  x4,  x8, x12);
 		quarter( x5,  x9, x13,  x1);
 		quarter(x10, x14,  x2,  x6);
 		quarter(x15,  x3,  x7, x11);
 		quarter( x0,  x1,  x2,  x3);
 		quarter( x5,  x6,  x7,  x4);
 		quarter(x10, x11,  x8,  x9);
 		quarter(x15, x12, x13, x14);
 	}
 	X[0] += x0;   X[1] += x1;   X[2] += x2;   X[3] += x3;
 	X[4] += x4;   X[5] += x5;   X[6] += x6;   X[7] += x7;
 	X[8] += x8;   X[9] += x9;  X[10] += x10; X[11] += x11;
   X[12] += x12; X[13] += x13; X[14] += x14; X[15] += x15;
 #undef quarter
 }
 /* ChaCha20, rounds must be a multiple of 2 */
 static void neoscrypt_chacha(uint *X, uint rounds)
 {
 	uint x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15, t;
 	x0 = X[0];   x1 = X[1];   x2 = X[2];   x3 = X[3];
 	x4 = X[4];   x5 = X[5];   x6 = X[6];   x7 = X[7];
 	x8 = X[8];   x9 = X[9];  x10 = X[10]; x11 = X[11];
   x12 = X[12]; x13 = X[13]; x14 = X[14]; x15 = X[15];
 #define quarter(a,b,c,d) \
 	a += b; t = d ^ a; d = ROTL32(t, 16); \
 	c += d; t = b ^ c; b = ROTL32(t, 12); \
 	a += b; t = d ^ a; d = ROTL32(t,  8); \
 	c += d; t = b ^ c; b = ROTL32(t,  7);
 	for(; rounds; rounds -= 2) {
 		quarter( x0,  x4,  x8, x12);
 		quarter( x1,  x5,  x9, x13);
 		quarter( x2,  x6, x10, x14);
 		quarter( x3,  x7, x11, x15);
 		quarter( x0,  x5, x10, x15);
 		quarter( x1,  x6, x11, x12);
 		quarter( x2,  x7,  x8, x13);
 		quarter( x3,  x4,  x9, x14);
 	}
 	X[0] += x0;   X[1] += x1;   X[2] += x2;   X[3] += x3;
 	X[4] += x4;   X[5] += x5;   X[6] += x6;   X[7] += x7;
 	X[8] += x8;   X[9] += x9;  X[10] += x10; X[11] += x11;
   X[12] += x12; X[13] += x13; X[14] += x14; X[15] += x15;
 #undef quarter
 }
 /* Fast 32-bit / 64-bit memcpy();
 * len must be a multiple of 32 bytes */
 static void neoscrypt_blkcpy(void *dstp, const void *srcp, uint len)
 {
 	ulong *dst = (ulong *) dstp;
 	ulong *src = (ulong *) srcp;
 	uint i;
 	for(i = 0; i < (len / sizeof(ulong)); i += 4) {
 		dst[i]     = src[i];
 		dst[i + 1] = src[i + 1];
 		dst[i + 2] = src[i + 2];
 		dst[i + 3] = src[i + 3];
 	}
 }
 /* Fast 32-bit / 64-bit block swapper;
 * len must be a multiple of 32 bytes */
 static void neoscrypt_blkswp(void *blkAp, void *blkBp, uint len)
 {
 	ulong *blkA = (ulong *) blkAp;
 	ulong *blkB = (ulong *) blkBp;
 	register ulong t0, t1, t2, t3;
 	uint i;
 	for(i = 0; i < (len / sizeof(ulong)); i += 4) {
 		t0          = blkA[i];
 		t1          = blkA[i + 1];
 		t2          = blkA[i + 2];
 		t3          = blkA[i + 3];
 		blkA[i]     = blkB[i];
 		blkA[i + 1] = blkB[i + 1];
 		blkA[i + 2] = blkB[i + 2];
 		blkA[i + 3] = blkB[i + 3];
 		blkB[i]     = t0;
 		blkB[i + 1] = t1;
 		blkB[i + 2] = t2;
 		blkB[i + 3] = t3;
 	}
 }
 /* Fast 32-bit / 64-bit block XOR engine;
 * len must be a multiple of 32 bytes */
 static void neoscrypt_blkxor(void *dstp, const void *srcp, uint len)
 {
 	ulong *dst = (ulong *) dstp;
 	ulong *src = (ulong *) srcp;
 	uint i;
 	for (i = 0; i < (len / sizeof(ulong)); i += 4) {
 		dst[i]     ^= src[i];
 		dst[i + 1] ^= src[i + 1];
 		dst[i + 2] ^= src[i + 2];
 		dst[i + 3] ^= src[i + 3];
 	}
 }
 #endif
 /* 32-bit / 64-bit optimised memcpy() */
 static void neoscrypt_copy(void *dstp, const void *srcp, uint len)
 {
 	ulong *dst = (ulong *) dstp;
 	ulong *src = (ulong *) srcp;
 	uint i, tail;
 	for(i = 0; i < (len / sizeof(ulong)); i++)
 		dst[i] = src[i];
 	tail = len & (sizeof(ulong) - 1);
 	if(tail) {
 		uchar *dstb = (uchar *) dstp;
 		uchar *srcb = (uchar *) srcp;
 		for(i = len - tail; i < len; i++)
 			dstb[i] = srcb[i];
 	}
 }
 /* 32-bit / 64-bit optimised memory erase aka memset() to zero */
 static void neoscrypt_erase(void *dstp, uint len)
 {
 	const ulong null = 0;
 	ulong *dst = (ulong *) dstp;
 	uint i, tail;
 	for (i = 0; i < (len / sizeof(ulong)); i++)
 		dst[i] = null;
 	tail = len & (sizeof(ulong) - 1);
 	if (tail) {
 		uchar *dstb = (uchar *) dstp;
 		for(i = len - tail; i < len; i++)
 			dstb[i] = (uchar)null;
 	}
 }
 /* 32-bit / 64-bit optimised XOR engine */
 static void neoscrypt_xor(void *dstp, const void *srcp, uint len)
 {
 	ulong *dst = (ulong *) dstp;
 	ulong *src = (ulong *) srcp;
 	uint i, tail;
 	for (i = 0; i < (len / sizeof(ulong)); i++)
 		dst[i] ^= src[i];
 	tail = len & (sizeof(ulong) - 1);
 	if (tail) {
 		uchar *dstb = (uchar *) dstp;
 		uchar *srcb = (uchar *) srcp;
 		for(i = len - tail; i < len; i++)
 			dstb[i] ^= srcb[i];
 	}
 }
 /* BLAKE2s */
 #define BLAKE2S_BLOCK_SIZE    64U
 #define BLAKE2S_OUT_SIZE      32U
 #define BLAKE2S_KEY_SIZE      32U
 /* Parameter block of 32 bytes */
 typedef struct blake2s_param_t {
 	uchar digest_length;
 	uchar key_length;
 	uchar fanout;
 	uchar depth;
 	uint  leaf_length;
 	uchar node_offset[6];
 	uchar node_depth;
 	uchar inner_length;
 	uchar salt[8];
 	uchar personal[8];
 } blake2s_param;
 /* State block of 180 bytes */
 typedef struct blake2s_state_t {
 	uint  h[8];
 	uint  t[2];
 	uint  f[2];
 	uchar buf[2 * BLAKE2S_BLOCK_SIZE];
 	uint  buflen;
 } blake2s_state;
 static const uint blake2s_IV[8] = {
 	0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
 	0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
 };
 static const uint8_t blake2s_sigma[10][16] = {
 	{  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
 	{ 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 } ,
 	{ 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 } ,
 	{  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 } ,
 	{  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 } ,
 	{  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 } ,
 	{ 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 } ,
 	{ 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 } ,
 	{  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 } ,
 	{ 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13 , 0 } ,
 };
 static void blake2s_compress(blake2s_state *S, const uint *buf)
 {
 	uint i;
 	uint m[16];
 	uint v[16];
 	neoscrypt_copy(m, buf, 64);
 	neoscrypt_copy(v, S, 32);
 	v[ 8] = blake2s_IV[0];
 	v[ 9] = blake2s_IV[1];
 	v[10] = blake2s_IV[2];
 	v[11] = blake2s_IV[3];
 	v[12] = S->t[0] ^ blake2s_IV[4];
 	v[13] = S->t[1] ^ blake2s_IV[5];
 	v[14] = S->f[0] ^ blake2s_IV[6];
 	v[15] = S->f[1] ^ blake2s_IV[7];
 #define G(r,i,a,b,c,d) do { \
 	a = a + b + m[blake2s_sigma[r][2*i+0]]; \
 	d = ROTR32(d ^ a, 16); \
 	c = c + d; \
 	b = ROTR32(b ^ c, 12); \
 	a = a + b + m[blake2s_sigma[r][2*i+1]]; \
 	d = ROTR32(d ^ a, 8); \
 	c = c + d; \
 	b = ROTR32(b ^ c, 7); \
 } while(0)
 #define ROUND(r) do { \
 	G(r, 0, v[ 0], v[ 4], v[ 8], v[12]); \
 	G(r, 1, v[ 1], v[ 5], v[ 9], v[13]); \
 	G(r, 2, v[ 2], v[ 6], v[10], v[14]); \
 	G(r, 3, v[ 3], v[ 7], v[11], v[15]); \
 	G(r, 4, v[ 0], v[ 5], v[10], v[15]); \
 	G(r, 5, v[ 1], v[ 6], v[11], v[12]); \
 	G(r, 6, v[ 2], v[ 7], v[ 8], v[13]); \
 	G(r, 7, v[ 3], v[ 4], v[ 9], v[14]); \
 } while(0)
 	ROUND(0);
 	ROUND(1);
 	ROUND(2);
 	ROUND(3);
 	ROUND(4);
 	ROUND(5);
 	ROUND(6);
 	ROUND(7);
 	ROUND(8);
 	ROUND(9);
 	for (i = 0; i < 8; i++)
 		S->h[i] = S->h[i] ^ v[i] ^ v[i + 8];
 #undef G
 #undef ROUND
 }
 static void blake2s_update(blake2s_state *S, const uchar *input, uint input_size)
 {
 	uint left, fill;
 	while(input_size > 0) {
 		left = S->buflen;
 		fill = 2 * BLAKE2S_BLOCK_SIZE - left;
 		if(input_size > fill) {
 			/* Buffer fill */
 			neoscrypt_copy(S->buf + left, input, fill);
 			S->buflen += fill;
 			/* Counter increment */
 			S->t[0] += BLAKE2S_BLOCK_SIZE;
 			/* Compress */
 			blake2s_compress(S, (uint *) S->buf);
 			/* Shift buffer left */
 			neoscrypt_copy(S->buf, S->buf + BLAKE2S_BLOCK_SIZE, BLAKE2S_BLOCK_SIZE);
 			S->buflen -= BLAKE2S_BLOCK_SIZE;
 			input += fill;
 			input_size -= fill;
 		} else {
 			neoscrypt_copy(S->buf + left, input, input_size);
 			S->buflen += input_size;
 			/* Do not compress */
 			input += input_size;
 			input_size = 0;
 		}
 	}
 }
 static void neoscrypt_blake2s(const void *input, const uint input_size, const void *key, const uchar key_size,
 	void *output, const uchar output_size)
 {
 	uchar block[BLAKE2S_BLOCK_SIZE];
 	blake2s_param P[1];
 	blake2s_state S[1];
 	/* Initialise */
 	neoscrypt_erase(P, 32);
 	P->digest_length = output_size;
 	P->key_length    = key_size;
 	P->fanout        = 1;
 	P->depth         = 1;
 	neoscrypt_erase(S, 180);
 	neoscrypt_copy(S, blake2s_IV, 32);
 	neoscrypt_xor(S, P, 32);
 	neoscrypt_erase(block, BLAKE2S_BLOCK_SIZE);
 	neoscrypt_copy(block, key, key_size);
 	blake2s_update(S, (uchar *) block, BLAKE2S_BLOCK_SIZE);
 	/* Update */
 	blake2s_update(S, (uchar *) input, input_size);
 	/* Finish */
 	if(S->buflen > BLAKE2S_BLOCK_SIZE) {
 		S->t[0] += BLAKE2S_BLOCK_SIZE;
 		blake2s_compress(S, (uint *) S->buf);
 		S->buflen -= BLAKE2S_BLOCK_SIZE;
 		neoscrypt_copy(S->buf, S->buf + BLAKE2S_BLOCK_SIZE, S->buflen);
 	}
 	S->t[0] += S->buflen;
 	S->f[0] = ~0U;
 	neoscrypt_erase(S->buf + S->buflen, 2 * BLAKE2S_BLOCK_SIZE - S->buflen);
 	blake2s_compress(S, (uint *) S->buf);
 	/* Write back */
 	neoscrypt_copy(output, S, output_size);
 	//for (int k = 0; k<4; k++) { printf("cpu blake   %d %08x %08x\n", k, ((unsigned int*)output)[2 * k], ((unsigned int*)output)[2 * k + 1]); }
 }
 #define FASTKDF_BUFFER_SIZE 256U
 /* FastKDF, a fast buffered key derivation function:
 * FASTKDF_BUFFER_SIZE must be a power of 2;
 * password_len, salt_len and output_len should not exceed FASTKDF_BUFFER_SIZE;
 * prf_output_size must be <= prf_key_size; */
 static void neoscrypt_fastkdf(const uchar *password, uint password_len, const uchar *salt, uint salt_len,
 	uint N, uchar *output, uint output_len)
 {
 	//for (int i = 0; i<10; i++) { printf("cpu password %d %08x %08x\n", i, ((unsigned int*)password)[2 * i], ((unsigned int*)password)[2 * i+1]); }
 	const uint stack_align =  0x40;
 	const uint kdf_buf_size = 256U; //FASTKDF_BUFFER_SIZE
 	const uint prf_input_size = 64U; //BLAKE2S_BLOCK_SIZE
 	const uint prf_key_size = 32U; //BLAKE2S_KEY_SIZE
 	const uint prf_output_size = 32U; //BLAKE2S_OUT_SIZE
 	uint bufptr, a, b, i, j;
 	uchar *A, *B, *prf_input, *prf_key, *prf_output;
 	uchar *stack;
 	stack = (uchar*)malloc(sizeof(uchar) * 2 * kdf_buf_size + prf_input_size + prf_key_size + prf_output_size + stack_align);
 	/* Align and set up the buffers in stack */
 	//uchar stack[2 * kdf_buf_size + prf_input_size + prf_key_size + prf_output_size + stack_align];
 	A          = &stack[stack_align & ~(stack_align - 1)];
 	B          = &A[kdf_buf_size + prf_input_size];
 	prf_output = &A[2 * kdf_buf_size + prf_input_size + prf_key_size];
 	/* Initialise the password buffer */
 	if(password_len > kdf_buf_size)
 		password_len = kdf_buf_size;
 	a = kdf_buf_size / password_len;
 	for(i = 0; i < a; i++)
 		neoscrypt_copy(&A[i * password_len], &password[0], password_len);
 	b = kdf_buf_size - a * password_len;
 	if(b)
 		neoscrypt_copy(&A[a * password_len], &password[0], b);
 	neoscrypt_copy(&A[kdf_buf_size], &password[0], prf_input_size);
 	/* Initialise the salt buffer */
 	if(salt_len > kdf_buf_size)
 		salt_len = kdf_buf_size;
 	a = kdf_buf_size / salt_len;
 	for(i = 0; i < a; i++)
 		neoscrypt_copy(&B[i * salt_len], &salt[0], salt_len);
 	b = kdf_buf_size - a * salt_len;
 	if(b)
 		neoscrypt_copy(&B[a * salt_len], &salt[0], b);
 	neoscrypt_copy(&B[kdf_buf_size], &salt[0], prf_key_size);
 	/* The primary iteration */
 	for(i = 0, bufptr = 0; i < N; i++) {
 		/* Map the PRF input buffer */
 		prf_input = &A[bufptr];
 		/* Map the PRF key buffer */
 		prf_key = &B[bufptr];
 		/* PRF */
 		// for (int k = 0; k<(prf_input_size/4); k++) { printf("cpu bufptr %08x before blake %d  %d %08x \n",bufptr, i, k, ((unsigned int*)prf_input)[k]); }
 		neoscrypt_blake2s(prf_input, prf_input_size, prf_key, prf_key_size, prf_output, prf_output_size);
 		// for (int k = 0; k<(prf_output_size/4); k++) { printf("cpu after blake %d  %d %08x \n", i, k, ((unsigned int*)prf_output)[k]); }
 		/* Calculate the next buffer pointer */
 		for(j = 0, bufptr = 0; j < prf_output_size; j++)
 			bufptr += prf_output[j];
 		bufptr &= (kdf_buf_size - 1);
 		/* Modify the salt buffer */
 		neoscrypt_xor(&B[bufptr], &prf_output[0], prf_output_size);
 		/* Head modified, tail updated */
 		if(bufptr < prf_key_size)
 			neoscrypt_copy(&B[kdf_buf_size + bufptr], &B[bufptr], MIN(prf_output_size, prf_key_size - bufptr));
 		/* Tail modified, head updated */
 		if((kdf_buf_size - bufptr) < prf_output_size)
 			neoscrypt_copy(&B[0], &B[kdf_buf_size], prf_output_size - (kdf_buf_size - bufptr));
 	}
 	/* Modify and copy into the output buffer */
 	if(output_len > kdf_buf_size)
 		output_len = kdf_buf_size;
 	a = kdf_buf_size - bufptr;
 	if(a >= output_len) {
 		neoscrypt_xor(&B[bufptr], &A[0], output_len);
 		neoscrypt_copy(&output[0], &B[bufptr], output_len);
 	} else {
 		neoscrypt_xor(&B[bufptr], &A[0], a);
 		neoscrypt_xor(&B[0], &A[a], output_len - a);
 		neoscrypt_copy(&output[0], &B[bufptr], a);
 		neoscrypt_copy(&output[a], &B[0], output_len - a);
 	}
 	// for (int i = 0; i<10; i++) { printf("cpu fastkdf %d %08x %08x\n", i, ((unsigned int*)output)[2 * i], ((unsigned int*)output)[2 * i + 1]); }
 }
 /* Configurable optimised block mixer */
 static void neoscrypt_blkmix(uint *X, uint *Y, uint r, uint mixmode)
 {
 	uint i, mixer, rounds;
 	mixer  = mixmode >> 8;
 	rounds = mixmode & 0xFF;
 	/* NeoScrypt flow:                   Scrypt flow:
 		 Xa ^= Xd;  M(Xa'); Ya = Xa";      Xa ^= Xb;  M(Xa'); Ya = Xa";
 		 Xb ^= Xa"; M(Xb'); Yb = Xb";      Xb ^= Xa"; M(Xb'); Yb = Xb";
 		 Xc ^= Xb"; M(Xc'); Yc = Xc";      Xa" = Ya;
 		 Xd ^= Xc"; M(Xd'); Yd = Xd";      Xb" = Yb;
 		 Xa" = Ya; Xb" = Yc;
 		 Xc" = Yb; Xd" = Yd; */
 	if (r == 1) {
 		neoscrypt_blkxor(&X[0], &X[16], SCRYPT_BLOCK_SIZE);
 		if(mixer)
 			neoscrypt_chacha(&X[0], rounds);
 		else
 			neoscrypt_salsa(&X[0], rounds);
 		neoscrypt_blkxor(&X[16], &X[0], SCRYPT_BLOCK_SIZE);
 		if(mixer)
 			neoscrypt_chacha(&X[16], rounds);
 		else
 			neoscrypt_salsa(&X[16], rounds);
 		return;
 	}
 	if (r == 2) {
 		neoscrypt_blkxor(&X[0], &X[48], SCRYPT_BLOCK_SIZE);
 		if(mixer)
 			neoscrypt_chacha(&X[0], rounds);
 		else
 			neoscrypt_salsa(&X[0], rounds);
 		neoscrypt_blkxor(&X[16], &X[0], SCRYPT_BLOCK_SIZE);
 		if(mixer)
 			neoscrypt_chacha(&X[16], rounds);
 		else
 			neoscrypt_salsa(&X[16], rounds);
 		neoscrypt_blkxor(&X[32], &X[16], SCRYPT_BLOCK_SIZE);
 		if(mixer)
 			neoscrypt_chacha(&X[32], rounds);
 		else
 			neoscrypt_salsa(&X[32], rounds);
 		neoscrypt_blkxor(&X[48], &X[32], SCRYPT_BLOCK_SIZE);
 		if(mixer)
 			neoscrypt_chacha(&X[48], rounds);
 		else
 			neoscrypt_salsa(&X[48], rounds);
 		neoscrypt_blkswp(&X[16], &X[32], SCRYPT_BLOCK_SIZE);
 		return;
 	}
 	/* Reference code for any reasonable r */
 	for (i = 0; i < 2 * r; i++) {
 		if(i) neoscrypt_blkxor(&X[16 * i], &X[16 * (i - 1)], SCRYPT_BLOCK_SIZE);
 		else  neoscrypt_blkxor(&X[0], &X[16 * (2 * r - 1)], SCRYPT_BLOCK_SIZE);
 		if(mixer)
 			neoscrypt_chacha(&X[16 * i], rounds);
 		else
 			neoscrypt_salsa(&X[16 * i], rounds);
 		neoscrypt_blkcpy(&Y[16 * i], &X[16 * i], SCRYPT_BLOCK_SIZE);
 	}
 	for (i = 0; i < r; i++)
 		neoscrypt_blkcpy(&X[16 * i], &Y[16 * 2 * i], SCRYPT_BLOCK_SIZE);
 	for (i = 0; i < r; i++)
 		neoscrypt_blkcpy(&X[16 * (i + r)], &Y[16 * (2 * i + 1)], SCRYPT_BLOCK_SIZE);
 }
 /* NeoScrypt core engine:
 * p = 1, salt = password;
 * Basic customisation (required):
 *   profile bit 0:
 *     0 = NeoScrypt(128, 2, 1) with Salsa20/20 and ChaCha20/20;
 *     1 = Scrypt(1024, 1, 1) with Salsa20/8;
 *   profile bits 4 to 1:
 *     0000 = FastKDF-BLAKE2s;
 *     0001 = PBKDF2-HMAC-SHA256;
 * Extended customisation (optional):
 *   profile bit 31:
 *     0 = extended customisation absent;
 *     1 = extended customisation present;
 *   profile bits 7 to 5 (rfactor):
 *     000 = r of 1;
 *     001 = r of 2;
 *     010 = r of 4;
 *     ...
 *     111 = r of 128;
 *   profile bits 12 to 8 (Nfactor):
 *     00000 = N of 2;
 *     00001 = N of 4;
 *     00010 = N of 8;
 *     .....
 *     00110 = N of 128;
 *     .....
 *     01001 = N of 1024;
 *     .....
 *     11110 = N of 2147483648;
 *   profile bits 30 to 13 are reserved */
 void neoscrypt(const uchar *password, uchar *output, uint profile)
 {
 	uint N = 128, r = 2, dblmix = 1, mixmode = 0x14, stack_align = 0x40;
 	uint kdf, i, j;
 	uint *X, *Y, *Z, *V;
 	if(profile & 0x1) {
 		N = 1024;        /* N = (1 << (Nfactor + 1)); */
 		r = 1;           /* r = (1 << rfactor); */
 		dblmix = 0;      /* Salsa only */
 		mixmode = 0x08;  /* 8 rounds */
 	}
 	if(profile >> 31) {
 		N = (1 << (((profile >> 8) & 0x1F) + 1));
 		r = (1 << ((profile >> 5) & 0x7));
 	}
 	uchar *stack;
 	stack = (uchar*)malloc(((N + 3) * r * 2 * SCRYPT_BLOCK_SIZE + stack_align)*sizeof(uchar));
 	/* X = r * 2 * SCRYPT_BLOCK_SIZE */
 	X = (uint *) &stack[stack_align & ~(stack_align - 1)];
 	/* Z is a copy of X for ChaCha */
 	Z = &X[32 * r];
 	/* Y is an X sized temporal space */
 	Y = &X[64 * r];
 	/* V = N * r * 2 * SCRYPT_BLOCK_SIZE */
 	V = &X[96 * r];
 	/* X = KDF(password, salt) */
 	kdf = (profile >> 1) & 0xF;
 	switch(kdf) {
 	default:
 	case(0x0):
 		neoscrypt_fastkdf(password, 80, password, 80, 32, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE);
 		break;
 	case(0x1):
 		neoscrypt_pbkdf2_sha256(password, 80, password, 80, 1, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE);
 		break;
 	}
 	/* Process ChaCha 1st, Salsa 2nd and XOR them into FastKDF; otherwise Salsa only */
 	if(dblmix) {
 		/* blkcpy(Z, X) */
 		neoscrypt_blkcpy(&Z[0], &X[0], r * 2 * SCRYPT_BLOCK_SIZE);
 		/* Z = SMix(Z) */
 		for(i = 0; i < N; i++) {
 			/* blkcpy(V, Z) */
 			neoscrypt_blkcpy(&V[i * (32 * r)], &Z[0], r * 2 * SCRYPT_BLOCK_SIZE);
 			/* blkmix(Z, Y) */
 			neoscrypt_blkmix(&Z[0], &Y[0], r, (mixmode | 0x0100));
 		}
 		for(i = 0; i < N; i++) {
 			/* integerify(Z) mod N */
 			j = (32 * r) * (Z[16 * (2 * r - 1)] & (N - 1));
 			/* blkxor(Z, V) */
 			neoscrypt_blkxor(&Z[0], &V[j], r * 2 * SCRYPT_BLOCK_SIZE);
 			/* blkmix(Z, Y) */
 			neoscrypt_blkmix(&Z[0], &Y[0], r, (mixmode | 0x0100));
 		}
 	}
 #if (ASM)
 	/* Must be called before and after SSE2 Salsa */
 	neoscrypt_salsa_tangle(&X[0], r * 2);
 #endif
 	/* X = SMix(X) */
 	for(i = 0; i < N; i++) {
 		/* blkcpy(V, X) */
 		neoscrypt_blkcpy(&V[i * (32 * r)], &X[0], r * 2 * SCRYPT_BLOCK_SIZE);
 		/* blkmix(X, Y) */
 		neoscrypt_blkmix(&X[0], &Y[0], r, mixmode);
 	}
 	for(i = 0; i < N; i++) {
 		/* integerify(X) mod N */
 		j = (32 * r) * (X[16 * (2 * r - 1)] & (N - 1));
 		/* blkxor(X, V) */
 		neoscrypt_blkxor(&X[0], &V[j], r * 2 * SCRYPT_BLOCK_SIZE);
 		/* blkmix(X, Y) */
 		neoscrypt_blkmix(&X[0], &Y[0], r, mixmode);
 	}
 #if (ASM)
 	neoscrypt_salsa_tangle(&X[0], r * 2);
 #endif
 	if(dblmix)
 		/* blkxor(X, Z) */
 		neoscrypt_blkxor(&X[0], &Z[0], r * 2 * SCRYPT_BLOCK_SIZE);
 	/* output = KDF(password, X) */
 	switch(kdf) {
 	default:
 	case(0x0):
 		neoscrypt_fastkdf(password, 80, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE, 32, output, 32);
 		break;
 	case(0x1):
 		neoscrypt_pbkdf2_sha256(password, 80, (uchar *) X, r * 2 * SCRYPT_BLOCK_SIZE, 1, output, 32);
 		break;
 	}
 }
--- a/neoscrypt/neoscrypt.h
+++ b/neoscrypt/neoscrypt.h
@ -0,0 +1,33 @@
 #if (__cplusplus)
 extern "C" {
 #endif
 void neoscrypt(const unsigned char *input, unsigned char *output, unsigned int profile);
 #if (__cplusplus)
 }
 #else
 #define SCRYPT_BLOCK_SIZE 64
 #define SCRYPT_HASH_BLOCK_SIZE 64
 #define SCRYPT_HASH_DIGEST_SIZE 32
 typedef uint8_t hash_digest[SCRYPT_HASH_DIGEST_SIZE];
 #define ROTL32(a,b) (((a) << (b)) | ((a) >> (32 - b)))
 #define ROTR32(a,b) (((a) >> (b)) | ((a) << (32 - b)))
 #define U8TO32_BE(p) \
    (((uint32_t)((p)[0]) << 24) | ((uint32_t)((p)[1]) << 16) | \
    ((uint32_t)((p)[2]) <<  8) | ((uint32_t)((p)[3])))
 #define U32TO8_BE(p, v) \
    (p)[0] = (uint8_t)((v) >> 24); (p)[1] = (uint8_t)((v) >> 16); \
    (p)[2] = (uint8_t)((v) >>  8); (p)[3] = (uint8_t)((v)      );
 #define U64TO8_BE(p, v) \
    U32TO8_BE((p),     (uint32_t)((v) >> 32)); \
    U32TO8_BE((p) + 4, (uint32_t)((v)      ));
 #endif
--- a/util.cpp
+++ b/util.cpp
@ -1777,6 +1777,9 @@ void print_hash_tests(void)
 	myriadhash(&hash[0], &buf[0]);
 	printpfx("myriad", hash);
 	neoscrypt(&buf[0], &hash[0], 80000620);
 	printpfx("neoscrypt", hash);
 	nist5hash(&hash[0], &buf[0]);
 	printpfx("nist5", hash);