From a4196b341d74b5abeba8239da8c7125ed4fe0f82 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Fri, 8 Jul 2016 15:23:02 +0200 Subject: [PATCH] neoscrypt: apply last VTC improvements rewrote almost properly ;) --- README.txt | 5 +- configure.ac | 2 +- neoscrypt/cuda_neoscrypt.cu | 1697 ++++++++++++++++++++++++----------- neoscrypt/neoscrypt.cpp | 41 +- quark/quarkcoin.cu | 11 +- 5 files changed, 1212 insertions(+), 544 deletions(-) diff --git a/README.txt b/README.txt index e1bce2e..4f2bb58 100644 --- a/README.txt +++ b/README.txt @@ -240,11 +240,12 @@ features. >>> RELEASE HISTORY <<< - June 2016 v1.8.0 + July 2016 v1.8.0 Pascal support with cuda 8 x11evo algo (XRE) - Lyra2v2 and Decred hashrate improvements + Lyra2v2, Neoscrypt and Decred improvements Enhance windows NVAPI clock and power limits + Led support for mining/shares activity on windows May 18th 2016 v1.7.6 Decred vote support diff --git a/configure.ac b/configure.ac index 62486f5..d0cc35a 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [1.8-dev], [], [ccminer], [http://github.com/tpruvot/ccminer]) +AC_INIT([ccminer], [1.8], [], [ccminer], [http://github.com/tpruvot/ccminer]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/neoscrypt/cuda_neoscrypt.cu b/neoscrypt/cuda_neoscrypt.cu index e7f4b21..43279b1 100644 --- a/neoscrypt/cuda_neoscrypt.cu +++ b/neoscrypt/cuda_neoscrypt.cu @@ -1,16 +1,38 @@ +// originally from djm34 - github.com/djm34/ccminer-sp-neoscrypt + #include #include -#include "cuda_helper.h" -#include "cuda_vectors.h" /* NOT COMPATIBLE WITH SM 3.0 !!! */ +#include +#include +#include "cuda_vectors.h" + +#include "miner.h" + +#ifdef __INTELLISENSE__ +#define __CUDA_ARCH__ 500 +#define atomicExch(p,x) x +#endif + +static __thread cudaStream_t stream[2]; + +__device__ __align__(16) uint2x4* W; +__device__ __align__(16) uint2x4* W2; +__device__ __align__(16) uint2x4* Tr; +__device__ __align__(16) uint2x4* Tr2; +__device__ __align__(16) uint2x4* Input; +__device__ __align__(16) uint2x4* B2; -static uint32_t *d_buffer[MAX_GPUS]; static uint32_t *d_NNonce[MAX_GPUS]; -__constant__ uint4* W; + __constant__ uint32_t pTarget[8]; __constant__ uint32_t key_init[16]; __constant__ uint32_t input_init[16]; -__constant__ uint32_t c_data[80]; +__constant__ uint32_t c_data[64]; + +#define BLOCK_SIZE 64U +#define BLAKE2S_BLOCK_SIZE 64U +#define BLAKE2S_OUT_SIZE 32U /// constants /// @@ -37,26 +59,99 @@ static const uint32_t BLAKE2S_SIGMA_host[10][16] = { { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, }; -static __constant__ uint32_t BLAKE2S_SIGMA[10][16]; +__constant__ uint32_t BLAKE2S_SIGMA[10][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, +}; -#define FASTKDF_BUFFER_SIZE 256U +#define SALSA(a,b,c,d) { \ + t =a+d; t=rotateL(t, 7); b^=t; \ + t =b+a; t=rotateL(t, 9); c^=t; \ + t =c+b; t=rotateL(t, 13); d^=t; \ + t =d+c; t=rotateL(t, 18); a^=t; \ +} -// Blake2S +#define SALSA_CORE(state) { \ + SALSA(state.s0, state.s4, state.s8, 
state.sc); \ + SALSA(state.s5, state.s9, state.sd, state.s1); \ + SALSA(state.sa, state.se, state.s2, state.s6); \ + SALSA(state.sf, state.s3, state.s7, state.sb); \ + SALSA(state.s0, state.s1, state.s2, state.s3); \ + SALSA(state.s5, state.s6, state.s7, state.s4); \ + SALSA(state.sa, state.sb, state.s8, state.s9); \ + SALSA(state.sf, state.sc, state.sd, state.se); \ +} -#define BLAKE2S_BLOCK_SIZE 64U -#define BLAKE2S_OUT_SIZE 32U -#define BLAKE2S_KEY_SIZE 32U +__device__ __forceinline__ +static void shift256R4(uint32_t * ret, const uint8 &vec4, uint32_t shift2) +{ +#if __CUDA_ARCH__ >= 320 + uint32_t shift = 32 - shift2; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[0]) : "r"(0), "r"(vec4.s0), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[1]) : "r"(vec4.s0), "r"(vec4.s1), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[2]) : "r"(vec4.s1), "r"(vec4.s2), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[3]) : "r"(vec4.s2), "r"(vec4.s3), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[4]) : "r"(vec4.s3), "r"(vec4.s4), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[5]) : "r"(vec4.s4), "r"(vec4.s5), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[6]) : "r"(vec4.s5), "r"(vec4.s6), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[7]) : "r"(vec4.s6), "r"(vec4.s7), "r"(shift)); + asm("shr.b32 %0, %1, %2;" : "=r"(ret[8]) : "r"(vec4.s7), "r"(shift)); +#else + // to check + shift256R(ret, vec4, shift2); +#endif +} #if __CUDA_ARCH__ >= 500 -#define BLAKE_G(idx0, idx1, a, b, c, d, key) { \ - idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ + +#define CHACHA_STEP(a,b,c,d) { \ a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateL(b^c, 12); \ + a += b; d = __byte_perm(d^a, 0, 0x2103); \ + c += d; b = rotateL(b^c, 7); \ +} + +#else + +#define CHACHA_STEP(a,b,c,d) { \ + a += b; d = rotateL(d^a, 16); \ + c += d; b = rotateL(b^c, 12); \ + a += b; d = rotateL(d^a, 8); \ + c += d; b = rotateL(b^c, 7); \ +} + +#endif + +#define CHACHA_CORE_PARALLEL(state) { \ + CHACHA_STEP(state.lo.s0, state.lo.s4, state.hi.s0, state.hi.s4); \ + CHACHA_STEP(state.lo.s1, state.lo.s5, state.hi.s1, state.hi.s5); \ + CHACHA_STEP(state.lo.s2, state.lo.s6, state.hi.s2, state.hi.s6); \ + CHACHA_STEP(state.lo.s3, state.lo.s7, state.hi.s3, state.hi.s7); \ + CHACHA_STEP(state.lo.s0, state.lo.s5, state.hi.s2, state.hi.s7); \ + CHACHA_STEP(state.lo.s1, state.lo.s6, state.hi.s3, state.hi.s4); \ + CHACHA_STEP(state.lo.s2, state.lo.s7, state.hi.s0, state.hi.s5); \ + CHACHA_STEP(state.lo.s3, state.lo.s4, state.hi.s1, state.hi.s6); \ +} + +#if __CUDA_ARCH__ < 500 + +#define BLAKE(a, b, c, d, key1, key2) { \ + a += key1; \ + a += b; d = rotateL(d^a, 16); \ c += d; b = rotateR(b^c, 12); \ - idx = BLAKE2S_SIGMA[idx0][idx1+1]; a += key[idx]; \ - a += b; d = __byte_perm(d^a, 0, 0x0321); \ + a += key2; \ + a += b; d = rotateR(d^a, 8); \ c += d; b = rotateR(b^c, 7); \ } -#else + #define BLAKE_G(idx0, idx1, a, b, c, d, key) { \ idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ a += b; d = rotateL(d^a, 16); \ @@ -65,39 +160,41 @@ static __constant__ uint32_t BLAKE2S_SIGMA[10][16]; a += b; d = rotateR(d^a, 8); \ c += d; b = rotateR(b^c, 7); \ } -#endif -#if __CUDA_ARCH__ >= 500 #define BLAKE_G_PRE(idx0, idx1, a, b, c, d, key) { \ a += key[idx0]; \ - a += b; d = __byte_perm(d^a, 0, 0x1032); \ + a += b; d = rotateL(d^a, 16); \ c += d; b = rotateR(b^c, 12); \ a += key[idx1]; \ - a += b; d = __byte_perm(d^a, 0, 0x0321); \ + a += b; 
d = rotateR(d^a, 8); \ c += d; b = rotateR(b^c, 7); \ } -#else -#define BLAKE_G_PRE(idx0, idx1, a, b, c, d, key) { \ + +#define BLAKE_G_PRE0(idx0, idx1, a, b, c, d, key) { \ + a += b; d = rotateL(d^a, 16); \ + c += d; b = rotateR(b^c, 12); \ + a += b; d = rotateR(d^a, 8); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE1(idx0, idx1, a, b, c, d, key) { \ a += key[idx0]; \ a += b; d = rotateL(d^a, 16); \ c += d; b = rotateR(b^c, 12); \ - a += key[idx1]; \ a += b; d = rotateR(d^a, 8); \ c += d; b = rotateR(b^c, 7); \ } -#endif -#define BLAKE_Ghost(idx0, idx1, a, b, c, d, key) { \ - idx = BLAKE2S_SIGMA_host[idx0][idx1]; a += key[idx]; \ - a += b; d = ROTR32(d^a,16); \ - c += d; b = ROTR32(b^c, 12); \ - idx = BLAKE2S_SIGMA_host[idx0][idx1+1]; a += key[idx]; \ - a += b; d = ROTR32(d^a,8); \ - c += d; b = ROTR32(b^c, 7); \ +#define BLAKE_G_PRE2(idx0, idx1, a, b, c, d, key) { \ + a += b; d = rotateL(d^a, 16); \ + c += d; b = rotateR(b^c, 12); \ + a += key[idx1]; \ + a += b; d = rotateR(d^a, 8); \ + c += d; b = rotateR(b^c, 7); \ } static __forceinline__ __device__ -void Blake2S(uint32_t * inout, const uint32_t * TheKey) +void Blake2S(uint32_t *out, const uint32_t* const __restrict__ inout, const uint32_t * const __restrict__ TheKey) { uint16 V; uint32_t idx; @@ -112,122 +209,98 @@ void Blake2S(uint32_t * inout, const uint32_t * TheKey) V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; -#if 0 - for (int x = 0; x < 10; ++x) - { - BLAKE_G(x, 0x0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G(x, 0x2, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G(x, 0x4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G(x, 0x6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G(x, 0x8, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G(x, 0xA, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G(x, 0xC, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G(x, 0xE, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - } -#else - // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - BLAKE_G_PRE(0x0, 0x1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x2, 0x3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0x4, 0x5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0x6, 0x7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0x8, 0x9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0xA, 0xB, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0xC, 0xD, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0xE, 0xF, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - BLAKE_G_PRE(0xE, 0xA, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x4, 0x8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0x9, 0xF, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0xD, 0x6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0x1, 0xC, V.lo.s0, V.lo.s5, V.hi.s2, 
V.hi.s7, TheKey); - BLAKE_G_PRE(0x0, 0x2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0xB, 0x7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0x5, 0x3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE0(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - BLAKE_G_PRE(0xB, 0x8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0xC, 0x0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0x5, 0x2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0xF, 0xD, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0xA, 0xE, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0x3, 0x6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0x7, 0x1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0x9, 0x4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE0(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - BLAKE_G_PRE(0x7, 0x9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x3, 0x1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0xD, 0xC, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0xB, 0xE, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0x2, 0x6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0x5, 0xA, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0x4, 0x0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0xF, 0x8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE1(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE1(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - BLAKE_G_PRE(0x9, 0x0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x5, 0x7, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0x2, 0x4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0xA, 0xF, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0xE, 0x1, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0xB, 0xC, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0x6, 0x8, 
V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0x3, 0xD, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE2(9, 0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(5, 7, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(10, 15, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(14, 1, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(11, 12, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 8, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(3, 13, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - BLAKE_G_PRE(0x2, 0xC, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x6, 0xA, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0x0, 0xB, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0x8, 0x3, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0x4, 0xD, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0x7, 0x5, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0xF, 0xE, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0x1, 0x9, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE1(2, 12, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 10, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE1(0, 11, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(8, 3, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(4, 13, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(7, 5, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(15, 14, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(1, 9, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - BLAKE_G_PRE(0xC, 0x5, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x1, 0xF, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0xE, 0xD, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0x4, 0xA, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0x0, 0x7, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0x6, 0x3, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0x9, 0x2, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0x8, 0xB, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE2(12, 5, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(1, 15, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 13, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(4, 10, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 7, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(6, 3, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(9, 2, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(8, 11, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - BLAKE_G_PRE(0xD, 0xB, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x7, 0xE, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0xC, 0x1, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0x3, 0x9, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0x5, 0x0, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0xF, 0x4, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0x8, 0x6, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0x2, 0xA, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, 
TheKey); - + BLAKE_G_PRE0(13, 11, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(7, 14, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(12, 1, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(3, 9, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(5, 0, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(15, 4, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 6, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 10, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - BLAKE_G_PRE(0x6, 0xF, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0xE, 0x9, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0xB, 0x3, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0x0, 0x8, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0xC, 0x2, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0xD, 0x7, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0x1, 0x4, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0xA, 0x5, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE1(6, 15, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE0(14, 9, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(11, 3, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(0, 8, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(12, 2, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(13, 7, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(1, 4, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(10, 5, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, - BLAKE_G_PRE(0xA, 0x2, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x8, 0x4, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0x7, 0x6, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0x1, 0x5, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0xF, 0xB, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0x9, 0xE, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0x3, 0xC, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0xD, 0x0, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); -#endif + BLAKE_G_PRE2(10, 2, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 4, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(7, 6, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(1, 5, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(15, 11, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(9, 14, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(3, 12, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(13, 0, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - V.lo ^= V.hi; - V.lo ^= tmpblock; + V.lo ^= V.hi ^ tmpblock; V.hi = BLAKE2S_IV_Vec; tmpblock = V.lo; @@ -235,86 +308,121 @@ void Blake2S(uint32_t * inout, const uint32_t * TheKey) V.hi.s4 ^= 128; V.hi.s6 = ~V.hi.s6; -#if 0 - for (int x = 0; x < 10; ++x) - { - BLAKE_G(x, 0x0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); - BLAKE_G(x, 0x2, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); - BLAKE_G(x, 0x4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); - BLAKE_G(x, 0x6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); - BLAKE_G(x, 0x8, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); - BLAKE_G(x, 0xA, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); - BLAKE_G(x, 0xC, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); - BLAKE_G(x, 
0xE, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); - } -#else - // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - BLAKE_G_PRE(0x0, 0x1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); - BLAKE_G_PRE(0x2, 0x3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); - BLAKE_G_PRE(0x4, 0x5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); - BLAKE_G_PRE(0x6, 0x7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); - BLAKE_G_PRE(0x8, 0x9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); - BLAKE_G_PRE(0xA, 0xB, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); - BLAKE_G_PRE(0xC, 0xD, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); - BLAKE_G_PRE(0xE, 0xF, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); - + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - BLAKE_G_PRE(0xE, 0xA, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); - BLAKE_G_PRE(0x4, 0x8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); - BLAKE_G_PRE(0x9, 0xF, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); - BLAKE_G_PRE(0xD, 0x6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); - BLAKE_G_PRE(0x1, 0xC, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); - BLAKE_G_PRE(0x0, 0x2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); - BLAKE_G_PRE(0xB, 0x7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); - BLAKE_G_PRE(0x5, 0x3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); - + BLAKE_G_PRE(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - BLAKE_G_PRE(0xB, 0x8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); - BLAKE_G_PRE(0xC, 0x0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); - BLAKE_G_PRE(0x5, 0x2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); - BLAKE_G_PRE(0xF, 0xD, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); - BLAKE_G_PRE(0xA, 0xE, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); - BLAKE_G_PRE(0x3, 0x6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); - BLAKE_G_PRE(0x7, 0x1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); - BLAKE_G_PRE(0x9, 0x4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); - + BLAKE_G_PRE(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 
15, 8 }, - BLAKE_G_PRE(0x7, 0x9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); - BLAKE_G_PRE(0x3, 0x1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); - BLAKE_G_PRE(0xD, 0xC, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); - BLAKE_G_PRE(0xB, 0xE, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); - BLAKE_G_PRE(0x2, 0x6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); - BLAKE_G_PRE(0x5, 0xA, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); - BLAKE_G_PRE(0x4, 0x0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); - BLAKE_G_PRE(0xF, 0x8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); - - for (int x = 4; x < 10; ++x) + BLAKE_G_PRE(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + + for(int x = 4; x < 10; ++x) { - BLAKE_G(x, 0x0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); - BLAKE_G(x, 0x2, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); - BLAKE_G(x, 0x4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); - BLAKE_G(x, 0x6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); - BLAKE_G(x, 0x8, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); - BLAKE_G(x, 0xA, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); - BLAKE_G(x, 0xC, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); - BLAKE_G(x, 0xE, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + BLAKE_G(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); } -#endif V.lo ^= V.hi ^ tmpblock; - ((uint8*)inout)[0]=V.lo; + ((uint8*)out)[0] = V.lo; +} +#endif + +#if __CUDA_ARCH__ >= 500 + +#define BLAKE_G(idx0, idx1, a, b, c, d, key) { \ + idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + idx = BLAKE2S_SIGMA[idx0][idx1+1]; a += key[idx]; \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE(a, b, c, d, key1,key2) { \ + a += key1; \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += key2; \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE(idx0,idx1, a, b, c, d, key) { \ + a += key[idx0]; \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += key[idx1]; \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ } -static __forceinline__ __host__ -void Blake2Shost(uint32_t * inout, const uint32_t * inkey) +#define BLAKE_G_PRE0(idx0,idx1, a, b, c, d, key) { \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE1(idx0,idx1, a, b, c, d, key) { \ + a += key[idx0]; \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += b; d = 
__byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE2(idx0,idx1, a, b, c, d, key) { \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += key[idx1]; \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +static __forceinline__ __device__ +void Blake2S_v2(uint32_t *out, const uint32_t* __restrict__ inout, const uint32_t * __restrict__ TheKey) { uint16 V; - uint32_t idx; uint8 tmpblock; - V.hi = BLAKE2S_IV_Vechost; - V.lo = BLAKE2S_IV_Vechost; + V.hi = BLAKE2S_IV_Vec; + V.lo = BLAKE2S_IV_Vec; V.lo.s0 ^= 0x01012020; // Copy input block for later @@ -322,469 +430,1020 @@ void Blake2Shost(uint32_t * inout, const uint32_t * inkey) V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; - for (int x = 0; x < 10; ++x) - { - BLAKE_Ghost(x, 0x0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inkey); - BLAKE_Ghost(x, 0x2, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inkey); - BLAKE_Ghost(x, 0x4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inkey); - BLAKE_Ghost(x, 0x6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inkey); - BLAKE_Ghost(x, 0x8, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inkey); - BLAKE_Ghost(x, 0xA, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inkey); - BLAKE_Ghost(x, 0xC, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inkey); - BLAKE_Ghost(x, 0xE, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inkey); - } + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + BLAKE_G_PRE0(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + BLAKE_G_PRE0(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + BLAKE_G_PRE1(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE1(5, 10, V.lo.s1, V.lo.s6, 
V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + BLAKE_G_PRE2(9, 0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(5, 7, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(10, 15, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(14, 1, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(11, 12, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 8, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(3, 13, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + BLAKE_G_PRE1(2, 12, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 10, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE1(0, 11, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(8, 3, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(4, 13, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(7, 5, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(15, 14, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(1, 9, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + BLAKE_G_PRE2(12, 5, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(1, 15, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 13, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(4, 10, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 7, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(6, 3, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(9, 2, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(8, 11, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + BLAKE_G_PRE0(13, 11, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(7, 14, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(12, 1, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(3, 9, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(5, 0, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(15, 4, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 6, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 10, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + BLAKE_G_PRE1(6, 15, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE0(14, 9, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(11, 3, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(0, 8, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(12, 2, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(13, 7, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(1, 4, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(10, 5, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + BLAKE_G_PRE2(10, 2, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 4, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(7, 6, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(1, 5, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(15, 11, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(9, 14, 
V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(3, 12, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(13, 0, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); V.lo ^= V.hi; V.lo ^= tmpblock; - V.hi = BLAKE2S_IV_Vechost; + V.hi = BLAKE2S_IV_Vec; tmpblock = V.lo; V.hi.s4 ^= 128; V.hi.s6 = ~V.hi.s6; - for (int x = 0; x < 10; ++x) - { - BLAKE_Ghost(x, 0x0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); - BLAKE_Ghost(x, 0x2, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); - BLAKE_Ghost(x, 0x4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); - BLAKE_Ghost(x, 0x6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); - BLAKE_Ghost(x, 0x8, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); - BLAKE_Ghost(x, 0xA, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); - BLAKE_Ghost(x, 0xC, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); - BLAKE_Ghost(x, 0xE, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + BLAKE_G_PRE(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + BLAKE_G_PRE(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + BLAKE_G_PRE(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[9], inout[0]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[5], inout[7]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[2], inout[4]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[10], inout[15]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[14], inout[1]); + BLAKE(V.lo.s1, 
V.lo.s6, V.hi.s3, V.hi.s4, inout[11], inout[12]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[6], inout[8]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[3], inout[13]); + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[2], inout[12]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[6], inout[10]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[0], inout[11]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[8], inout[3]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[4], inout[13]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[7], inout[5]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[15], inout[14]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[1], inout[9]); + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[12], inout[5]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[1], inout[15]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[14], inout[13]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[4], inout[10]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[0], inout[7]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[6], inout[3]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[9], inout[2]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[8], inout[11]); + // 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10, + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[13], inout[11]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[7], inout[14]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[12], inout[1]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[3], inout[9]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[5], inout[0]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[15], inout[4]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[8], inout[6]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[2], inout[10]); + // 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5, + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[6], inout[15]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[14], inout[9]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[11], inout[3]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[0], inout[8]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[12], inout[2]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[13], inout[7]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[1], inout[4]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[10], inout[5]); + // 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0, + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[10], inout[2]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[8], inout[4]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[7], inout[6]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[1], inout[5]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[15], inout[11]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[9], inout[14]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[3], inout[12]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[13], inout[0]); + + V.lo ^= V.hi; + V.lo ^= tmpblock; + + ((uint8*)out)[0] = V.lo; +} +#endif /* __CUDA_ARCH__ >= 500 */ + + +static __forceinline__ __device__ +uint16 salsa_small_scalar_rnd(const uint16 &X) +{ + uint16 state = X; + uint32_t t; + + #pragma unroll 1 + for(int i = 0; i < 10; i++) { + SALSA_CORE(state); } - V.lo ^= V.hi ^ tmpblock; + return(X + state); +} - ((uint8*)inout)[0] = V.lo; +static __device__ __forceinline__ +uint16 chacha_small_parallel_rnd(const uint16 &X) +{ + uint16 st = X; + + #pragma nounroll + for(int i = 0; i < 10; i++) { + 
CHACHA_CORE_PARALLEL(st); + } + return(X + st); } +static __device__ __forceinline__ +void neoscrypt_chacha(uint16 *XV) +{ + uint16 temp; + + XV[0] = chacha_small_parallel_rnd(XV[0] ^ XV[3]); + temp = chacha_small_parallel_rnd(XV[1] ^ XV[0]); + XV[1] = chacha_small_parallel_rnd(XV[2] ^ temp); + XV[3] = chacha_small_parallel_rnd(XV[3] ^ XV[1]); + XV[2] = temp; +} + +static __device__ __forceinline__ +void neoscrypt_salsa(uint16 *XV) +{ + uint16 temp; + + XV[0] = salsa_small_scalar_rnd(XV[0] ^ XV[3]); + temp = salsa_small_scalar_rnd(XV[1] ^ XV[0]); + XV[1] = salsa_small_scalar_rnd(XV[2] ^ temp); + XV[3] = salsa_small_scalar_rnd(XV[3] ^ XV[1]); + XV[2] = temp; +} + + +#if __CUDA_ARCH__ < 500 static __forceinline__ __device__ -void fastkdf256(const uint32_t* password, uint8_t* output) +void fastkdf256_v1(int thread, const uint32_t nonce, const uint32_t * const __restrict__ s_data) { - uint8_t bufidx = 0; + uint2x4 output[8]; + uint8_t bufidx; uchar4 bufhelper; - uint8_t A[320],B[288]; - - ((uintx64*)A)[0] = ((uintx64*)password)[0]; - ((uint816 *)A)[4] = ((uint816 *)password)[0]; + uint32_t B[64]; + uint32_t qbuf, rbuf, bitbuf; + uint32_t input[BLAKE2S_BLOCK_SIZE / 4]; + uint32_t key[BLAKE2S_BLOCK_SIZE / 4] = {0}; - ((uintx64*)B)[0] = ((uintx64*)password)[0]; - ((uint48 *)B)[8] = ((uint48 *)password)[0]; + const uint32_t data18 = s_data[18]; + const uint32_t data20 = s_data[0]; - uint32_t input[BLAKE2S_BLOCK_SIZE/4]; uint32_t key[BLAKE2S_BLOCK_SIZE / 4] = { 0 }; + ((uintx64*)(B))[0] = ((uintx64*)s_data)[0]; + ((uint32_t*)B)[19] = nonce; + ((uint32_t*)B)[39] = nonce; + ((uint32_t*)B)[59] = nonce; ((uint816*)input)[0] = ((uint816*)input_init)[0]; ((uint48*)key)[0] = ((uint48*)key_init)[0]; - for (int i = 0; i < 32; ++i) + #pragma unroll 1 + for(int i = 0; i < 31; i++) { bufhelper = ((uchar4*)input)[0]; - for (int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + { bufhelper += ((uchar4*)input)[x]; + } bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; - int qbuf = bufidx/4; - int rbuf = bufidx&3; - int bitbuf = rbuf << 3; + qbuf = bufidx / 4; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + uint32_t shifted[9]; - shift256R(shifted, ((uint8*)input)[0], bitbuf); + shift256R4(shifted, ((uint8*)input)[0], bitbuf); - for (int k = 0; k < 9; ++k) { - ((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k]; + //#pragma unroll + uint32_t temp[9]; + + for(int k = 0; k < 9; ++k) + { + uint32_t indice = (k + qbuf) & 0x3f; + temp[k] = B[indice] ^ shifted[k]; + B[indice] = temp[k]; + } +#if __CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__) + uint32_t a = s_data[qbuf & 0x3f], b; + //#pragma unroll + for(int k = 0; k<16; k+=2) + { + b = s_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = s_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); } - if (bufidx < BLAKE2S_KEY_SIZE) {((uint8*)B)[8] = ((uint8*)B)[0];} - else if (bufidx > FASTKDF_BUFFER_SIZE-BLAKE2S_OUT_SIZE) {((uint8*)B)[0] = ((uint8*)B)[8];} - - if (i<31) { - for (int k = 0; k = 320 + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(((uint32_t*)output)[i]) : "r"(B[(qbuf + i) & 0x3f]), "r"(B[(qbuf + i + 1) & 0x3f4]), "r"(bitbuf)); +#endif + ((ulonglong4*)output)[0] ^= ((ulonglong4*)input)[0]; + + ((uintx64*)output)[0] ^= ((uintx64*)s_data)[0]; + ((uint32_t*)output)[19] ^= nonce; + ((uint32_t*)output)[39] ^= nonce; + ((uint32_t*)output)[59] ^= nonce; + + for(int i = 0; i<8; 
i++) + (Input + 8 * thread)[i] = output[i]; +} +#endif + +#if __CUDA_ARCH__ >= 500 +static __forceinline__ __device__ +void fastkdf256_v2(int thread, const uint32_t nonce, const uint32_t* const __restrict__ s_data) //, uint2x4 * output) +{ + uint2x4 output[8]; + uint8_t bufidx; + uchar4 bufhelper; + const uint32_t data18 = s_data[18]; + const uint32_t data20 = s_data[0]; + uint32_t input[16]; + uint32_t key[16] = {0}; + uint32_t qbuf, rbuf, bitbuf; + +#define Bshift 16*thread + + uint32_t *const B = (uint32_t*)&B2[Bshift]; + ((uintx64*)(B))[0] = ((uintx64*)s_data)[0]; + + B[19] = nonce; + B[39] = nonce; + B[59] = nonce; + + ((ulonglong4*)input)[0] = ((ulonglong4*)input_init)[0]; + ((uint2x4*)key)[0] = ((uint2x4*)key_init)[0]; + + #pragma unroll 1 + for(int i = 0; i < 31; i++) + { + bufhelper = ((uchar4*)input)[0]; + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + bufhelper += ((uchar4*)input)[x]; + } + bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; + + qbuf = bufidx / 4; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + uint32_t shifted[9]; + + shift256R4(shifted, ((uint8*)input)[0], bitbuf); + + uint32_t temp[9]; + + for(int k = 0; k < 9; ++k) + temp[k] = __ldg(&B[(k + qbuf) & 0x3f]) ^ shifted[k]; + + uint32_t a = s_data[qbuf & 0x3f], b; + + //#pragma unroll + for(int k = 0; k<16; k+=2) + { + b = s_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = s_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19 - qbuf % 20; + if(noncepos <= 16 && qbuf<60) + { + if(noncepos != 0) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if(noncepos != 16) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + + Blake2S_v2(input, input, key); + + for(int k = 0; k < 9; k++) + B[(k + qbuf) & 0x3f] = temp[k]; } - for (int i = qleft*4+rleft; i < (qleft+1)*4; ++i) { - ((uint8_t *)output)[i] = ((uint8_t *)B)[i - left] ^ ((uint8_t *)A)[i]; + + bufhelper = ((uchar4*)input)[0]; + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + bufhelper += ((uchar4*)input)[x]; } - for (int i = qleft+1; i < FASTKDF_BUFFER_SIZE/4; ++i) { - ((uchar4 *)output)[i] = make_uchar4(B[4*i - left],B[4*i+1-left], - B[4*i+2-left],B[4*i+3-left]) ^ ((uchar4 *)A)[i]; + bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; + + qbuf = bufidx / 4; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + for(int i = 0; i<64; i++) + { + const uint32_t a = (qbuf + i) & 0x3f, b = (qbuf + i + 1) & 0x3f; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : 
"=r"(((uint32_t*)output)[i]) : "r"(__ldg(&B[a])), "r"(__ldg(&B[b])), "r"(bitbuf)); } + + output[0] ^= ((uint2x4*)input)[0]; + for(int i = 0; i<8; i++) + output[i] ^= ((uint2x4*)s_data)[i]; + + ((uint32_t*)output)[19] ^= nonce; + ((uint32_t*)output)[39] ^= nonce; + ((uint32_t*)output)[59] ^= nonce;; + ((ulonglong16 *)(Input + 8 * thread))[0] = ((ulonglong16*)output)[0]; } +#endif +#if __CUDA_ARCH__ < 500 static __forceinline__ __device__ -void fastkdf32(const uint32_t * password, const uint32_t * salt, uint32_t * output) +void fastkdf32_v1(int thread, const uint32_t nonce, const uint32_t * const __restrict__ salt, const uint32_t *const __restrict__ s_data, uint32_t &output) { - uint8_t bufidx = 0; + uint8_t bufidx; uchar4 bufhelper; + uint32_t temp[9]; - uint8_t A[320]; - uint8_t B[288]; - - // Initialize the password buffer - ((uintx64*)A)[0] = ((uintx64*)password)[0]; - ((uint816*)A)[4] = ((uint816*)password)[0]; - ((uintx64*)B)[0] = ((uintx64*)salt)[0]; - ((uintx64*)B)[1] = ((uintx64*)salt)[0]; +#define Bshift 16*thread - uint32_t input[BLAKE2S_BLOCK_SIZE/4]; - uint32_t key[BLAKE2S_BLOCK_SIZE/4] = { 0 }; + uint32_t* const B0 = (uint32_t*)&B2[Bshift]; + const uint32_t cdata7 = s_data[7]; + const uint32_t data18 = s_data[18]; + const uint32_t data20 = s_data[0]; - ((uint816*)input)[0] = ((uint816*)password)[0]; + ((uintx64*)B0)[0] = ((uintx64*)salt)[0]; + uint32_t input[BLAKE2S_BLOCK_SIZE / 4]; uint32_t key[BLAKE2S_BLOCK_SIZE / 4] = {0}; + ((uint816*)input)[0] = ((uint816*)s_data)[0]; ((uint48*)key)[0] = ((uint48*)salt)[0]; + uint32_t qbuf, rbuf, bitbuf; - for (int i = 0; i < 32; ++i) + #pragma nounroll + for(int i = 0; i < 31; i++) { - Blake2S((uint32_t*)input, key); + Blake2S(input, input, key); bufidx = 0; bufhelper = ((uchar4*)input)[0]; - - for (int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + { bufhelper += ((uchar4*)input)[x]; - + } bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; - int qbuf = bufidx / 4; - int rbuf = bufidx & 3; - int bitbuf = rbuf << 3; + qbuf = bufidx / 4; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; uint32_t shifted[9]; - shift256R(shifted, ((uint8*)input)[0], bitbuf); + shift256R4(shifted, ((uint8*)input)[0], bitbuf); - for (int k = 0; k < 9; ++k) { - ((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k]; + for(int k = 0; k < 9; k++) { + temp[k] = B0[(k + qbuf) & 0x3f]; } - if (i<31) { - if (bufidx < BLAKE2S_KEY_SIZE) {((uint8*)B)[8] = ((uint8*)B)[0];} - else if (bufidx > FASTKDF_BUFFER_SIZE - BLAKE2S_OUT_SIZE) {((uint8*)B)[0] = ((uint8*)B)[8];} - - for (uint8_t k = 0; k < BLAKE2S_BLOCK_SIZE/4; k++) { - ((uchar4*)(input))[k] = make_uchar4( - (A + bufidx)[4 * k], (A + bufidx)[4 * k + 1], - (A + bufidx)[4 * k + 2], (A + bufidx)[4 * k + 3] - ); - } - for (uint8_t k = 0; k < BLAKE2S_KEY_SIZE / 4; k++) { - ((uchar4*)(key))[k] = make_uchar4( - (B + bufidx)[4 * k], (B + bufidx)[4 * k + 1], - (B + bufidx)[4 * k + 2], (B + bufidx)[4 * k + 3] - ); - } + ((uint2x4*)temp)[0] ^= ((uint2x4*)shifted)[0]; + temp[8] ^= shifted[8]; + +#if __CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__) + uint32_t a = s_data[qbuf & 0x3f], b; + //#pragma unroll + for(int k = 0; k<16; k+=2) + { + b = s_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = s_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); } - } - uchar4 unfucked[1]; - unfucked[0] = make_uchar4(B[28 + bufidx], B[29 + bufidx],B[30 + 
bufidx], B[31 + bufidx]); - ((uint32_t*)output)[7] = ((uint32_t*)unfucked)[0] ^ ((uint32_t*)A)[7]; -} + const uint32_t noncepos = 19 - qbuf % 20; + if(noncepos <= 16 && qbuf < 60) + { + if(noncepos != 0) asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if(noncepos != 16) asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); +#else +//#error SM 3.0 code missing here +#endif + for(int k = 0; k < 9; k++) { + B0[(k + qbuf) & 0x3f] = temp[k]; + } + } -#define SALSA(a,b,c,d) { \ - t =a+d; b^=rotateL(t, 7); \ - t =b+a; c^=rotateL(t, 9); \ - t =c+b; d^=rotateL(t, 13); \ - t =d+c; a^=rotateL(t, 18); \ -} + Blake2S(input, input, key); -#define SALSA_CORE(state) { \ - SALSA(state.s0,state.s4,state.s8,state.sc); \ - SALSA(state.s5,state.s9,state.sd,state.s1); \ - SALSA(state.sa,state.se,state.s2,state.s6); \ - SALSA(state.sf,state.s3,state.s7,state.sb); \ - SALSA(state.s0,state.s1,state.s2,state.s3); \ - SALSA(state.s5,state.s6,state.s7,state.s4); \ - SALSA(state.sa,state.sb,state.s8,state.s9); \ - SALSA(state.sf,state.sc,state.sd,state.se); \ -} + bufidx = 0; + bufhelper = ((uchar4*)input)[0]; + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) { + bufhelper += ((uchar4*)input)[x]; + } -#if __CUDA_ARCH__ >=500 -#define CHACHA_STEP(a,b,c,d) { \ - a += b; d = __byte_perm(d^a,0,0x1032); \ - c += d; b = rotateL(b^c, 12); \ - a += b; d = __byte_perm(d^a,0,0x2103); \ - c += d; b = rotateL(b^c, 7); \ -} + bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; + qbuf = bufidx / 4; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + for(int k = 7; k < 9; k++) { + temp[k] = B0[(k + qbuf) & 0x3f]; + } +#if __CUDA_ARCH__ >= 320 + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(output) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); #else -#define CHACHA_STEP(a,b,c,d) { \ - a += b; d = rotateL(d^a,16); \ - c += d; b = rotateL(b^c, 12); \ - a += b; d = rotateL(d^a,8); \ - c += d; b = rotateL(b^c, 7); \ + output = (MAKE_ULONGLONG(temp[7], temp[8]) >> bitbuf); // to check maybe 7/8 reversed +#endif + output ^= input[7] ^ cdata7; } #endif -#define CHACHA_CORE_PARALLEL(state) { \ - CHACHA_STEP(state.lo.s0, state.lo.s4, state.hi.s0, state.hi.s4); \ - CHACHA_STEP(state.lo.s1, state.lo.s5, state.hi.s1, state.hi.s5); \ - CHACHA_STEP(state.lo.s2, state.lo.s6, state.hi.s2, state.hi.s6); \ - CHACHA_STEP(state.lo.s3, state.lo.s7, state.hi.s3, state.hi.s7); \ - CHACHA_STEP(state.lo.s0, state.lo.s5, state.hi.s2, state.hi.s7); \ - CHACHA_STEP(state.lo.s1, state.lo.s6, state.hi.s3, state.hi.s4); \ - CHACHA_STEP(state.lo.s2, state.lo.s7, state.hi.s0, state.hi.s5); \ - CHACHA_STEP(state.lo.s3, state.lo.s4, state.hi.s1, state.hi.s6); \ -} +#if 
__CUDA_ARCH__ >= 500 +static __forceinline__ __device__ +void fastkdf32_v3(int thread, const uint32_t nonce, const uint32_t* __restrict__ salt, const uint32_t* __restrict__ s_data, uint32_t &output) +{ + uint32_t temp[9]; + uint8_t bufidx; + uchar4 bufhelper; +#define Bshift 16*thread -static __forceinline__ __device__ uint16 salsa_small_scalar_rnd(const uint16 &X) -{ - uint16 state = X; - uint32_t t; + uint32_t*const B0 = (uint32_t*)&B2[Bshift]; + const uint32_t cdata7 = s_data[7]; + const uint32_t data18 = s_data[18]; + const uint32_t data20 = s_data[0]; - for (int i = 0; i < 10; ++i) { SALSA_CORE(state);} + ((uintx64*)B0)[0] = ((uintx64*)salt)[0]; + uint32_t input[BLAKE2S_BLOCK_SIZE / 4]; uint32_t key[BLAKE2S_BLOCK_SIZE / 4] = {0}; + ((uint816*)input)[0] = ((uint816*)s_data)[0]; + ((uint48*)key)[0] = ((uint48*)salt)[0]; + uint32_t qbuf, rbuf, bitbuf; - return(X + state); + #pragma nounroll + for(int i = 0; i < 31; i++) + { + Blake2S_v2(input, input, key); + + bufidx = 0; + bufhelper = ((uchar4*)input)[0]; + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + bufhelper += ((uchar4*)input)[x]; + } + bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; + qbuf = bufidx / 4; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + uint32_t shifted[9]; + + shift256R4(shifted, ((uint8*)input)[0], bitbuf); + + for(int k = 0; k < 9; k++) + { + temp[k] = __ldg(&B0[(k + qbuf) & 0x3f]); + } + + ((uint2x4*)temp)[0] ^= ((uint2x4*)shifted)[0]; + temp[8] ^= shifted[8]; + + uint32_t a = s_data[qbuf & 0x3f], b; + //#pragma unroll + for(int k = 0; k<16; k+=2) + { + b = s_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = s_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19 - qbuf % 20; + if(noncepos <= 16 && qbuf<60) + { + if(noncepos != 0) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if(noncepos != 16) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + + for(int k = 0; k < 9; k++) + { + B0[(k + qbuf) & 0x3f] = temp[k]; + } + } + + Blake2S_v2(input, input, key); + + bufidx = 0; + bufhelper = ((uchar4*)input)[0]; + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + bufhelper += ((uchar4*)input)[x]; + } + bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; + qbuf = bufidx / 4; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + temp[7] = __ldg(&B0[(qbuf + 7) & 0x3f]); + temp[8] = __ldg(&B0[(qbuf + 8) & 0x3f]); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(output) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + output ^= 
input[7] ^ cdata7; } +#endif -static __device__ __forceinline__ uint16 chacha_small_parallel_rnd(const uint16 &X) -{ - uint16 st = X; - for (int i = 0; i < 10; ++i) {CHACHA_CORE_PARALLEL(st);} - return(X + st); +#define BLAKE_Ghost(idx0, idx1, a, b, c, d, key) { \ + idx = BLAKE2S_SIGMA_host[idx0][idx1]; a += key[idx]; \ + a += b; d = ROTR32(d^a,16); \ + c += d; b = ROTR32(b^c, 12); \ + idx = BLAKE2S_SIGMA_host[idx0][idx1+1]; a += key[idx]; \ + a += b; d = ROTR32(d^a,8); \ + c += d; b = ROTR32(b^c, 7); \ } -static __device__ __forceinline__ void neoscrypt_chacha(uint16 *XV) +static void Blake2Shost(uint32_t * inout, const uint32_t * inkey) { - XV[0] ^= XV[3]; - uint16 temp; + uint16 V; + uint32_t idx; + uint8 tmpblock; - XV[0] = chacha_small_parallel_rnd(XV[0]); XV[1] ^= XV[0]; - temp = chacha_small_parallel_rnd(XV[1]); XV[2] ^= temp; - XV[1] = chacha_small_parallel_rnd(XV[2]); XV[3] ^= XV[1]; - XV[3] = chacha_small_parallel_rnd(XV[3]); - XV[2] = temp; + V.hi = BLAKE2S_IV_Vechost; + V.lo = BLAKE2S_IV_Vechost; + V.lo.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; + + for(int x = 0; x < 10; ++x) + { + BLAKE_Ghost(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inkey); + BLAKE_Ghost(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inkey); + BLAKE_Ghost(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inkey); + BLAKE_Ghost(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inkey); + BLAKE_Ghost(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inkey); + BLAKE_Ghost(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inkey); + BLAKE_Ghost(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inkey); + BLAKE_Ghost(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inkey); + } + + V.lo ^= V.hi; + V.lo ^= tmpblock; + + V.hi = BLAKE2S_IV_Vechost; + tmpblock = V.lo; + + V.hi.s4 ^= 128; + V.hi.s6 = ~V.hi.s6; + + for(int x = 0; x < 10; ++x) + { + BLAKE_Ghost(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_Ghost(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_Ghost(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_Ghost(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_Ghost(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_Ghost(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_Ghost(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_Ghost(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + } + + V.lo ^= V.hi ^ tmpblock; + + ((uint8*)inout)[0] = V.lo; } -static __device__ __forceinline__ void neoscrypt_salsa(uint16 *XV) + +#define SHIFT 128 +#define TPB 128 +#define TPB2 64 + +__global__ +__launch_bounds__(TPB2, 1) +void neoscrypt_gpu_hash_start(int stratum, int threads, uint32_t startNonce) { - XV[0] ^= XV[3]; - uint16 temp; + __shared__ uint32_t s_data[64]; - XV[0] = salsa_small_scalar_rnd(XV[0]); XV[1] ^= XV[0]; - temp = salsa_small_scalar_rnd(XV[1]); XV[2] ^= temp; - XV[1] = salsa_small_scalar_rnd(XV[2]); XV[3] ^= XV[1]; - XV[3] = salsa_small_scalar_rnd(XV[3]); - XV[2] = temp; -} +#if TPB2<64 +#error TPB2 too low +#else +#if TPB2>64 + if(threadIdx.x<64) +#endif +#endif + s_data[threadIdx.x] = c_data[threadIdx.x]; + __syncthreads(); + const int thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t nonce = startNonce + thread; + const uint32_t ZNonce = (stratum) ? cuda_swab32(nonce) : nonce; //freaking morons !!! 
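	// --- Added note, not part of the patch ---------------------------------
	// ZNonce: when stratum work is scanned the per-thread nonce is byte-swapped
	// (cuda_swab32) before FastKDF consumes it, matching the CPU-side
	// be32enc(&endiandata[19], ...) used for validation; getwork data uses the
	// nonce as-is.
	//
	// The FastKDF routines in this file index their 256-byte ring buffer with
	// bufidx = (sum of the 32 Blake2s output bytes) mod 256, split into a word
	// index (qbuf = bufidx/4), a byte remainder (rbuf = bufidx & 3) and a bit
	// shift (bitbuf = rbuf*8). The shf.r.clamp.b32 funnel shifts then read a
	// 32-bit value that straddles two buffer words. A plain-C reference of that
	// read (funnel_read is a hypothetical helper, shown only to document the
	// PTX, not code from the patch):
	//
	//   static uint32_t funnel_read(const uint32_t *buf, uint32_t bufidx)
	//   {
	//       uint32_t qbuf   = bufidx / 4;          // 32-bit word index
	//       uint32_t bitbuf = (bufidx & 3) * 8;    // bit offset inside the word
	//       uint64_t pair   = ((uint64_t)buf[(qbuf + 1) & 0x3f] << 32)
	//                       |  (uint64_t)buf[qbuf & 0x3f];    // wrap at 64 words
	//       return (uint32_t)(pair >> bitbuf);     // == shf.r.clamp.b32
	//   }
	// ------------------------------------------------------------------------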
-#define SHIFT 130 +#if __CUDA_ARCH__ < 500 + fastkdf256_v1(thread, ZNonce, s_data); +#else + fastkdf256_v2(thread, ZNonce, s_data); +#endif +} -__global__ __launch_bounds__(128, 1) -void neoscrypt_gpu_hash_k0(uint32_t threads, uint32_t startNonce, int stratum) +__global__ +__launch_bounds__(TPB, 1) +void neoscrypt_gpu_hash_chacha1_stream1(int threads, uint32_t startNonce) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); -// if (thread < threads) + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + const int shift = SHIFT * 8 * thread; + const unsigned int shiftTr = 8 * thread; + + uint2x4 X[8]; + for(int i = 0; i<8; i++) + X[i] = __ldg4(&(Input + shiftTr)[i]); + + #pragma nounroll + for(int i = 0; i < 128; i++) { - uint32_t data[80]; - uint16 X[4]; - uint32_t shift = thread * SHIFT * 16; - const uint32_t nonce = startNonce + thread; - - for (int i = 0; i<20; i++) { - ((uint4*)data)[i] = ((uint4 *)c_data)[i]; - } //ld.local.v4 - data[19] = (stratum) ? cuda_swab32(nonce) : nonce; //freaking morons !!! - data[39] = data[19]; - data[59] = data[19]; - - fastkdf256(data, (uint8_t*)X); - - ((uintx64 *)(W + shift))[0] = ((uintx64 *)X)[0]; -// ((ulonglong16 *)(W + shift))[0] = ((ulonglong16 *)X)[0]; + uint32_t offset = shift + i * 8; + for(int j = 0; j<8; j++) + (W + offset)[j] = X[j]; + neoscrypt_chacha((uint16*)X); } + + for(int i = 0; i<8; i++) + (Tr + shiftTr)[i] = X[i]; } -__global__ __launch_bounds__(128, 1) -void neoscrypt_gpu_hash_k01(uint32_t threads, uint32_t startNonce) +__global__ +__launch_bounds__(TPB, 1) +void neoscrypt_gpu_hash_chacha2_stream1(int threads, uint32_t startNonce) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); -// if (thread < threads) + const int thread = (blockDim.x * blockIdx.x + threadIdx.x); + const int shift = SHIFT * 8 * thread; + const int shiftTr = 8 * thread; + + uint2x4 X[8]; + #pragma unroll + for(int i = 0; i<8; i++) + X[i] = __ldg4(&(Tr + shiftTr)[i]); + + #pragma nounroll + for(int t = 0; t < 128; t++) { - uint16 X[4]; - uint32_t shift = thread * SHIFT * 16; - ((uintx64 *)X)[0]= ldg256(&(W + shift)[0]); + int idx = (X[6].x.x & 0x7F) << 3; - //#pragma unroll - for (int i = 0; i < 128; ++i) - { - neoscrypt_chacha(X); - ((ulonglong16 *)(W + shift))[i+1] = ((ulonglong16 *)X)[0]; -// ((uintx64 *)(W + shift))[i + 1] = ((uintx64 *)X)[0]; - } + for(int j = 0; j<8; j++) + X[j] ^= __ldg4(&(W + shift + idx)[j]); + neoscrypt_chacha((uint16*)X); } + #pragma unroll + for(int i = 0; i<8; i++) + (Tr + shiftTr)[i] = X[i]; // best checked } -__global__ __launch_bounds__(128, 1) -void neoscrypt_gpu_hash_k2(uint32_t threads, uint32_t startNonce) +__global__ +__launch_bounds__(TPB, 1) +void neoscrypt_gpu_hash_salsa1_stream1(int threads, uint32_t startNonce) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); -// if (thread < threads) - { - uint16 X[4]; - uint32_t shift = thread * SHIFT * 16; - ((uintx64 *)X)[0] = ldg256(&(W + shift)[2048]); + const int thread = (blockDim.x * blockIdx.x + threadIdx.x); + const int shift = SHIFT * 8 * thread; + const int shiftTr = 8 * thread; - for (int t = 0; t < 128; t++) - { - int idx = X[3].lo.s0 & 0x7F; - ((uintx64 *)X)[0] ^= ldg256(&(W + shift)[idx << 4]); - neoscrypt_chacha(X); - - } - ((uintx64 *)(W + shift))[129] = ((uintx64*)X)[0]; // best checked + uint2x4 Z[8]; + #pragma unroll + for(int i = 0; i<8; i++) + Z[i] = __ldg4(&(Input + shiftTr)[i]); + #pragma nounroll + for(int i = 0; i < 128; i++) + { + for(int j = 0; j<8; j++) + (W2 + shift + i * 8)[j] = Z[j]; + neoscrypt_salsa((uint16*)Z); 
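		// Added note, not part of the patch: each of the 128 iterations of this
		// loop first stores the current 256-byte state Z (8 x uint2x4) into this
		// thread's slice of the W2 scratchpad at element offset shift + i*8, then
		// mixes it with neoscrypt_salsa (the Salsa20/20 block mix). The second
		// salsa kernel below indexes back into W2 with (X[6].x.x & 0x7F) << 3 to
		// realise the sequential-memory-hard phase.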
} + #pragma unroll + for(int i = 0; i<8; i++) + (Tr2 + shiftTr)[i] = Z[i]; } -__global__ __launch_bounds__(128, 1) -void neoscrypt_gpu_hash_k3(uint32_t threads, uint32_t startNonce) +__global__ +__launch_bounds__(TPB, 1) +void neoscrypt_gpu_hash_salsa2_stream1(int threads, uint32_t startNonce) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); -// if (thread < threads) - { - uint32_t shift = thread * SHIFT * 16; - uint16 Z[4]; + const int thread = (blockDim.x * blockIdx.x + threadIdx.x); + const int shift = SHIFT * 8 * thread; + const int shiftTr = 8 * thread; - ((uintx64*)Z)[0] = ldg256(&(W + shift)[0]); + uint2x4 X[8]; + #pragma unroll + for(int i = 0; i<8; i++) + X[i] = __ldg4(&(Tr2 + shiftTr)[i]); - //#pragma unroll - for (int i = 0; i < 128; ++i) { - neoscrypt_salsa(Z); - ((ulonglong16 *)(W + shift))[i+1] = ((ulonglong16 *)Z)[0]; -// ((uintx64 *)(W + shift))[i + 1] = ((uintx64 *)Z)[0]; - } + #pragma nounroll + for(int t = 0; t < 128; t++) + { + int idx = (X[6].x.x & 0x7F) << 3; + + for(int j = 0; j<8; j++) + X[j] ^= __ldg4(&(W2 + shift + idx)[j]); + neoscrypt_salsa((uint16*)X); } + #pragma unroll + for(int i = 0; i<8; i++) + (Tr2 + shiftTr)[i] = X[i]; // best checked } -__global__ __launch_bounds__(128, 1) -void neoscrypt_gpu_hash_k4(uint32_t threads, uint32_t startNonce, uint32_t *nonceRes, int stratum) +__global__ +__launch_bounds__(TPB2, 8) +void neoscrypt_gpu_hash_ending(int stratum, int threads, uint32_t startNonce, uint32_t *nonceVector) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - const uint32_t nonce = startNonce + thread; + __shared__ uint32_t s_data[64]; - uint32_t shift = thread * SHIFT * 16; - uint16 Z[4]; - uint32_t outbuf[8]; - uint32_t data[80]; +#if TPB2<64 +#error TPB2 too low +#elif TPB2>64 + if(threadIdx.x<64) +#endif + s_data[threadIdx.x] = c_data[threadIdx.x]; + __syncthreads(); + const int thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t nonce = startNonce + thread; - for (int i=0; i<20; i++) { - ((uint4*)data)[i] = ((uint4 *)c_data)[i]; - } + const int shiftTr = 8 * thread; + uint2x4 Z[8]; + uint32_t outbuf; - data[19] = (stratum) ? cuda_swab32(nonce) : nonce; - data[39] = data[19]; - data[59] = data[19]; - ((uintx64 *)Z)[0] = ldg256(&(W + shift)[2048]); - for (int t = 0; t < 128; t++) - { - int idx = Z[3].lo.s0 & 0x7F; - ((uintx64 *)Z)[0] ^= ldg256(&(W + shift)[idx << 4]); - neoscrypt_salsa(Z); - } - ((uintx64 *)Z)[0] ^= ldg256(&(W + shift)[2064]); - fastkdf32(data, (uint32_t*)Z, outbuf); -#if __CUDA_ARCH__ < 320 - // workaround required when using SM 3.0 shift256R() func (tested on SM 5.0) - if (thread == 0) - printf("", outbuf[7]); + const uint32_t ZNonce = (stratum) ? 
cuda_swab32(nonce) : nonce; + + #pragma unroll + for(int i = 0; i<8; i++) + Z[i] = __ldg4(&(Tr2 + shiftTr)[i]) ^ __ldg4(&(Tr + shiftTr)[i]); + +#if __CUDA_ARCH__ < 500 + fastkdf32_v1(thread, ZNonce, (uint32_t*)Z, s_data, outbuf); +#else + fastkdf32_v3(thread, ZNonce, (uint32_t*)Z, s_data, outbuf); #endif - if (outbuf[7] <= pTarget[7]) { - atomicMin(nonceRes, nonce); // init val is UINT32_MAX - } + + if(outbuf <= pTarget[7]) + { + uint32_t tmp = atomicExch(nonceVector, nonce); + if(tmp != UINT32_MAX) + nonceVector[1] = tmp; } } +static __thread uint32_t *hash1 = NULL; +static __thread uint32_t *hash2 = NULL; // 2 streams +static __thread uint32_t *Trans1 = NULL; +static __thread uint32_t *Trans2 = NULL; // 2 streams +static __thread uint32_t *Trans3 = NULL; // 2 streams +static __thread uint32_t *Bhash = NULL; + __host__ -void neoscrypt_cpu_init(int thr_id, uint32_t threads) +void neoscrypt_init_2stream(int thr_id, uint32_t threads) { - cuda_get_arch(thr_id); - cudaMalloc(&d_NNonce[thr_id], sizeof(uint32_t)); - CUDA_SAFE_CALL(cudaMalloc(&d_buffer[thr_id], (size_t) 256 * SHIFT * threads)); - cudaMemcpyToSymbol(W, &d_buffer[thr_id], sizeof(uint4*), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(BLAKE2S_SIGMA, BLAKE2S_SIGMA_host, sizeof(BLAKE2S_SIGMA_host), 0, cudaMemcpyHostToDevice); + CUDA_SAFE_CALL(cudaStreamCreate(&stream[0])); + CUDA_SAFE_CALL(cudaStreamCreate(&stream[1])); + + CUDA_SAFE_CALL(cudaMalloc(&d_NNonce[thr_id], 2 * sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMalloc(&hash1, 32 * 128 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&hash2, 32 * 128 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&Trans1, 32 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&Trans2, 32 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&Trans3, 32 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&Bhash, 128 * sizeof(uint32_t) * threads)); + + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(B2, &Bhash, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice, stream[0])); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(W, &hash1, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice, stream[0])); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(W2, &hash2, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice, stream[0])); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(Tr, &Trans1, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice, stream[0])); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(Tr2, &Trans2, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice, stream[0])); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(Input, &Trans3, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice, stream[0])); } __host__ -void neoscrypt_cpu_free(int thr_id) +void neoscrypt_free_2stream(int thr_id) { cudaFree(d_NNonce[thr_id]); - cudaFree(d_buffer[thr_id]); + + cudaFree(hash1); + cudaFree(hash2); + cudaFree(Trans1); + cudaFree(Trans2); + cudaFree(Trans3); + cudaFree(Bhash); + + cudaStreamDestroy(stream[0]); + CUDA_SAFE_CALL(cudaStreamDestroy(stream[1])); } __host__ -uint32_t neoscrypt_cpu_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, int have_stratum, int order) +void neoscrypt_hash_k4_2stream(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *result, bool stratum) { - uint32_t result[MAX_GPUS]; - memset(result, 0xff, sizeof(result)); - cudaMemset(d_NNonce[thr_id], 0xff, sizeof(uint32_t)); + CUDA_SAFE_CALL(cudaMemsetAsync(d_NNonce[thr_id], 0xff, 2 * sizeof(uint32_t), stream[1])); + + const int threadsperblock = TPB; - const uint32_t threadsperblock = 128; dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 
block(threadsperblock); - neoscrypt_gpu_hash_k0 <<< grid, block >>>(threads, startNounce, have_stratum); - neoscrypt_gpu_hash_k01 <<< grid, block >>>(threads, startNounce); - neoscrypt_gpu_hash_k2 <<< grid, block >>>(threads, startNounce); - neoscrypt_gpu_hash_k3 <<< grid, block >>>(threads, startNounce); - neoscrypt_gpu_hash_k4 <<< grid, block >>>(threads, startNounce, d_NNonce[thr_id], have_stratum); + const int threadsperblock2 = TPB2; + dim3 grid2((threads + threadsperblock2 - 1) / threadsperblock2); + dim3 block2(threadsperblock2); + + neoscrypt_gpu_hash_start <<>> (stratum, threads, startNounce); //fastkdf + + CUDA_SAFE_CALL(cudaStreamSynchronize(stream[0])); + + neoscrypt_gpu_hash_salsa1_stream1 <<>> (threads, startNounce); + neoscrypt_gpu_hash_chacha1_stream1 <<>> (threads, startNounce); - MyStreamSynchronize(NULL, order, thr_id); - cudaMemcpy(&result[thr_id], d_NNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + neoscrypt_gpu_hash_salsa2_stream1 <<>> (threads, startNounce); + neoscrypt_gpu_hash_chacha2_stream1 <<>> (threads, startNounce); - return result[thr_id]; + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + + neoscrypt_gpu_hash_ending <<>> (stratum, threads, startNounce, d_NNonce[thr_id]); //fastkdf+end + + CUDA_SAFE_CALL(cudaMemcpy(result, d_NNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); } __host__ void neoscrypt_setBlockTarget(uint32_t* pdata, const void *target) { - unsigned char PaddedMessage[80*4]; //bring balance to the force - uint32_t input[16], key[16] = { 0 }; + uint32_t PaddedMessage[64]; + uint32_t input[16], key[16] = {0}; + + for(int i = 0; i < 19; i++) + { + PaddedMessage[i ] = pdata[i]; + PaddedMessage[i + 20] = pdata[i]; + PaddedMessage[i + 40] = pdata[i]; + } + for(int i = 0; i<4; i++) + PaddedMessage[i + 60] = pdata[i]; - memcpy(PaddedMessage, pdata, 80); - memcpy(PaddedMessage + 80, pdata, 80); - memcpy(PaddedMessage + 160, pdata, 80); - memcpy(PaddedMessage + 240, pdata, 80); + PaddedMessage[19] = 0; + PaddedMessage[39] = 0; + PaddedMessage[59] = 0; ((uint16*)input)[0] = ((uint16*)pdata)[0]; ((uint8*)key)[0] = ((uint8*)pdata)[0]; - Blake2Shost(input,key); + Blake2Shost(input, key); cudaMemcpyToSymbol(pTarget, target, 32, 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(input_init, input, sizeof(input), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(key_init, key, sizeof(key), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(input_init, input, 64, 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(key_init, key, 64, 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(c_data, PaddedMessage, 80*4, 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_data, PaddedMessage, 64 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice); + CUDA_SAFE_CALL(cudaGetLastError()); } - diff --git a/neoscrypt/neoscrypt.cpp b/neoscrypt/neoscrypt.cpp index b0cb1cb..b710cdc 100644 --- a/neoscrypt/neoscrypt.cpp +++ b/neoscrypt/neoscrypt.cpp @@ -3,9 +3,10 @@ #include "neoscrypt/neoscrypt.h" extern void neoscrypt_setBlockTarget(uint32_t * data, const void *ptarget); -extern void neoscrypt_cpu_init(int thr_id, uint32_t threads); -extern void neoscrypt_cpu_free(int thr_id); -extern uint32_t neoscrypt_cpu_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, int have_stratum, int order); + +extern void neoscrypt_init_2stream(int thr_id, uint32_t threads); +extern void neoscrypt_free_2stream(int thr_id); +extern void neoscrypt_hash_k4_2stream(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *result, bool stratum); static bool init[MAX_GPUS] = { 0 }; @@ -35,12 
+36,12 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign cudaGetLastError(); // reset errors if device is not "reset" if (device_sm[dev_id] <= 300) { - applog(LOG_ERR, "Sorry neoscrypt is not supported on SM 3.0 devices"); + gpulog(LOG_ERR, thr_id, "Sorry neoscrypt is not supported on SM 3.0 devices"); proper_exit(EXIT_CODE_CUDA_ERROR); } - applog(LOG_INFO, "GPU #%d: Using %d cuda threads", dev_id, throughput); - neoscrypt_cpu_init(thr_id, throughput); + gpulog(LOG_INFO, thr_id, "Using %d cuda threads", throughput); + neoscrypt_init_2stream(thr_id, throughput); init[thr_id] = true; } @@ -56,26 +57,28 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign neoscrypt_setBlockTarget(endiandata,ptarget); do { - uint32_t foundNonce = neoscrypt_cpu_hash_k4(thr_id, throughput, pdata[19], have_stratum, 0); - if (foundNonce != UINT32_MAX) - { - uint32_t _ALIGN(64) vhash64[8]; + uint32_t foundNonces[2] = { UINT32_MAX, UINT32_MAX }; + neoscrypt_hash_k4_2stream(thr_id, throughput, pdata[19], foundNonces, have_stratum); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce + throughput; + + if (foundNonces[0] != UINT32_MAX) + { + uint32_t _ALIGN(64) vhash[8]; if (have_stratum) { - be32enc(&endiandata[19], foundNonce); + be32enc(&endiandata[19], foundNonces[0]); } else { - endiandata[19] = foundNonce; + endiandata[19] = foundNonces[0]; } - neoscrypt((uchar*)vhash64, (uchar*) endiandata, 0x80000620U); + neoscrypt((uchar*)vhash, (uchar*) endiandata, 0x80000620U); - if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) { - work_set_target_ratio(work, vhash64); - pdata[19] = foundNonce; + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work_set_target_ratio(work, vhash); + pdata[19] = foundNonces[0]; return 1; } else { - gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce); + gpulog(LOG_WARNING, thr_id, "nonce %08x does not validate on CPU!", foundNonces[0]); } } @@ -100,7 +103,7 @@ void free_neoscrypt(int thr_id) cudaThreadSynchronize(); - neoscrypt_cpu_free(thr_id); + neoscrypt_free_2stream(thr_id); init[thr_id] = false; cudaDeviceSynchronize(); diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu index 684a50b..91a0e11 100644 --- a/quark/quarkcoin.cu +++ b/quark/quarkcoin.cu @@ -294,6 +294,7 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce, // cleanup extern "C" void free_quark(int thr_id) { + int dev_id = device_map[thr_id]; if (!init[thr_id]) return; @@ -301,9 +302,13 @@ extern "C" void free_quark(int thr_id) cudaFree(d_hash[thr_id]); - cudaFree(d_branch1Nonces[thr_id]); - cudaFree(d_branch2Nonces[thr_id]); - cudaFree(d_branch3Nonces[thr_id]); + if (cuda_arch[dev_id] >= 300) { + cudaFree(d_branch1Nonces[thr_id]); + cudaFree(d_branch2Nonces[thr_id]); + cudaFree(d_branch3Nonces[thr_id]); + } else { + cudaFree(d_hash_br2[thr_id]); + } quark_blake512_cpu_free(thr_id); quark_groestl512_cpu_free(thr_id);
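
For reference, a condensed sketch of how the new two-stream, two-result neoscrypt
host interface introduced above is driven. The extern declarations come from the
hunks in this patch; the wrapper neoscrypt_scan_sketch, its arguments and the
handling of the second returned nonce are illustrative assumptions, not code from
ccminer (the hunk above only validates foundNonces[0]).

    #include <cstdint>

    extern void neoscrypt_setBlockTarget(uint32_t *data, const void *ptarget);
    extern void neoscrypt_init_2stream(int thr_id, uint32_t threads);
    extern void neoscrypt_free_2stream(int thr_id);
    extern void neoscrypt_hash_k4_2stream(int thr_id, uint32_t threads,
                                          uint32_t startNounce, uint32_t *result,
                                          bool stratum);

    // Hypothetical driver loop: init once, set the work, scan nonce ranges,
    // then release the per-thread buffers and streams.
    static void neoscrypt_scan_sketch(int thr_id, uint32_t *endiandata,
                                      const uint32_t *ptarget, uint32_t throughput,
                                      uint32_t first_nonce, uint32_t max_nonce,
                                      bool stratum)
    {
        neoscrypt_init_2stream(thr_id, throughput);      // once per GPU thread
        neoscrypt_setBlockTarget(endiandata, ptarget);   // once per work item

        for (uint32_t nonce = first_nonce; nonce < max_nonce; nonce += throughput)
        {
            uint32_t found[2] = { UINT32_MAX, UINT32_MAX };
            neoscrypt_hash_k4_2stream(thr_id, throughput, nonce, found, stratum);

            // d_NNonce can hold two candidates: atomicExch() keeps the latest
            // nonce in slot 0 and the displaced value is written to slot 1.
            // Each entry that is not 0xffffffff would be re-hashed on the CPU
            // and checked against ptarget before being submitted.
            for (int i = 0; i < 2; i++)
            {
                if (found[i] != UINT32_MAX)
                {
                    /* CPU-validate found[i] here, as scanhash_neoscrypt does
                       for foundNonces[0] in the hunk above. */
                }
            }
        }

        neoscrypt_free_2stream(thr_id);                  // on shutdown
    }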