From f262850270c377690c4348c3f600954c48dc3a99 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 16 Jul 2016 18:54:00 +0200 Subject: [PATCH] nanashi r10 with proper utf8 --- Algo256/cuda_blake256.cu | 661 +++++++++-- Algo256/cuda_bmw256.cu | 304 +++-- Algo256/cuda_cubehash256.cu | 482 ++++---- Algo256/cuda_skein256.cu | 451 ++++++-- ccminer.cpp | 582 +++++++++- ccminer.vcxproj | 35 +- ccminer.vcxproj.filters | 8 +- configure.ac | 2 +- cuda_helper.h | 30 +- lyra2/cuda_lyra2.cu | 627 +++++++--- lyra2/cuda_lyra2_sm2.cuh | 7 +- lyra2/cuda_lyra2_sm5.cuh | 701 ++++++++++++ lyra2/cuda_lyra2v2.cu | 656 +++++++---- lyra2/cuda_lyra2v2_sm3.cuh | 338 ------ lyra2/lyra2RE.cu | 63 +- lyra2/lyra2REv2.cu | 180 ++- miner.h | 4 + neoscrypt/cuda_neoscrypt.cu | 1834 +++++++++++++++++++++--------- neoscrypt/cuda_vectors.h | 4 +- neoscrypt/neoscrypt.cpp | 80 +- nvml.cpp | 4 +- quark/cuda_quark_blake512_sp.cuh | 7 +- util.cpp | 2 +- 23 files changed, 5150 insertions(+), 1912 deletions(-) create mode 100644 lyra2/cuda_lyra2_sm5.cuh delete mode 100644 lyra2/cuda_lyra2v2_sm3.cuh diff --git a/Algo256/cuda_blake256.cu b/Algo256/cuda_blake256.cu index c3326e6..78c038a 100644 --- a/Algo256/cuda_blake256.cu +++ b/Algo256/cuda_blake256.cu @@ -8,17 +8,28 @@ extern "C" { } #include "cuda_helper.h" - #include -static __device__ uint64_t cuda_swab32ll(uint64_t x) { - return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x))); +#define UINT2(x,y) make_uint2(x,y) + +__device__ __inline__ uint2 ROR8(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x0765); + result.y = __byte_perm(a.x, a.y, 0x0765); + + return result; } -__constant__ static uint32_t c_data[3+1]; -__constant__ static uint32_t sigma[16][16]; -static uint32_t c_sigma[16][16] = { +//static __device__ uint64_t cuda_swab32ll(uint64_t x) { +// return MAKE_ULONGLONG(cuda_swab32(_LOWORD(x)), cuda_swab32(_HIWORD(x))); +//} + +__constant__ static uint32_t c_data[3]; + +//__constant__ static uint8_t sigma[16][16]; +static uint8_t c_sigma[16][16] = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, @@ -46,7 +57,7 @@ static const uint32_t c_IV256[8] = { __device__ __constant__ static uint32_t cpu_h[8]; -__device__ __constant__ static uint32_t u256[16]; +//__device__ __constant__ static uint32_t u256[16]; static const uint32_t c_u256[16] = { 0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344, @@ -59,24 +70,22 @@ static const uint32_t c_u256[16] = { }; #define GS2(a,b,c,d,x) { \ - const uint32_t idx1 = sigma[r][x]; \ - const uint32_t idx2 = sigma[r][x+1]; \ + const uint8_t idx1 = sigma[r][x]; \ + const uint8_t idx2 = sigma[r][x+1]; \ v[a] += (m[idx1] ^ u256[idx2]) + v[b]; \ - v[d] = SPH_ROTL32(v[d] ^ v[a], 16); \ + v[d] = __byte_perm(v[d] ^ v[a],0, 0x1032); \ v[c] += v[d]; \ v[b] = SPH_ROTR32(v[b] ^ v[c], 12); \ \ v[a] += (m[idx2] ^ u256[idx1]) + v[b]; \ - v[d] = SPH_ROTR32(v[d] ^ v[a], 8); \ + v[d] = __byte_perm(v[d] ^ v[a],0, 0x0321); \ v[c] += v[d]; \ v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \ } -//#define ROTL32(x, n) ((x) << (n)) | ((x) >> (32 - (n))) -//#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) #define hostGS(a,b,c,d,x) { \ - const uint32_t idx1 = c_sigma[r][x]; \ - const uint32_t idx2 = c_sigma[r][x+1]; \ + const uint8_t idx1 = c_sigma[r][x]; \ + const uint8_t idx2 = c_sigma[r][x+1]; \ v[a] += (m[idx1] ^ c_u256[idx2]) + v[b]; \ v[d] = ROTR32(v[d] ^ v[a], 16); \ v[c] += v[d]; \ @@ -86,14 +95,47 @@ 
static const uint32_t c_u256[16] = { v[d] = ROTR32(v[d] ^ v[a], 8); \ v[c] += v[d]; \ v[b] = ROTR32(v[b] ^ v[c], 7); \ - } + } -/* Second part (64-80) msg never change, store it */ -__device__ __constant__ static const uint32_t c_Padding[16] = { - 0, 0, 0, 0, - 0x80000000, 0, 0, 0, - 0, 0, 0, 0, - 0, 1, 0, 640, +#define GSPREC(a,b,c,d,x,y) { \ + v[a] += (m[x] ^ u256[y]) + v[b]; \ + v[d] = __byte_perm(v[d] ^ v[a],0, 0x1032); \ + v[c] += v[d]; \ + v[b] = SPH_ROTR32(v[b] ^ v[c], 12); \ + v[a] += (m[y] ^ u256[x]) + v[b]; \ + v[d] = __byte_perm(v[d] ^ v[a],0, 0x0321); \ + v[c] += v[d]; \ + v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \ + } + +__constant__ uint64_t keccak_round_constants[24] = { + 0x0000000000000001ull, 0x0000000000008082ull, + 0x800000000000808aull, 0x8000000080008000ull, + 0x000000000000808bull, 0x0000000080000001ull, + 0x8000000080008081ull, 0x8000000000008009ull, + 0x000000000000008aull, 0x0000000000000088ull, + 0x0000000080008009ull, 0x000000008000000aull, + 0x000000008000808bull, 0x800000000000008bull, + 0x8000000000008089ull, 0x8000000000008003ull, + 0x8000000000008002ull, 0x8000000000000080ull, + 0x000000000000800aull, 0x800000008000000aull, + 0x8000000080008081ull, 0x8000000000008080ull, + 0x0000000080000001ull, 0x8000000080008008ull +}; + +__constant__ uint2 keccak_round_constants35[24] = { + { 0x00000001ul, 0x00000000 }, { 0x00008082ul, 0x00000000 }, + { 0x0000808aul, 0x80000000 }, { 0x80008000ul, 0x80000000 }, + { 0x0000808bul, 0x00000000 }, { 0x80000001ul, 0x00000000 }, + { 0x80008081ul, 0x80000000 }, { 0x00008009ul, 0x80000000 }, + { 0x0000008aul, 0x00000000 }, { 0x00000088ul, 0x00000000 }, + { 0x80008009ul, 0x00000000 }, { 0x8000000aul, 0x00000000 }, + { 0x8000808bul, 0x00000000 }, { 0x0000008bul, 0x80000000 }, + { 0x00008089ul, 0x80000000 }, { 0x00008003ul, 0x80000000 }, + { 0x00008002ul, 0x80000000 }, { 0x00000080ul, 0x80000000 }, + { 0x0000800aul, 0x00000000 }, { 0x8000000aul, 0x80000000 }, + { 0x80008081ul, 0x80000000 }, { 0x00008080ul, 0x80000000 }, + { 0x80000001ul, 0x00000000 }, { 0x80008008ul, 0x80000000 } }; __host__ __forceinline__ @@ -132,116 +174,545 @@ static void blake256_compress1st(uint32_t *h, const uint32_t *block, const uint3 hostGS(3, 4, 0x9, 0xE, 0xE); } - for (int i = 0; i < 16; i++) { - int j = i & 7; - h[j] ^= v[i]; - } + h[0] ^= v[0] ^ v[8]; + h[1] ^= v[1] ^ v[9]; + h[2] ^= v[2] ^ v[10]; + h[3] ^= v[3] ^ v[11]; + h[4] ^= v[4] ^ v[12]; + h[5] ^= v[5] ^ v[13]; + h[6] ^= v[6] ^ v[14]; + h[7] ^= v[7] ^ v[15]; } +#define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a)))) -__device__ __forceinline__ -static void blake256_compress2nd(uint32_t *h, const uint32_t *block, const uint32_t T0) +static void __forceinline__ __device__ keccak_block(uint2 *s) { - uint32_t m[16]; - uint32_t v[16]; - - m[0] = block[0]; - m[1] = block[1]; - m[2] = block[2]; - m[3] = block[3]; - - #pragma unroll - for (int i = 4; i < 16; i++) { - m[i] = c_Padding[i]; - } - - #pragma unroll 8 - for (int i = 0; i < 8; i++) - v[i] = h[i]; - - v[8] = u256[0]; - v[9] = u256[1]; - v[10] = u256[2]; - v[11] = u256[3]; + uint2 bc[5], tmpxor[5], tmp1, tmp2; + // uint2 s[25]; - v[12] = u256[4] ^ T0; - v[13] = u256[5] ^ T0; - v[14] = u256[6]; - v[15] = u256[7]; - - #pragma unroll 14 - for (int r = 0; r < 14; r++) { - /* column step */ - GS2(0, 4, 0x8, 0xC, 0x0); - GS2(1, 5, 0x9, 0xD, 0x2); - GS2(2, 6, 0xA, 0xE, 0x4); - GS2(3, 7, 0xB, 0xF, 0x6); - /* diagonal step */ - GS2(0, 5, 0xA, 0xF, 0x8); - GS2(1, 6, 0xB, 0xC, 0xA); - GS2(2, 7, 0x8, 0xD, 0xC); - GS2(3, 4, 0x9, 0xE, 0xE); - } - - #pragma unroll 16 
- for (int i = 0; i < 16; i++) { - int j = i & 7; - h[j] ^= v[i]; +#pragma unroll 1 + for (int i = 0; i < 24; i++) + { +#pragma unroll + for (uint32_t x = 0; x < 5; x++) + tmpxor[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20]; + + bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = ROL2(s[6] ^ bc[0], 44); + s[6] = ROL2(s[9] ^ bc[3], 20); + s[9] = ROL2(s[22] ^ bc[1], 61); + s[22] = ROL2(s[14] ^ bc[3], 39); + s[14] = ROL2(s[20] ^ bc[4], 18); + s[20] = ROL2(s[2] ^ bc[1], 62); + s[2] = ROL2(s[12] ^ bc[1], 43); + s[12] = ROL2(s[13] ^ bc[2], 25); + s[13] = ROL8(s[19] ^ bc[3]); + s[19] = ROR8(s[23] ^ bc[2]); + s[23] = ROL2(s[15] ^ bc[4], 41); + s[15] = ROL2(s[4] ^ bc[3], 27); + s[4] = ROL2(s[24] ^ bc[3], 14); + s[24] = ROL2(s[21] ^ bc[0], 2); + s[21] = ROL2(s[8] ^ bc[2], 55); + s[8] = ROL2(s[16] ^ bc[0], 45); + s[16] = ROL2(s[5] ^ bc[4], 36); + s[5] = ROL2(s[3] ^ bc[2], 28); + s[3] = ROL2(s[18] ^ bc[2], 21); + s[18] = ROL2(s[17] ^ bc[1], 15); + s[17] = ROL2(s[11] ^ bc[0], 10); + s[11] = ROL2(s[7] ^ bc[1], 6); + s[7] = ROL2(s[10] ^ bc[4], 3); + s[10] = ROL2(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0] ^= keccak_round_constants35[i]; } } -__global__ __launch_bounds__(256,3) -void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t * Hash) +//__launch_bounds__(256) +__global__ +void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t * Hash) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { + const uint32_t nonce = startNonce + thread; uint32_t h[8]; - uint32_t input[4]; + // uint32_t input[4]; + const uint32_t T0 = 640; +#pragma unroll 8 + for (int i = 0; i<8; i++) { h[i] = cpu_h[i]; } + + uint32_t v[16]; + + const uint32_t c_Padding[12] = { + 0x80000000, 0, 0, 0, + 0, 0, 0, 0, + 0, 1, 0, 640 + }; + + const uint32_t u256[16] = + { + 0x243F6A88, 0x85A308D3, + 0x13198A2E, 0x03707344, + 0xA4093822, 0x299F31D0, + 0x082EFA98, 0xEC4E6C89, + 0x452821E6, 0x38D01377, + 0xBE5466CF, 0x34E90C6C, + 0xC0AC29B7, 0xC97C50DD, + 0x3F84D5B5, 0xB5470917 + }; + + 
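/*
 * Note on the fused kernel below: the first 64 bytes of the 80-byte block
 * header are compressed once on the host (blake256_compress1st -> cpu_h),
 * so each thread only assembles the second BLAKE-256 block m[16] from the
 * remaining 16 header bytes (c_data[0..2] plus its own nonce) and the fixed
 * padding in c_Padding, with the bit counter T0 = 640 (80 bytes * 8 bits).
 * The 14 rounds are fully unrolled with GSPREC; __byte_perm(v, 0, 0x1032)
 * and __byte_perm(v, 0, 0x0321) are byte-shuffle equivalents of 32-bit
 * rotations by 16 and 8. The resulting 256-bit state is byte-swapped and
 * fed straight into keccak_block(), which is why BLAKE-256 and Keccak-256
 * run as a single blakeKeccak256 kernel here.
 */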
uint32_t m[16] = + { + c_data[0], c_data[1], c_data[2], nonce, + c_Padding[0], c_Padding[1], c_Padding[2], c_Padding[3], + c_Padding[4], c_Padding[5], c_Padding[6], c_Padding[7], + c_Padding[8], c_Padding[9], c_Padding[10], c_Padding[11] + }; + +#pragma unroll 8 + for (int i = 0; i < 8; i++) + v[i] = h[i]; + + v[8] = u256[0]; + v[9] = u256[1]; + v[10] = u256[2]; + v[11] = u256[3]; + v[12] = u256[4] ^ T0; + v[13] = u256[5] ^ T0; + v[14] = u256[6]; + v[15] = u256[7]; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + GSPREC(0, 4, 0x8, 0xC, 0, 1); + GSPREC(1, 5, 0x9, 0xD, 2, 3); + GSPREC(2, 6, 0xA, 0xE, 4, 5); + GSPREC(3, 7, 0xB, 0xF, 6, 7); + GSPREC(0, 5, 0xA, 0xF, 8, 9); + GSPREC(1, 6, 0xB, 0xC, 10, 11); + GSPREC(2, 7, 0x8, 0xD, 12, 13); + GSPREC(3, 4, 0x9, 0xE, 14, 15); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + GSPREC(0, 4, 0x8, 0xC, 14, 10); + GSPREC(1, 5, 0x9, 0xD, 4, 8); + GSPREC(2, 6, 0xA, 0xE, 9, 15); + GSPREC(3, 7, 0xB, 0xF, 13, 6); + GSPREC(0, 5, 0xA, 0xF, 1, 12); + GSPREC(1, 6, 0xB, 0xC, 0, 2); + GSPREC(2, 7, 0x8, 0xD, 11, 7); + GSPREC(3, 4, 0x9, 0xE, 5, 3); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + GSPREC(0, 4, 0x8, 0xC, 11, 8); + GSPREC(1, 5, 0x9, 0xD, 12, 0); + GSPREC(2, 6, 0xA, 0xE, 5, 2); + GSPREC(3, 7, 0xB, 0xF, 15, 13); + GSPREC(0, 5, 0xA, 0xF, 10, 14); + GSPREC(1, 6, 0xB, 0xC, 3, 6); + GSPREC(2, 7, 0x8, 0xD, 7, 1); + GSPREC(3, 4, 0x9, 0xE, 9, 4); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + GSPREC(0, 4, 0x8, 0xC, 7, 9); + GSPREC(1, 5, 0x9, 0xD, 3, 1); + GSPREC(2, 6, 0xA, 0xE, 13, 12); + GSPREC(3, 7, 0xB, 0xF, 11, 14); + GSPREC(0, 5, 0xA, 0xF, 2, 6); + GSPREC(1, 6, 0xB, 0xC, 5, 10); + GSPREC(2, 7, 0x8, 0xD, 4, 0); + GSPREC(3, 4, 0x9, 0xE, 15, 8); + + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + GSPREC(0, 4, 0x8, 0xC, 9, 0); + GSPREC(1, 5, 0x9, 0xD, 5, 7); + GSPREC(2, 6, 0xA, 0xE, 2, 4); + GSPREC(3, 7, 0xB, 0xF, 10, 15); + GSPREC(0, 5, 0xA, 0xF, 14, 1); + GSPREC(1, 6, 0xB, 0xC, 11, 12); + GSPREC(2, 7, 0x8, 0xD, 6, 8); + GSPREC(3, 4, 0x9, 0xE, 3, 13); + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + GSPREC(0, 4, 0x8, 0xC, 2, 12); + GSPREC(1, 5, 0x9, 0xD, 6, 10); + GSPREC(2, 6, 0xA, 0xE, 0, 11); + GSPREC(3, 7, 0xB, 0xF, 8, 3); + GSPREC(0, 5, 0xA, 0xF, 4, 13); + GSPREC(1, 6, 0xB, 0xC, 7, 5); + GSPREC(2, 7, 0x8, 0xD, 15, 14); + GSPREC(3, 4, 0x9, 0xE, 1, 9); + + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + GSPREC(0, 4, 0x8, 0xC, 12, 5); + GSPREC(1, 5, 0x9, 0xD, 1, 15); + GSPREC(2, 6, 0xA, 0xE, 14, 13); + GSPREC(3, 7, 0xB, 0xF, 4, 10); + GSPREC(0, 5, 0xA, 0xF, 0, 7); + GSPREC(1, 6, 0xB, 0xC, 6, 3); + GSPREC(2, 7, 0x8, 0xD, 9, 2); + GSPREC(3, 4, 0x9, 0xE, 8, 11); + + // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + GSPREC(0, 4, 0x8, 0xC, 13, 11); + GSPREC(1, 5, 0x9, 0xD, 7, 14); + GSPREC(2, 6, 0xA, 0xE, 12, 1); + GSPREC(3, 7, 0xB, 0xF, 3, 9); + GSPREC(0, 5, 0xA, 0xF, 5, 0); + GSPREC(1, 6, 0xB, 0xC, 15, 4); + GSPREC(2, 7, 0x8, 0xD, 8, 6); + GSPREC(3, 4, 0x9, 0xE, 2, 10); + // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + GSPREC(0, 4, 0x8, 0xC, 6, 15); + GSPREC(1, 5, 0x9, 0xD, 14, 9); + GSPREC(2, 6, 0xA, 0xE, 11, 3); + GSPREC(3, 7, 0xB, 0xF, 0, 8); + GSPREC(0, 5, 0xA, 0xF, 12, 2); + GSPREC(1, 6, 0xB, 0xC, 13, 7); + GSPREC(2, 7, 0x8, 0xD, 1, 4); + GSPREC(3, 4, 0x9, 0xE, 10, 5); + // { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + GSPREC(0, 4, 0x8, 0xC, 10, 2); + GSPREC(1, 5, 0x9, 0xD, 8, 4); + GSPREC(2, 6, 0xA, 
0xE, 7, 6); + GSPREC(3, 7, 0xB, 0xF, 1, 5); + GSPREC(0, 5, 0xA, 0xF, 15, 11); + GSPREC(1, 6, 0xB, 0xC, 9, 14); + GSPREC(2, 7, 0x8, 0xD, 3, 12); + GSPREC(3, 4, 0x9, 0xE, 13, 0); + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + GSPREC(0, 4, 0x8, 0xC, 0, 1); + GSPREC(1, 5, 0x9, 0xD, 2, 3); + GSPREC(2, 6, 0xA, 0xE, 4, 5); + GSPREC(3, 7, 0xB, 0xF, 6, 7); + GSPREC(0, 5, 0xA, 0xF, 8, 9); + GSPREC(1, 6, 0xB, 0xC, 10, 11); + GSPREC(2, 7, 0x8, 0xD, 12, 13); + GSPREC(3, 4, 0x9, 0xE, 14, 15); + + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + GSPREC(0, 4, 0x8, 0xC, 14, 10); + GSPREC(1, 5, 0x9, 0xD, 4, 8); + GSPREC(2, 6, 0xA, 0xE, 9, 15); + GSPREC(3, 7, 0xB, 0xF, 13, 6); + GSPREC(0, 5, 0xA, 0xF, 1, 12); + GSPREC(1, 6, 0xB, 0xC, 0, 2); + GSPREC(2, 7, 0x8, 0xD, 11, 7); + GSPREC(3, 4, 0x9, 0xE, 5, 3); + + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + GSPREC(0, 4, 0x8, 0xC, 11, 8); + GSPREC(1, 5, 0x9, 0xD, 12, 0); + GSPREC(2, 6, 0xA, 0xE, 5, 2); + GSPREC(3, 7, 0xB, 0xF, 15, 13); + GSPREC(0, 5, 0xA, 0xF, 10, 14); + GSPREC(1, 6, 0xB, 0xC, 3, 6); + GSPREC(2, 7, 0x8, 0xD, 7, 1); + GSPREC(3, 4, 0x9, 0xE, 9, 4); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + GSPREC(0, 4, 0x8, 0xC, 7, 9); + GSPREC(1, 5, 0x9, 0xD, 3, 1); + GSPREC(2, 6, 0xA, 0xE, 13, 12); + GSPREC(3, 7, 0xB, 0xF, 11, 14); + GSPREC(0, 5, 0xA, 0xF, 2, 6); + GSPREC(1, 6, 0xB, 0xC, 5, 10); + GSPREC(2, 7, 0x8, 0xD, 4, 0); + GSPREC(3, 4, 0x9, 0xE, 15, 8); + + + + + h[0] = cuda_swab32(h[0] ^ v[0] ^ v[8]); + h[1] = cuda_swab32(h[1] ^ v[1] ^ v[9]); + h[2] = cuda_swab32(h[2] ^ v[2] ^ v[10]); + h[3] = cuda_swab32(h[3] ^ v[3] ^ v[11]); + h[4] = cuda_swab32(h[4] ^ v[4] ^ v[12]); + h[5] = cuda_swab32(h[5] ^ v[5] ^ v[13]); + h[6] = cuda_swab32(h[6] ^ v[6] ^ v[14]); + h[7] = cuda_swab32(h[7] ^ v[7] ^ v[15]); + + uint2 keccak_gpu_state[25] = { 0 }; + keccak_gpu_state[0].x = h[0]; + keccak_gpu_state[0].y = h[1]; + keccak_gpu_state[1].x = h[2]; + keccak_gpu_state[1].y = h[3]; + keccak_gpu_state[2].x = h[4]; + keccak_gpu_state[2].y = h[5]; + keccak_gpu_state[3].x = h[6]; + keccak_gpu_state[3].y = h[7]; + keccak_gpu_state[4] = UINT2(1, 0); + + keccak_gpu_state[16] = UINT2(0, 0x80000000); + keccak_block(keccak_gpu_state); + uint64_t *outputHash = (uint64_t *)Hash; +#pragma unroll 4 + for (int i = 0; i<4; i++) + outputHash[i*threads + thread] = devectorize(keccak_gpu_state[i]); + } - #pragma unroll - for (int i = 0; i < 8; i++) h[i] = cpu_h[i]; - #pragma unroll - for (int i = 0; i < 3; ++i) input[i] = c_data[i]; - input[3] = startNonce + thread; - blake256_compress2nd(h, input, 640); +} + - #pragma unroll - for (int i = 0; i<4; i++) { - Hash[i*threads + thread] = cuda_swab32ll(MAKE_ULONGLONG(h[2 * i], h[2*i+1])); - } +__global__ __launch_bounds__(256, 4) +void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t * Hash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t nonce = startNonce + thread; + uint32_t h[8]; + // uint32_t input[4]; + const uint32_t T0 = 640; +#pragma unroll 8 + for (int i = 0; i<8; i++) { h[i] = cpu_h[i]; } + + uint32_t v[16]; + + const uint32_t c_Padding[12] = { + 0x80000000, 0, 0, 0, + 0, 0, 0, 0, + 0, 1, 0, 640 + }; + + const uint32_t u256[16] = + { + 0x243F6A88, 0x85A308D3, + 0x13198A2E, 0x03707344, + 0xA4093822, 0x299F31D0, + 0x082EFA98, 0xEC4E6C89, + 0x452821E6, 0x38D01377, + 0xBE5466CF, 0x34E90C6C, + 0xC0AC29B7, 0xC97C50DD, + 0x3F84D5B5, 0xB5470917 + }; + + uint32_t m[16] = + { + c_data[0], 
c_data[1], c_data[2], nonce, + c_Padding[0], c_Padding[1], c_Padding[2], c_Padding[3], + c_Padding[4], c_Padding[5], c_Padding[6], c_Padding[7], + c_Padding[8], c_Padding[9], c_Padding[10], c_Padding[11] + }; + +#pragma unroll 8 + for (int i = 0; i < 8; i++) + v[i] = h[i]; + + v[8] = u256[0]; + v[9] = u256[1]; + v[10] = u256[2]; + v[11] = u256[3]; + v[12] = u256[4] ^ T0; + v[13] = u256[5] ^ T0; + v[14] = u256[6]; + v[15] = u256[7]; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + GSPREC(0, 4, 0x8, 0xC, 0, 1); + GSPREC(1, 5, 0x9, 0xD, 2, 3); + GSPREC(2, 6, 0xA, 0xE, 4, 5); + GSPREC(3, 7, 0xB, 0xF, 6, 7); + GSPREC(0, 5, 0xA, 0xF, 8, 9); + GSPREC(1, 6, 0xB, 0xC, 10, 11); + GSPREC(2, 7, 0x8, 0xD, 12, 13); + GSPREC(3, 4, 0x9, 0xE, 14, 15); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + GSPREC(0, 4, 0x8, 0xC, 14, 10); + GSPREC(1, 5, 0x9, 0xD, 4, 8); + GSPREC(2, 6, 0xA, 0xE, 9, 15); + GSPREC(3, 7, 0xB, 0xF, 13, 6); + GSPREC(0, 5, 0xA, 0xF, 1, 12); + GSPREC(1, 6, 0xB, 0xC, 0, 2); + GSPREC(2, 7, 0x8, 0xD, 11, 7); + GSPREC(3, 4, 0x9, 0xE, 5, 3); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + GSPREC(0, 4, 0x8, 0xC, 11, 8); + GSPREC(1, 5, 0x9, 0xD, 12, 0); + GSPREC(2, 6, 0xA, 0xE, 5, 2); + GSPREC(3, 7, 0xB, 0xF, 15, 13); + GSPREC(0, 5, 0xA, 0xF, 10, 14); + GSPREC(1, 6, 0xB, 0xC, 3, 6); + GSPREC(2, 7, 0x8, 0xD, 7, 1); + GSPREC(3, 4, 0x9, 0xE, 9, 4); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + GSPREC(0, 4, 0x8, 0xC, 7, 9); + GSPREC(1, 5, 0x9, 0xD, 3, 1); + GSPREC(2, 6, 0xA, 0xE, 13, 12); + GSPREC(3, 7, 0xB, 0xF, 11, 14); + GSPREC(0, 5, 0xA, 0xF, 2, 6); + GSPREC(1, 6, 0xB, 0xC, 5, 10); + GSPREC(2, 7, 0x8, 0xD, 4, 0); + GSPREC(3, 4, 0x9, 0xE, 15, 8); + + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + GSPREC(0, 4, 0x8, 0xC, 9, 0); + GSPREC(1, 5, 0x9, 0xD, 5, 7); + GSPREC(2, 6, 0xA, 0xE, 2, 4); + GSPREC(3, 7, 0xB, 0xF, 10, 15); + GSPREC(0, 5, 0xA, 0xF, 14, 1); + GSPREC(1, 6, 0xB, 0xC, 11, 12); + GSPREC(2, 7, 0x8, 0xD, 6, 8); + GSPREC(3, 4, 0x9, 0xE, 3, 13); + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + GSPREC(0, 4, 0x8, 0xC, 2, 12); + GSPREC(1, 5, 0x9, 0xD, 6, 10); + GSPREC(2, 6, 0xA, 0xE, 0, 11); + GSPREC(3, 7, 0xB, 0xF, 8, 3); + GSPREC(0, 5, 0xA, 0xF, 4, 13); + GSPREC(1, 6, 0xB, 0xC, 7, 5); + GSPREC(2, 7, 0x8, 0xD, 15, 14); + GSPREC(3, 4, 0x9, 0xE, 1, 9); + + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + GSPREC(0, 4, 0x8, 0xC, 12, 5); + GSPREC(1, 5, 0x9, 0xD, 1, 15); + GSPREC(2, 6, 0xA, 0xE, 14, 13); + GSPREC(3, 7, 0xB, 0xF, 4, 10); + GSPREC(0, 5, 0xA, 0xF, 0, 7); + GSPREC(1, 6, 0xB, 0xC, 6, 3); + GSPREC(2, 7, 0x8, 0xD, 9, 2); + GSPREC(3, 4, 0x9, 0xE, 8, 11); + + // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + GSPREC(0, 4, 0x8, 0xC, 13, 11); + GSPREC(1, 5, 0x9, 0xD, 7, 14); + GSPREC(2, 6, 0xA, 0xE, 12, 1); + GSPREC(3, 7, 0xB, 0xF, 3, 9); + GSPREC(0, 5, 0xA, 0xF, 5, 0); + GSPREC(1, 6, 0xB, 0xC, 15, 4); + GSPREC(2, 7, 0x8, 0xD, 8, 6); + GSPREC(3, 4, 0x9, 0xE, 2, 10); + // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + GSPREC(0, 4, 0x8, 0xC, 6, 15); + GSPREC(1, 5, 0x9, 0xD, 14, 9); + GSPREC(2, 6, 0xA, 0xE, 11, 3); + GSPREC(3, 7, 0xB, 0xF, 0, 8); + GSPREC(0, 5, 0xA, 0xF, 12, 2); + GSPREC(1, 6, 0xB, 0xC, 13, 7); + GSPREC(2, 7, 0x8, 0xD, 1, 4); + GSPREC(3, 4, 0x9, 0xE, 10, 5); + // { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + GSPREC(0, 4, 0x8, 0xC, 10, 2); + GSPREC(1, 5, 0x9, 0xD, 8, 4); + GSPREC(2, 6, 0xA, 0xE, 7, 6); + GSPREC(3, 7, 0xB, 0xF, 
1, 5); + GSPREC(0, 5, 0xA, 0xF, 15, 11); + GSPREC(1, 6, 0xB, 0xC, 9, 14); + GSPREC(2, 7, 0x8, 0xD, 3, 12); + GSPREC(3, 4, 0x9, 0xE, 13, 0); + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + GSPREC(0, 4, 0x8, 0xC, 0, 1); + GSPREC(1, 5, 0x9, 0xD, 2, 3); + GSPREC(2, 6, 0xA, 0xE, 4, 5); + GSPREC(3, 7, 0xB, 0xF, 6, 7); + GSPREC(0, 5, 0xA, 0xF, 8, 9); + GSPREC(1, 6, 0xB, 0xC, 10, 11); + GSPREC(2, 7, 0x8, 0xD, 12, 13); + GSPREC(3, 4, 0x9, 0xE, 14, 15); + + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + GSPREC(0, 4, 0x8, 0xC, 14, 10); + GSPREC(1, 5, 0x9, 0xD, 4, 8); + GSPREC(2, 6, 0xA, 0xE, 9, 15); + GSPREC(3, 7, 0xB, 0xF, 13, 6); + GSPREC(0, 5, 0xA, 0xF, 1, 12); + GSPREC(1, 6, 0xB, 0xC, 0, 2); + GSPREC(2, 7, 0x8, 0xD, 11, 7); + GSPREC(3, 4, 0x9, 0xE, 5, 3); + + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + GSPREC(0, 4, 0x8, 0xC, 11, 8); + GSPREC(1, 5, 0x9, 0xD, 12, 0); + GSPREC(2, 6, 0xA, 0xE, 5, 2); + GSPREC(3, 7, 0xB, 0xF, 15, 13); + GSPREC(0, 5, 0xA, 0xF, 10, 14); + GSPREC(1, 6, 0xB, 0xC, 3, 6); + GSPREC(2, 7, 0x8, 0xD, 7, 1); + GSPREC(3, 4, 0x9, 0xE, 9, 4); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + GSPREC(0, 4, 0x8, 0xC, 7, 9); + GSPREC(1, 5, 0x9, 0xD, 3, 1); + GSPREC(2, 6, 0xA, 0xE, 13, 12); + GSPREC(3, 7, 0xB, 0xF, 11, 14); + GSPREC(0, 5, 0xA, 0xF, 2, 6); + GSPREC(1, 6, 0xB, 0xC, 5, 10); + GSPREC(2, 7, 0x8, 0xD, 4, 0); + GSPREC(3, 4, 0x9, 0xE, 15, 8); + + h[0] = cuda_swab32(h[0] ^ v[0] ^ v[8]); + h[1] = cuda_swab32(h[1] ^ v[1] ^ v[9]); + h[2] = cuda_swab32(h[2] ^ v[2] ^ v[10]); + h[3] = cuda_swab32(h[3] ^ v[3] ^ v[11]); + h[4] = cuda_swab32(h[4] ^ v[4] ^ v[12]); + h[5] = cuda_swab32(h[5] ^ v[5] ^ v[13]); + h[6] = cuda_swab32(h[6] ^ v[6] ^ v[14]); + h[7] = cuda_swab32(h[7] ^ v[7] ^ v[15]); + + Hash[((0 * threads) + thread) * 2] = (h[0]); + Hash[((0 * threads) + thread) * 2 + 1] = (h[1]); + Hash[((1 * threads) + thread) * 2] = (h[2]); + Hash[((1 * threads) + thread) * 2 + 1] = (h[3]); + Hash[((2 * threads) + thread) * 2] = (h[4]); + Hash[((2 * threads) + thread) * 2 + 1] = (h[5]); + Hash[((3 * threads) + thread) * 2] = (h[6]); + Hash[((3 * threads) + thread) * 2 + 1] = (h[7]); } } __host__ void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order) { - const uint32_t threadsperblock = 256; + const uint32_t threadsperblock = 64; dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - blake256_gpu_hash_80 <<>> (threads, startNonce, Hash); - MyStreamSynchronize(NULL, order, thr_id); + blake256_gpu_hash_80 << > > (threads, startNonce, (uint32_t *)Hash); } __host__ void blake256_cpu_setBlock_80(uint32_t *pdata) { - uint32_t h[8], data[20]; - + uint32_t h[8]; + uint32_t data[20]; memcpy(data, pdata, 80); - memcpy(h, c_IV256, sizeof(c_IV256)); + for (int i = 0; i<8; i++) { + h[i] = c_IV256[i]; + } blake256_compress1st(h, pdata, 512); cudaMemcpyToSymbol(cpu_h, h, sizeof(h), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(c_data, &data[16], sizeof(c_data), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_data, &data[16], 3 * 4, 0, cudaMemcpyHostToDevice); } __host__ -void blake256_cpu_init(int thr_id, uint32_t threads) +void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order) { - cudaMemcpyToSymbol(u256, c_u256, sizeof(c_u256), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(sigma, c_sigma, sizeof(c_sigma), 0, cudaMemcpyHostToDevice); + const uint32_t 
threadsperblock = 256; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + blakeKeccak256_gpu_hash_80 << > > (threads, startNonce, (uint32_t *)Hash); +} + +__host__ +void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order, cudaStream_t stream) +{ + const uint32_t threadsperblock = 256; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + blakeKeccak256_gpu_hash_80 << > > (threads, startNonce, (uint32_t *)Hash); } diff --git a/Algo256/cuda_bmw256.cu b/Algo256/cuda_bmw256.cu index 0fde12e..b301749 100644 --- a/Algo256/cuda_bmw256.cu +++ b/Algo256/cuda_bmw256.cu @@ -14,87 +14,85 @@ __constant__ uint64_t pTarget[4]; #define shl(x, n) ((x) << (n)) #define shr(x, n) ((x) >> (n)) -#define ss0(x) (shr((x), 1) ^ shl((x), 3) ^ SPH_ROTL32((x), 4) ^ SPH_ROTL32((x), 19)) -#define ss1(x) (shr((x), 1) ^ shl((x), 2) ^ SPH_ROTL32((x), 8) ^ SPH_ROTL32((x), 23)) -#define ss2(x) (shr((x), 2) ^ shl((x), 1) ^ SPH_ROTL32((x), 12) ^ SPH_ROTL32((x), 25)) -#define ss3(x) (shr((x), 2) ^ shl((x), 2) ^ SPH_ROTL32((x), 15) ^ SPH_ROTL32((x), 29)) -#define ss4(x) (shr((x), 1) ^ (x)) -#define ss5(x) (shr((x), 2) ^ (x)) - +#define ss0(x) (shr((x), 1) ^ shl((x), 3) ^ SPH_ROTL32((x), 4) ^ SPH_ROTL32((x), 19)) +#define ss1(x) (shr((x), 1) ^ shl((x), 2) ^ __byte_perm(x,0,0x2103) ^ SPH_ROTL32((x), 23)) +#define ss2(x) (shr((x), 2) ^ shl((x), 1) ^ SPH_ROTL32((x), 12) ^ SPH_ROTL32((x), 25)) +#define ss3(x) (shr((x), 2) ^ shl((x), 2) ^ SPH_ROTL32((x), 15) ^ SPH_ROTL32((x), 29)) +#define ss4(x) (shr((x), 1) ^ (x)) +#define ss5(x) (shr((x), 2) ^ (x)) #define rs1(x) SPH_ROTL32((x), 3) #define rs2(x) SPH_ROTL32((x), 7) #define rs3(x) SPH_ROTL32((x), 13) -#define rs4(x) SPH_ROTL32((x), 16) +#define rs4(x) __byte_perm(x,0,0x1032) #define rs5(x) SPH_ROTL32((x), 19) #define rs6(x) SPH_ROTL32((x), 23) #define rs7(x) SPH_ROTL32((x), 27) /* Message expansion function 1 */ -__forceinline__ __device__ -uint32_t expand32_1(int i, uint32_t *M32, const uint32_t *H, uint32_t *Q) +__forceinline__ __device__ uint32_t expand32_1(int i, uint32_t *M32, const uint32_t *H, uint32_t *Q) { return (ss1(Q[i - 16]) + ss2(Q[i - 15]) + ss3(Q[i - 14]) + ss0(Q[i - 13]) + ss1(Q[i - 12]) + ss2(Q[i - 11]) + ss3(Q[i - 10]) + ss0(Q[i - 9]) + ss1(Q[i - 8]) + ss2(Q[i - 7]) + ss3(Q[i - 6]) + ss0(Q[i - 5]) + ss1(Q[i - 4]) + ss2(Q[i - 3]) + ss3(Q[i - 2]) + ss0(Q[i - 1]) - + ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) - + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16])); + + ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16])); } /* Message expansion function 2 */ -__forceinline__ __device__ -uint32_t expand32_2(int i, uint32_t *M32, const uint32_t *H, uint32_t *Q) +__forceinline__ __device__ uint32_t expand32_2(const int i, uint32_t *M32, const uint32_t *H, uint32_t *Q) { - return (Q[i - 16] + rs1(Q[i - 15]) + Q[i - 14] + rs2(Q[i - 13]) - + Q[i - 12] + rs3(Q[i - 11]) + Q[i - 10] + rs4(Q[i - 9]) - + Q[i - 8] + rs5(Q[i - 7]) + Q[i - 6] + rs6(Q[i - 5]) - + Q[i - 4] + rs7(Q[i - 3]) + ss4(Q[i - 2]) + ss5(Q[i - 1]) - + ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) - + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - - 
SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16])); + return ( + rs2(Q[i - 13]) + rs3(Q[i - 11]) + rs4(Q[i - 9]) + rs1(Q[i - 15]) + + +rs5(Q[i - 7]) + rs6(Q[i - 5]) + rs7(Q[i - 3]) + ss4(Q[i - 2]) + ss5(Q[i - 1])); } -__forceinline__ __device__ -void Compression256(uint32_t * M32) +__forceinline__ __device__ void Compression256(uint32_t M32[16]) { - uint32_t Q[32], XL32, XH32; - const uint32_t H[16] = { - 0x40414243, 0x44454647, 0x48494A4B, 0x4C4D4E4F, - 0x50515253, 0x54555657, 0x58595A5B, 0x5C5D5E5F, - 0x60616263, 0x64656667, 0x68696A6B, 0x6C6D6E6F, - 0x70717273, 0x74757677, 0x78797A7B, 0x7C7D7E7F + (0x40414243), (0x44454647), + (0x48494A4B), (0x4C4D4E4F), + (0x50515253), (0x54555657), + (0x58595A5B), (0x5C5D5E5F), + (0x60616263), (0x64656667), + (0x68696A6B), (0x6C6D6E6F), + (0x70717273), (0x74757677), + (0x78797A7B), (0x7C7D7E7F) }; - Q[0] = (M32[5] ^ H[5]) - (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[13] ^ H[13]) + (M32[14] ^ H[14]); - Q[1] = (M32[6] ^ H[6]) - (M32[8] ^ H[8]) + (M32[11] ^ H[11]) + (M32[14] ^ H[14]) - (M32[15] ^ H[15]); - Q[2] = (M32[0] ^ H[0]) + (M32[7] ^ H[7]) + (M32[9] ^ H[9]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]); - Q[3] = (M32[0] ^ H[0]) - (M32[1] ^ H[1]) + (M32[8] ^ H[8]) - (M32[10] ^ H[10]) + (M32[13] ^ H[13]); - Q[4] = (M32[1] ^ H[1]) + (M32[2] ^ H[2]) + (M32[9] ^ H[9]) - (M32[11] ^ H[11]) - (M32[14] ^ H[14]); - Q[5] = (M32[3] ^ H[3]) - (M32[2] ^ H[2]) + (M32[10] ^ H[10]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]); - Q[6] = (M32[4] ^ H[4]) - (M32[0] ^ H[0]) - (M32[3] ^ H[3]) - (M32[11] ^ H[11]) + (M32[13] ^ H[13]); - Q[7] = (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[5] ^ H[5]) - (M32[12] ^ H[12]) - (M32[14] ^ H[14]); - Q[8] = (M32[2] ^ H[2]) - (M32[5] ^ H[5]) - (M32[6] ^ H[6]) + (M32[13] ^ H[13]) - (M32[15] ^ H[15]); - Q[9] = (M32[0] ^ H[0]) - (M32[3] ^ H[3]) + (M32[6] ^ H[6]) - (M32[7] ^ H[7]) + (M32[14] ^ H[14]); - Q[10] = (M32[8] ^ H[8]) - (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[7] ^ H[7]) + (M32[15] ^ H[15]); - Q[11] = (M32[8] ^ H[8]) - (M32[0] ^ H[0]) - (M32[2] ^ H[2]) - (M32[5] ^ H[5]) + (M32[9] ^ H[9]); - Q[12] = (M32[1] ^ H[1]) + (M32[3] ^ H[3]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[10] ^ H[10]); - Q[13] = (M32[2] ^ H[2]) + (M32[4] ^ H[4]) + (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[11] ^ H[11]); - Q[14] = (M32[3] ^ H[3]) - (M32[5] ^ H[5]) + (M32[8] ^ H[8]) - (M32[11] ^ H[11]) - (M32[12] ^ H[12]); - Q[15] = (M32[12] ^ H[12]) - (M32[4] ^ H[4]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[13] ^ H[13]); - - /* Diffuse the differences in every word in a bijective manner with ssi, and then add the values of the previous double pipe. 
*/ - Q[0] = ss0(Q[0]) + H[1]; - Q[1] = ss1(Q[1]) + H[2]; - Q[2] = ss2(Q[2]) + H[3]; - Q[3] = ss3(Q[3]) + H[4]; - Q[4] = ss4(Q[4]) + H[5]; - Q[5] = ss0(Q[5]) + H[6]; - Q[6] = ss1(Q[6]) + H[7]; - Q[7] = ss2(Q[7]) + H[8]; - Q[8] = ss3(Q[8]) + H[9]; - Q[9] = ss4(Q[9]) + H[10]; + M32[8] = 0x80; + M32[14] = 0x100; + + // int i; + uint32_t XL32, XH32, Q[32]; + + Q[0] = (M32[5] ^ H[5]) - (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[13] ^ H[13]) + (M32[14] ^ H[14]); + Q[1] = (M32[6] ^ H[6]) - (M32[8] ^ H[8]) + (M32[11] ^ H[11]) + (M32[14] ^ H[14]) - (M32[15] ^ H[15]); + Q[2] = (M32[0] ^ H[0]) + (M32[7] ^ H[7]) + (M32[9] ^ H[9]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]); + Q[3] = (M32[0] ^ H[0]) - (M32[1] ^ H[1]) + (M32[8] ^ H[8]) - (M32[10] ^ H[10]) + (M32[13] ^ H[13]); + Q[4] = (M32[1] ^ H[1]) + (M32[2] ^ H[2]) + (M32[9] ^ H[9]) - (M32[11] ^ H[11]) - (M32[14] ^ H[14]); + Q[5] = (M32[3] ^ H[3]) - (M32[2] ^ H[2]) + (M32[10] ^ H[10]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]); + Q[6] = (M32[4] ^ H[4]) - (M32[0] ^ H[0]) - (M32[3] ^ H[3]) - (M32[11] ^ H[11]) + (M32[13] ^ H[13]); + Q[7] = (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[5] ^ H[5]) - (M32[12] ^ H[12]) - (M32[14] ^ H[14]); + Q[8] = (M32[2] ^ H[2]) - (M32[5] ^ H[5]) - (M32[6] ^ H[6]) + (M32[13] ^ H[13]) - (M32[15] ^ H[15]); + Q[9] = (M32[0] ^ H[0]) - (M32[3] ^ H[3]) + (M32[6] ^ H[6]) - (M32[7] ^ H[7]) + (M32[14] ^ H[14]); + Q[10] = (M32[8] ^ H[8]) - (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[7] ^ H[7]) + (M32[15] ^ H[15]); + Q[11] = (M32[8] ^ H[8]) - (M32[0] ^ H[0]) - (M32[2] ^ H[2]) - (M32[5] ^ H[5]) + (M32[9] ^ H[9]); + Q[12] = (M32[1] ^ H[1]) + (M32[3] ^ H[3]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[10] ^ H[10]); + Q[13] = (M32[2] ^ H[2]) + (M32[4] ^ H[4]) + (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[11] ^ H[11]); + Q[14] = (M32[3] ^ H[3]) - (M32[5] ^ H[5]) + (M32[8] ^ H[8]) - (M32[11] ^ H[11]) - (M32[12] ^ H[12]); + Q[15] = (M32[12] ^ H[12]) - (M32[4] ^ H[4]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[13] ^ H[13]); + + /* Diffuse the differences in every word in a bijective manner with ssi, and then add the values of the previous double pipe.*/ + Q[0] = ss0(Q[0]) + H[1]; + Q[1] = ss1(Q[1]) + H[2]; + Q[2] = ss2(Q[2]) + H[3]; + Q[3] = ss3(Q[3]) + H[4]; + Q[4] = ss4(Q[4]) + H[5]; + Q[5] = ss0(Q[5]) + H[6]; + Q[6] = ss1(Q[6]) + H[7]; + Q[7] = ss2(Q[7]) + H[8]; + Q[8] = ss3(Q[8]) + H[9]; + Q[9] = ss4(Q[9]) + H[10]; Q[10] = ss0(Q[10]) + H[11]; Q[11] = ss1(Q[11]) + H[12]; Q[12] = ss2(Q[12]) + H[13]; @@ -109,13 +107,91 @@ void Compression256(uint32_t * M32) /* The following relation for these parameters should is satisfied: */ /* EXPAND_1_ROUNDS + EXPAND_2_ROUNDS = 16 */ - #pragma unroll - for (int i=16; i<18; i++) - Q[i] = expand32_1(i, M32, H, Q); - - #pragma nounroll - for (int i=18; i<32; i++) - Q[i] = expand32_2(i, M32, H, Q); + // #pragma unroll + // for (i = 0; i<2; i++) + // Q[i + 16] = expand32_1(i + 16, M32, H, Q); + + Q[16] = ss1(Q[16 - 16]) + ss2(Q[16 - 15]) + ss3(Q[16 - 14]) + ss0(Q[16 - 13]) + + ss1(Q[16 - 12]) + ss2(Q[16 - 11]) + ss3(Q[16 - 10]) + ss0(Q[16 - 9]) + + ss1(Q[16 - 8]) + ss2(Q[16 - 7]) + ss3(Q[16 - 6]) + ss0(Q[16 - 5]) + + ss1(Q[16 - 4]) + ss2(Q[16 - 3]) + ss3(Q[16 - 2]) + ss0(Q[16 - 1]) + + ((16 * (0x05555555ul) + SPH_ROTL32(M32[0], ((16 - 16) % 16) + 1) + SPH_ROTL32(M32[3], ((16 - 13) % 16) + 1)) ^ H[(16 - 16 + 7) % 16]); + + Q[17] = ss1(Q[17 - 16]) + ss2(Q[17 - 15]) + ss3(Q[17 - 14]) + ss0(Q[17 - 13]) + + ss1(Q[17 - 12]) + ss2(Q[17 - 11]) + ss3(Q[17 - 10]) + ss0(Q[17 - 9]) + + ss1(Q[17 - 8]) + ss2(Q[17 - 7]) + ss3(Q[17 - 6]) + 
ss0(Q[17 - 5]) + + ss1(Q[17 - 4]) + ss2(Q[17 - 3]) + ss3(Q[17 - 2]) + ss0(Q[17 - 1]) + + ((17 * (0x05555555ul) + SPH_ROTL32(M32[(17 - 16) % 16], ((17 - 16) % 16) + 1) + SPH_ROTL32(M32[(17 - 13) % 16], ((17 - 13) % 16) + 1)) ^ H[(17 - 16 + 7) % 16]); + + + uint32_t precalc = Q[18 - 16] + Q[18 - 14] + Q[18 - 12] + Q[18 - 10] + Q[18 - 8] + Q[18 - 6]; //+ Q[18 - 4] + uint32_t precalc2 = Q[19 - 16] + Q[19 - 14] + Q[19 - 12] + Q[19 - 10] + Q[19 - 8] + Q[19 - 6];//+ Q[19 - 4] + + // #pragma unroll + // for (i = 2 + 16; i < 16 + 16; i+=2) + // { + precalc = precalc + Q[18 - 4]; + precalc2 = precalc2 + Q[18 + 1 - 4]; + uint32_t p1 = ((18 * (0x05555555ul) + SPH_ROTL32(M32[2], ((18 - 16) % 16) + 1) + SPH_ROTL32(M32[5], ((18 - 13) % 16) + 1)) ^ H[(18 - 16 + 7) % 16]); + uint32_t p2 = (((18 + 1)*(0x05555555ul) + SPH_ROTL32(M32[3], (((18 + 1) - 16) % 16) + 1) + SPH_ROTL32(M32[6], (((18 + 1) - 13) % 16) + 1)) ^ H[((18 + 1) - 16 + 7) % 16]); + Q[18] = precalc + expand32_2(18, M32, H, Q) + p1; + Q[18 + 1] = precalc2 + expand32_2(18 + 1, M32, H, Q) + p2; + precalc = precalc - Q[18 - 16]; + precalc2 = precalc2 - Q[18 + 1 - 16]; + + precalc = precalc + Q[20 - 4]; + precalc2 = precalc2 + Q[20 + 1 - 4]; + p1 = ((20 * (0x05555555ul) + SPH_ROTL32(M32[4], ((20 - 16) % 16) + 1) + SPH_ROTL32(M32[7], ((20 - 13) % 16) + 1) - (0x100 << 15)) ^ H[(20 - 16 + 7) % 16]); + p2 = (((20 + 1)*(0x05555555ul) + SPH_ROTL32(M32[5], (((20 + 1) - 16) % 16) + 1) + (0x80 << 9)) ^ H[((20 + 1) - 16 + 7) % 16]); + Q[20] = precalc + expand32_2(20, M32, H, Q) + p1; + Q[20 + 1] = precalc2 + expand32_2(20 + 1, M32, H, Q) + p2; + precalc = precalc - Q[20 - 16]; + precalc2 = precalc2 - Q[20 + 1 - 16]; + + precalc = precalc + Q[22 - 4]; + precalc2 = precalc2 + Q[22 + 1 - 4]; + p1 = ((22 * (0x05555555ul) + SPH_ROTL32(M32[6], ((22 - 16) % 16) + 1) - SPH_ROTL32(M32[0], ((22 - 6) % 16) + 1)) ^ H[(22 - 16 + 7) % 16]); + p2 = (((22 + 1)*(0x05555555ul) + SPH_ROTL32(M32[7], (((22 + 1) - 16) % 16) + 1) - SPH_ROTL32(M32[1], (((22 + 1) - 6) % 16) + 1)) ^ H[((22 + 1) - 16 + 7) % 16]); + Q[22] = precalc + expand32_2(22, M32, H, Q) + p1; + Q[22 + 1] = precalc2 + expand32_2(22 + 1, M32, H, Q) + p2; + precalc = precalc - Q[22 - 16]; + precalc2 = precalc2 - Q[22 + 1 - 16]; + + precalc = precalc + Q[24 - 4]; + precalc2 = precalc2 + Q[24 + 1 - 4]; + p1 = ((24 * (0x05555555ul) + (0x80 << 9) - SPH_ROTL32(M32[2], ((24 - 6) % 16) + 1)) ^ H[(24 - 16 + 7) % 16]); + p2 = (((24 + 1)*(0x05555555ul) - SPH_ROTL32(M32[3], (((24 + 1) - 6) % 16) + 1)) ^ H[((24 + 1) - 16 + 7) % 16]); + Q[24] = precalc + expand32_2(24, M32, H, Q) + p1; + Q[24 + 1] = precalc2 + expand32_2(24 + 1, M32, H, Q) + p2; + precalc = precalc - Q[24 - 16]; + precalc2 = precalc2 - Q[24 + 1 - 16]; + + precalc = precalc + Q[26 - 4]; + precalc2 = precalc2 + Q[26 + 1 - 4]; + p1 = ((26 * (0x05555555ul) - SPH_ROTL32(M32[4], ((26 - 6) % 16) + 1)) ^ H[(26 - 16 + 7) % 16]); + p2 = (((26 + 1)*(0x05555555ul) + (0x100 << 15) - SPH_ROTL32(M32[5], (((26 + 1) - 6) % 16) + 1)) ^ H[((26 + 1) - 16 + 7) % 16]); + Q[26] = precalc + expand32_2(26, M32, H, Q) + p1; + Q[26 + 1] = precalc2 + expand32_2(26 + 1, M32, H, Q) + p2; + precalc = precalc - Q[26 - 16]; + precalc2 = precalc2 - Q[26 + 1 - 16]; + + precalc = precalc + Q[28 - 4]; + precalc2 = precalc2 + Q[28 + 1 - 4]; + p1 = ((28 * (0x05555555ul) - SPH_ROTL32(M32[6], ((28 - 6) % 16) + 1)) ^ H[(28 - 16 + 7) % 16]); + p2 = (((28 + 1)*(0x05555555ul) + SPH_ROTL32(M32[0], (((28 + 1) - 13) % 16) + 1) - SPH_ROTL32(M32[7], (((28 + 1) - 6) % 16) + 1)) ^ H[((28 + 1) - 16 + 7) % 16]); + 
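/*
 * The Q[18]..Q[31] expansion above and below is unrolled in pairs:
 * precalc / precalc2 carry the running sums of the un-rotated Q terms for
 * the even and odd lanes, adding Q[i - 4] before each step and dropping
 * Q[i - 16] afterwards, while the trimmed expand32_2() now contributes only
 * the rotated terms. p1 / p2 hold the message-schedule part, with the
 * constant padding words (M32[8] = 0x80, M32[14] = 0x100) folded in as
 * (0x80 << 9) and (0x100 << 15) and the zero message words dropped.
 */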
Q[28] = precalc + expand32_2(28, M32, H, Q) + p1; + Q[28 + 1] = precalc2 + expand32_2(28 + 1, M32, H, Q) + p2; + precalc = precalc - Q[28 - 16]; + precalc2 = precalc2 - Q[28 + 1 - 16]; + + precalc = precalc + Q[30 - 4]; + precalc2 = precalc2 + Q[30 + 1 - 4]; + p1 = ((30 * (0x05555555ul) + (0x100 << 15) + SPH_ROTL32(M32[1], ((30 - 13) % 16) + 1) - (0x80 << 9)) ^ H[(30 - 16 + 7) % 16]); + p2 = (((30 + 1)*(0x05555555ul) + SPH_ROTL32(M32[2], (((30 + 1) - 13) % 16) + 1)) ^ H[((30 + 1) - 16 + 7) % 16]); + Q[30] = precalc + expand32_2(30, M32, H, Q) + p1; + Q[30 + 1] = precalc2 + expand32_2(30 + 1, M32, H, Q) + p2; + precalc = precalc - Q[30 - 16]; + precalc2 = precalc2 - Q[30 + 1 - 16]; /* Blue Midnight Wish has two temporary cummulative variables that accumulate via XORing */ /* 16 new variables that are prooduced in the Message Expansion part. */ @@ -145,17 +221,18 @@ void Compression256(uint32_t * M32) M32[15] = SPH_ROTL32(M32[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]); } -__forceinline__ __device__ -void Compression256_2(uint32_t * M32) +__forceinline__ __device__ void Compression256_2(uint32_t M32[16]) { - uint32_t XL32, XH32, Q[32]; - const uint32_t H[16] = { - 0xaaaaaaa0, 0xaaaaaaa1, 0xaaaaaaa2, 0xaaaaaaa3, - 0xaaaaaaa4, 0xaaaaaaa5, 0xaaaaaaa6, 0xaaaaaaa7, - 0xaaaaaaa8, 0xaaaaaaa9, 0xaaaaaaaa, 0xaaaaaaab, - 0xaaaaaaac, 0xaaaaaaad, 0xaaaaaaae, 0xaaaaaaaf + (0xaaaaaaa0), (0xaaaaaaa1), (0xaaaaaaa2), + (0xaaaaaaa3), (0xaaaaaaa4), (0xaaaaaaa5), + (0xaaaaaaa6), (0xaaaaaaa7), (0xaaaaaaa8), + (0xaaaaaaa9), (0xaaaaaaaa), (0xaaaaaaab), + (0xaaaaaaac), (0xaaaaaaad), (0xaaaaaaae), + (0xaaaaaaaf) }; + int i; + uint32_t XL32, XH32, Q[32]; Q[0] = (M32[5] ^ H[5]) - (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[13] ^ H[13]) + (M32[14] ^ H[14]); Q[1] = (M32[6] ^ H[6]) - (M32[8] ^ H[8]) + (M32[11] ^ H[11]) + (M32[14] ^ H[14]) - (M32[15] ^ H[15]); @@ -199,45 +276,69 @@ void Compression256_2(uint32_t * M32) /* The following relation for these parameters should is satisfied: */ /* EXPAND_1_ROUNDS + EXPAND_2_ROUNDS = 16 */ - #pragma unroll - for (int i = 16; i<18; i++) - Q[i] = expand32_1(i, M32, H, Q); +#pragma unroll + for (i = 0; i<2; i++) + Q[i + 16] = expand32_1(i + 16, M32, H, Q); + + /* #pragma unroll + for (i = 2; i<16; i++) + Q[i + 16] = expand32_2(i + 16, M32, H, Q); + */ + uint32_t precalc = Q[18 - 16] + Q[18 - 14] + Q[18 - 12] + Q[18 - 10] + Q[18 - 8] + Q[18 - 6]; //+ Q[18 - 4] + uint32_t precalc2 = Q[19 - 16] + Q[19 - 14] + Q[19 - 12] + Q[19 - 10] + Q[19 - 8] + Q[19 - 6];//+ Q[19 - 4] + +#pragma unroll + for (i = 2 + 16; i < 16 + 16; i += 2) + { + precalc = precalc + Q[i - 4]; + precalc2 = precalc2 + Q[i + 1 - 4]; + uint32_t p1 = ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16]); + uint32_t p2 = (((i + 1)*(0x05555555ul) + SPH_ROTL32(M32[((i + 1) - 16) % 16], (((i + 1) - 16) % 16) + 1) + SPH_ROTL32(M32[((i + 1) - 13) % 16], (((i + 1) - 13) % 16) + 1) - SPH_ROTL32(M32[((i + 1) - 6) % 16], (((i + 1) - 6) % 16) + 1)) ^ H[((i + 1) - 16 + 7) % 16]); + Q[i] = precalc + expand32_2(i, M32, H, Q) + p1; + Q[i + 1] = precalc2 + expand32_2(i + 1, M32, H, Q) + p2; + precalc = precalc - Q[i - 16]; + precalc2 = precalc2 - Q[i + 1 - 16]; + } + - #pragma nounroll - for (int i = 18; i<32; i++) - Q[i] = expand32_2(i, M32, H, Q); /* Blue Midnight Wish has two temporary cummulative variables that accumulate via XORing */ /* 16 new variables that 
are prooduced in the Message Expansion part. */ XL32 = Q[16] ^ Q[17] ^ Q[18] ^ Q[19] ^ Q[20] ^ Q[21] ^ Q[22] ^ Q[23]; - XH32 = XL32 ^ Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31]; + XH32 = XL32^Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31]; + + + M32[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ M32[2]) + (XL32 ^ Q[26] ^ Q[2]); + M32[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ M32[3]) + (XL32 ^ Q[27] ^ Q[3]); + M32[14] = SPH_ROTL32(M32[2], 15) + (XH32 ^ Q[30] ^ M32[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]); + M32[15] = SPH_ROTL32(M32[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]); + - M32[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ M32[2]) + (XL32 ^ Q[26] ^ Q[2]); - M32[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ M32[3]) + (XL32 ^ Q[27] ^ Q[3]); - M32[14] = SPH_ROTL32(M32[2], 15) + (XH32 ^ Q[30] ^ M32[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]); - M32[15] = SPH_ROTL32(M32[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]); } #define TPB 512 __global__ __launch_bounds__(TPB, 2) -void bmw256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *const __restrict__ nonceVector) +void bmw256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash, uint32_t *const __restrict__ nonceVector, uint32_t Target) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t message[16] = { 0 }; - - LOHI(message[0], message[1], __ldg(&g_hash[thread])); - LOHI(message[2], message[3], __ldg(&g_hash[thread + 1 * threads])); - LOHI(message[4], message[5], __ldg(&g_hash[thread + 2 * threads])); - LOHI(message[6], message[7], __ldg(&g_hash[thread + 3 * threads])); - - message[8]=0x80; - message[14]=0x100; - Compression256(message); - Compression256_2(message); - - if (((uint64_t*)message)[7] <= pTarget[3]) + uint2 message[8] = { 0 }; + + message[0] = __ldg(&g_hash[thread + 0 * threads]); + message[1] = __ldg(&g_hash[thread + 1 * threads]); + message[2] = __ldg(&g_hash[thread + 2 * threads]); + message[3] = __ldg(&g_hash[thread + 3 * threads]); + //LOHI(message[2], message[3], __ldg(&g_hash[thread + 1 * threads])); + //LOHI(message[4], message[5], __ldg(&g_hash[thread + 2 * threads])); + //LOHI(message[6], message[7], __ldg(&g_hash[thread + 3 * threads])); + + message[4].x = 0x80; + message[7].x = 0x100; + Compression256((uint32_t*)message); + Compression256_2((uint32_t*)message); + + if (message[7].y <= Target) { uint32_t tmp = atomicExch(&nonceVector[0], startNounce + thread); if (tmp != 0) @@ -247,7 +348,7 @@ void bmw256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *g_hash } __host__ -void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces) +void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces, uint32_t Target) { const uint32_t threadsperblock = TPB; dim3 grid((threads + threadsperblock - 1) / threadsperblock); @@ -255,13 +356,12 @@ void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint cudaMemset(d_GNonce[thr_id], 0, 2 * sizeof(uint32_t)); - bmw256_gpu_hash_32 << > >(threads, startNounce, g_hash, d_GNonce[thr_id]); + bmw256_gpu_hash_32 << > >(threads, startNounce, (uint2*)g_hash, d_GNonce[thr_id], Target); cudaMemcpy(d_gnounce[thr_id], d_GNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost); resultnonces[0] = *(d_gnounce[thr_id]); resultnonces[1] = *(d_gnounce[thr_id] + 1); } - __host__ void bmw256_cpu_init(int thr_id, uint32_t 
threads) { @@ -276,8 +376,10 @@ void bmw256_cpu_free(int thr_id) cudaFreeHost(d_gnounce[thr_id]); } +/* __host__ void bmw256_setTarget(const void *pTargetIn) { cudaMemcpyToSymbol(pTarget, pTargetIn, 32, 0, cudaMemcpyHostToDevice); } +*/ \ No newline at end of file diff --git a/Algo256/cuda_cubehash256.cu b/Algo256/cuda_cubehash256.cu index 76b9c52..ed889e5 100644 --- a/Algo256/cuda_cubehash256.cu +++ b/Algo256/cuda_cubehash256.cu @@ -3,179 +3,247 @@ #define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */ #define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */ +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#define __CUDA_ARCH__ 520 +#endif + #if __CUDA_ARCH__ < 350 #define LROT(x,bits) ((x << bits) | (x >> (32 - bits))) #else #define LROT(x, bits) __funnelshift_l(x, x, bits) #endif -#if __CUDA_ARCH__ < 500 -#define TPB 576 -#else -#define TPB 1024 -#endif +#define TPB35 576 +#define TPB50 1024 #define ROTATEUPWARDS7(a) LROT(a,7) #define ROTATEUPWARDS11(a) LROT(a,11) -//#define SWAP(a,b) { uint32_t u = a; a = b; b = u; } -#define SWAP(a,b) { a ^= b; b ^= a; a ^= b; } - __device__ __forceinline__ void rrounds(uint32_t x[2][2][2][2][2]) { int r; - int j; - int k; - int l; - int m; - - #pragma unroll 2 - for (r = 0; r < CUBEHASH_ROUNDS; ++r) { - - /* "add x_0jklm into x_1jklmn modulo 2^32" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[1][j][k][l][m] += x[0][j][k][l][m]; + uint32_t x0[2][2][2][2]; + uint32_t x1[2][2][2][2]; + + for (r = 0; r < CUBEHASH_ROUNDS; r += 2) { /* "rotate x_0jklm upwards by 7 bits" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]); - - /* "swap x_00klm with x_01klm" */ -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - SWAP(x[0][0][k][l][m], x[0][1][k][l][m]) - - /* "xor x_1jklm into x_0jklm" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[0][j][k][l][m] ^= x[1][j][k][l][m]; - - /* "swap x_1jk0m with x_1jk1m" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - SWAP(x[1][j][k][0][m], x[1][j][k][1][m]) - - /* "add x_0jklm into x_1jklm modulo 2^32" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[1][j][k][l][m] += x[0][j][k][l][m]; + x0[0][0][0][0] = ROTATEUPWARDS7(x[0][0][0][0][0]); + x0[0][0][0][1] = ROTATEUPWARDS7(x[0][0][0][0][1]); + x0[0][0][1][0] = ROTATEUPWARDS7(x[0][0][0][1][0]); + x0[0][0][1][1] = ROTATEUPWARDS7(x[0][0][0][1][1]); + x0[0][1][0][0] = ROTATEUPWARDS7(x[0][0][1][0][0]); + x0[0][1][0][1] = ROTATEUPWARDS7(x[0][0][1][0][1]); + x0[0][1][1][0] = ROTATEUPWARDS7(x[0][0][1][1][0]); + x0[0][1][1][1] = ROTATEUPWARDS7(x[0][0][1][1][1]); + x0[1][0][0][0] = ROTATEUPWARDS7(x[0][1][0][0][0]); + x0[1][0][0][1] = ROTATEUPWARDS7(x[0][1][0][0][1]); + x0[1][0][1][0] = ROTATEUPWARDS7(x[0][1][0][1][0]); + x0[1][0][1][1] = ROTATEUPWARDS7(x[0][1][0][1][1]); + x0[1][1][0][0] = 
ROTATEUPWARDS7(x[0][1][1][0][0]); + x0[1][1][0][1] = ROTATEUPWARDS7(x[0][1][1][0][1]); + x0[1][1][1][0] = ROTATEUPWARDS7(x[0][1][1][1][0]); + x0[1][1][1][1] = ROTATEUPWARDS7(x[0][1][1][1][1]); + + /* "add x_0jklm into x_1jklm modulo 2^32" */ + x1[0][0][0][0] = x[1][0][0][0][0] + x[0][0][0][0][0]; + x1[0][0][0][1] = x[1][0][0][0][1] + x[0][0][0][0][1]; + x1[0][0][1][0] = x[1][0][0][1][0] + x[0][0][0][1][0]; + x1[0][0][1][1] = x[1][0][0][1][1] + x[0][0][0][1][1]; + x1[0][1][0][0] = x[1][0][1][0][0] + x[0][0][1][0][0]; + x1[0][1][0][1] = x[1][0][1][0][1] + x[0][0][1][0][1]; + x1[0][1][1][0] = x[1][0][1][1][0] + x[0][0][1][1][0]; + x1[0][1][1][1] = x[1][0][1][1][1] + x[0][0][1][1][1]; + x1[1][0][0][0] = x[1][1][0][0][0] + x[0][1][0][0][0]; + x1[1][0][0][1] = x[1][1][0][0][1] + x[0][1][0][0][1]; + x1[1][0][1][0] = x[1][1][0][1][0] + x[0][1][0][1][0]; + x1[1][0][1][1] = x[1][1][0][1][1] + x[0][1][0][1][1]; + x1[1][1][0][0] = x[1][1][1][0][0] + x[0][1][1][0][0]; + x1[1][1][0][1] = x[1][1][1][0][1] + x[0][1][1][0][1]; + x1[1][1][1][0] = x[1][1][1][1][0] + x[0][1][1][1][0]; + x1[1][1][1][1] = x[1][1][1][1][1] + x[0][1][1][1][1]; + + /* "xor x_1~jklm into x_0jklm" */ + x[0][0][0][0][0] = x0[0][0][0][0] ^ x1[1][0][0][0]; + x[0][0][0][0][1] = x0[0][0][0][1] ^ x1[1][0][0][1]; + x[0][0][0][1][0] = x0[0][0][1][0] ^ x1[1][0][1][0]; + x[0][0][0][1][1] = x0[0][0][1][1] ^ x1[1][0][1][1]; + x[0][0][1][0][0] = x0[0][1][0][0] ^ x1[1][1][0][0]; + x[0][0][1][0][1] = x0[0][1][0][1] ^ x1[1][1][0][1]; + x[0][0][1][1][0] = x0[0][1][1][0] ^ x1[1][1][1][0]; + x[0][0][1][1][1] = x0[0][1][1][1] ^ x1[1][1][1][1]; + x[0][1][0][0][0] = x0[1][0][0][0] ^ x1[0][0][0][0]; + x[0][1][0][0][1] = x0[1][0][0][1] ^ x1[0][0][0][1]; + x[0][1][0][1][0] = x0[1][0][1][0] ^ x1[0][0][1][0]; + x[0][1][0][1][1] = x0[1][0][1][1] ^ x1[0][0][1][1]; + x[0][1][1][0][0] = x0[1][1][0][0] ^ x1[0][1][0][0]; + x[0][1][1][0][1] = x0[1][1][0][1] ^ x1[0][1][0][1]; + x[0][1][1][1][0] = x0[1][1][1][0] ^ x1[0][1][1][0]; + x[0][1][1][1][1] = x0[1][1][1][1] ^ x1[0][1][1][1]; /* "rotate x_0jklm upwards by 11 bits" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]); - - /* "swap x_0j0lm with x_0j1lm" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - SWAP(x[0][j][0][l][m], x[0][j][1][l][m]) - - /* "xor x_1jklm into x_0jklm" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[0][j][k][l][m] ^= x[1][j][k][l][m]; - - /* "swap x_1jkl0 with x_1jkl1" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) - SWAP(x[1][j][k][l][0], x[1][j][k][l][1]) + x0[0][0][0][0] = ROTATEUPWARDS11(x[0][0][0][0][0]); + x0[0][0][0][1] = ROTATEUPWARDS11(x[0][0][0][0][1]); + x0[0][0][1][0] = ROTATEUPWARDS11(x[0][0][0][1][0]); + x0[0][0][1][1] = ROTATEUPWARDS11(x[0][0][0][1][1]); + x0[0][1][0][0] = ROTATEUPWARDS11(x[0][0][1][0][0]); + x0[0][1][0][1] = ROTATEUPWARDS11(x[0][0][1][0][1]); + x0[0][1][1][0] = ROTATEUPWARDS11(x[0][0][1][1][0]); + x0[0][1][1][1] = ROTATEUPWARDS11(x[0][0][1][1][1]); + x0[1][0][0][0] = ROTATEUPWARDS11(x[0][1][0][0][0]); + x0[1][0][0][1] = ROTATEUPWARDS11(x[0][1][0][0][1]); + x0[1][0][1][0] = 
ROTATEUPWARDS11(x[0][1][0][1][0]); + x0[1][0][1][1] = ROTATEUPWARDS11(x[0][1][0][1][1]); + x0[1][1][0][0] = ROTATEUPWARDS11(x[0][1][1][0][0]); + x0[1][1][0][1] = ROTATEUPWARDS11(x[0][1][1][0][1]); + x0[1][1][1][0] = ROTATEUPWARDS11(x[0][1][1][1][0]); + x0[1][1][1][1] = ROTATEUPWARDS11(x[0][1][1][1][1]); + + /* "add x_0jklm into x_1~jk~lm modulo 2^32" */ + x[1][1][0][1][0] = x1[1][0][1][0] + x[0][0][0][0][0]; + x[1][1][0][1][1] = x1[1][0][1][1] + x[0][0][0][0][1]; + x[1][1][0][0][0] = x1[1][0][0][0] + x[0][0][0][1][0]; + x[1][1][0][0][1] = x1[1][0][0][1] + x[0][0][0][1][1]; + x[1][1][1][1][0] = x1[1][1][1][0] + x[0][0][1][0][0]; + x[1][1][1][1][1] = x1[1][1][1][1] + x[0][0][1][0][1]; + x[1][1][1][0][0] = x1[1][1][0][0] + x[0][0][1][1][0]; + x[1][1][1][0][1] = x1[1][1][0][1] + x[0][0][1][1][1]; + x[1][0][0][1][0] = x1[0][0][1][0] + x[0][1][0][0][0]; + x[1][0][0][1][1] = x1[0][0][1][1] + x[0][1][0][0][1]; + x[1][0][0][0][0] = x1[0][0][0][0] + x[0][1][0][1][0]; + x[1][0][0][0][1] = x1[0][0][0][1] + x[0][1][0][1][1]; + x[1][0][1][1][0] = x1[0][1][1][0] + x[0][1][1][0][0]; + x[1][0][1][1][1] = x1[0][1][1][1] + x[0][1][1][0][1]; + x[1][0][1][0][0] = x1[0][1][0][0] + x[0][1][1][1][0]; + x[1][0][1][0][1] = x1[0][1][0][1] + x[0][1][1][1][1]; + + /* "xor x_1~j~k~lm into x_0jklm" */ + x[0][0][0][0][0] = x0[0][0][0][0] ^ x[1][1][1][1][0]; + x[0][0][0][0][1] = x0[0][0][0][1] ^ x[1][1][1][1][1]; + x[0][0][0][1][0] = x0[0][0][1][0] ^ x[1][1][1][0][0]; + x[0][0][0][1][1] = x0[0][0][1][1] ^ x[1][1][1][0][1]; + x[0][0][1][0][0] = x0[0][1][0][0] ^ x[1][1][0][1][0]; + x[0][0][1][0][1] = x0[0][1][0][1] ^ x[1][1][0][1][1]; + x[0][0][1][1][0] = x0[0][1][1][0] ^ x[1][1][0][0][0]; + x[0][0][1][1][1] = x0[0][1][1][1] ^ x[1][1][0][0][1]; + x[0][1][0][0][0] = x0[1][0][0][0] ^ x[1][0][1][1][0]; + x[0][1][0][0][1] = x0[1][0][0][1] ^ x[1][0][1][1][1]; + x[0][1][0][1][0] = x0[1][0][1][0] ^ x[1][0][1][0][0]; + x[0][1][0][1][1] = x0[1][0][1][1] ^ x[1][0][1][0][1]; + x[0][1][1][0][0] = x0[1][1][0][0] ^ x[1][0][0][1][0]; + x[0][1][1][0][1] = x0[1][1][0][1] ^ x[1][0][0][1][1]; + x[0][1][1][1][0] = x0[1][1][1][0] ^ x[1][0][0][0][0]; + x[0][1][1][1][1] = x0[1][1][1][1] ^ x[1][0][0][0][1]; - } -} - -__device__ __forceinline__ void block_tox(const uint32_t *in, uint32_t x[2][2][2][2][2]) -{ - x[0][0][0][0][0] ^= in[0]; - x[0][0][0][0][1] ^= in[1]; - x[0][0][0][1][0] ^= in[2]; - x[0][0][0][1][1] ^= in[3]; - x[0][0][1][0][0] ^= in[4]; - x[0][0][1][0][1] ^= in[5]; - x[0][0][1][1][0] ^= in[6]; - x[0][0][1][1][1] ^= in[7]; -} - -__device__ __forceinline__ void hash_fromx(uint32_t *out, uint32_t x[2][2][2][2][2]) -{ - out[0] = x[0][0][0][0][0]; - out[1] = x[0][0][0][0][1]; - out[2] = x[0][0][0][1][0]; - out[3] = x[0][0][0][1][1]; - out[4] = x[0][0][1][0][0]; - out[5] = x[0][0][1][0][1]; - out[6] = x[0][0][1][1][0]; - out[7] = x[0][0][1][1][1]; - -} - -__device__ __forceinline__ -void Update32(uint32_t x[2][2][2][2][2], const uint32_t *data) -{ - /* "xor the block into the first b bytes of the state" */ - /* "and then transform the state invertibly through r identical rounds" */ - block_tox(data, x); - rrounds(x); -} + /* "rotate x_0jklm upwards by 7 bits" */ + x0[0][0][0][0] = ROTATEUPWARDS7(x[0][0][0][0][0]); + x0[0][0][0][1] = ROTATEUPWARDS7(x[0][0][0][0][1]); + x0[0][0][1][0] = ROTATEUPWARDS7(x[0][0][0][1][0]); + x0[0][0][1][1] = ROTATEUPWARDS7(x[0][0][0][1][1]); + x0[0][1][0][0] = ROTATEUPWARDS7(x[0][0][1][0][0]); + x0[0][1][0][1] = ROTATEUPWARDS7(x[0][0][1][0][1]); + x0[0][1][1][0] = ROTATEUPWARDS7(x[0][0][1][1][0]); + 
x0[0][1][1][1] = ROTATEUPWARDS7(x[0][0][1][1][1]); + x0[1][0][0][0] = ROTATEUPWARDS7(x[0][1][0][0][0]); + x0[1][0][0][1] = ROTATEUPWARDS7(x[0][1][0][0][1]); + x0[1][0][1][0] = ROTATEUPWARDS7(x[0][1][0][1][0]); + x0[1][0][1][1] = ROTATEUPWARDS7(x[0][1][0][1][1]); + x0[1][1][0][0] = ROTATEUPWARDS7(x[0][1][1][0][0]); + x0[1][1][0][1] = ROTATEUPWARDS7(x[0][1][1][0][1]); + x0[1][1][1][0] = ROTATEUPWARDS7(x[0][1][1][1][0]); + x0[1][1][1][1] = ROTATEUPWARDS7(x[0][1][1][1][1]); + + /* "add x_0jklm into x_1~j~k~l~m modulo 2^32" */ + x1[1][1][1][1] = x[1][1][1][1][1] + x[0][0][0][0][0]; + x1[1][1][1][0] = x[1][1][1][1][0] + x[0][0][0][0][1]; + x1[1][1][0][1] = x[1][1][1][0][1] + x[0][0][0][1][0]; + x1[1][1][0][0] = x[1][1][1][0][0] + x[0][0][0][1][1]; + x1[1][0][1][1] = x[1][1][0][1][1] + x[0][0][1][0][0]; + x1[1][0][1][0] = x[1][1][0][1][0] + x[0][0][1][0][1]; + x1[1][0][0][1] = x[1][1][0][0][1] + x[0][0][1][1][0]; + x1[1][0][0][0] = x[1][1][0][0][0] + x[0][0][1][1][1]; + x1[0][1][1][1] = x[1][0][1][1][1] + x[0][1][0][0][0]; + x1[0][1][1][0] = x[1][0][1][1][0] + x[0][1][0][0][1]; + x1[0][1][0][1] = x[1][0][1][0][1] + x[0][1][0][1][0]; + x1[0][1][0][0] = x[1][0][1][0][0] + x[0][1][0][1][1]; + x1[0][0][1][1] = x[1][0][0][1][1] + x[0][1][1][0][0]; + x1[0][0][1][0] = x[1][0][0][1][0] + x[0][1][1][0][1]; + x1[0][0][0][1] = x[1][0][0][0][1] + x[0][1][1][1][0]; + x1[0][0][0][0] = x[1][0][0][0][0] + x[0][1][1][1][1]; + + /* "xor x_1j~k~l~m into x_0jklm" */ + x[0][0][0][0][0] = x0[0][0][0][0] ^ x1[0][1][1][1]; + x[0][0][0][0][1] = x0[0][0][0][1] ^ x1[0][1][1][0]; + x[0][0][0][1][0] = x0[0][0][1][0] ^ x1[0][1][0][1]; + x[0][0][0][1][1] = x0[0][0][1][1] ^ x1[0][1][0][0]; + x[0][0][1][0][0] = x0[0][1][0][0] ^ x1[0][0][1][1]; + x[0][0][1][0][1] = x0[0][1][0][1] ^ x1[0][0][1][0]; + x[0][0][1][1][0] = x0[0][1][1][0] ^ x1[0][0][0][1]; + x[0][0][1][1][1] = x0[0][1][1][1] ^ x1[0][0][0][0]; + x[0][1][0][0][0] = x0[1][0][0][0] ^ x1[1][1][1][1]; + x[0][1][0][0][1] = x0[1][0][0][1] ^ x1[1][1][1][0]; + x[0][1][0][1][0] = x0[1][0][1][0] ^ x1[1][1][0][1]; + x[0][1][0][1][1] = x0[1][0][1][1] ^ x1[1][1][0][0]; + x[0][1][1][0][0] = x0[1][1][0][0] ^ x1[1][0][1][1]; + x[0][1][1][0][1] = x0[1][1][0][1] ^ x1[1][0][1][0]; + x[0][1][1][1][0] = x0[1][1][1][0] ^ x1[1][0][0][1]; + x[0][1][1][1][1] = x0[1][1][1][1] ^ x1[1][0][0][0]; -__device__ __forceinline__ -void Update32_const(uint32_t x[2][2][2][2][2]) -{ - x[0][0][0][0][0] ^= 0x80; - rrounds(x); + /* "rotate x_0jklm upwards by 11 bits" */ + x0[0][0][0][0] = ROTATEUPWARDS11(x[0][0][0][0][0]); + x0[0][0][0][1] = ROTATEUPWARDS11(x[0][0][0][0][1]); + x0[0][0][1][0] = ROTATEUPWARDS11(x[0][0][0][1][0]); + x0[0][0][1][1] = ROTATEUPWARDS11(x[0][0][0][1][1]); + x0[0][1][0][0] = ROTATEUPWARDS11(x[0][0][1][0][0]); + x0[0][1][0][1] = ROTATEUPWARDS11(x[0][0][1][0][1]); + x0[0][1][1][0] = ROTATEUPWARDS11(x[0][0][1][1][0]); + x0[0][1][1][1] = ROTATEUPWARDS11(x[0][0][1][1][1]); + x0[1][0][0][0] = ROTATEUPWARDS11(x[0][1][0][0][0]); + x0[1][0][0][1] = ROTATEUPWARDS11(x[0][1][0][0][1]); + x0[1][0][1][0] = ROTATEUPWARDS11(x[0][1][0][1][0]); + x0[1][0][1][1] = ROTATEUPWARDS11(x[0][1][0][1][1]); + x0[1][1][0][0] = ROTATEUPWARDS11(x[0][1][1][0][0]); + x0[1][1][0][1] = ROTATEUPWARDS11(x[0][1][1][0][1]); + x0[1][1][1][0] = ROTATEUPWARDS11(x[0][1][1][1][0]); + x0[1][1][1][1] = ROTATEUPWARDS11(x[0][1][1][1][1]); + + /* "add x_0jklm into x_1j~kl~m modulo 2^32" */ + x[1][0][1][0][1] = x1[0][1][0][1] + x[0][0][0][0][0]; + x[1][0][1][0][0] = x1[0][1][0][0] + x[0][0][0][0][1]; + x[1][0][1][1][1] = 
x1[0][1][1][1] + x[0][0][0][1][0]; + x[1][0][1][1][0] = x1[0][1][1][0] + x[0][0][0][1][1]; + x[1][0][0][0][1] = x1[0][0][0][1] + x[0][0][1][0][0]; + x[1][0][0][0][0] = x1[0][0][0][0] + x[0][0][1][0][1]; + x[1][0][0][1][1] = x1[0][0][1][1] + x[0][0][1][1][0]; + x[1][0][0][1][0] = x1[0][0][1][0] + x[0][0][1][1][1]; + x[1][1][1][0][1] = x1[1][1][0][1] + x[0][1][0][0][0]; + x[1][1][1][0][0] = x1[1][1][0][0] + x[0][1][0][0][1]; + x[1][1][1][1][1] = x1[1][1][1][1] + x[0][1][0][1][0]; + x[1][1][1][1][0] = x1[1][1][1][0] + x[0][1][0][1][1]; + x[1][1][0][0][1] = x1[1][0][0][1] + x[0][1][1][0][0]; + x[1][1][0][0][0] = x1[1][0][0][0] + x[0][1][1][0][1]; + x[1][1][0][1][1] = x1[1][0][1][1] + x[0][1][1][1][0]; + x[1][1][0][1][0] = x1[1][0][1][0] + x[0][1][1][1][1]; + + /* "xor x_1jkl~m into x_0jklm" */ + x[0][0][0][0][0] = x0[0][0][0][0] ^ x[1][0][0][0][1]; + x[0][0][0][0][1] = x0[0][0][0][1] ^ x[1][0][0][0][0]; + x[0][0][0][1][0] = x0[0][0][1][0] ^ x[1][0][0][1][1]; + x[0][0][0][1][1] = x0[0][0][1][1] ^ x[1][0][0][1][0]; + x[0][0][1][0][0] = x0[0][1][0][0] ^ x[1][0][1][0][1]; + x[0][0][1][0][1] = x0[0][1][0][1] ^ x[1][0][1][0][0]; + x[0][0][1][1][0] = x0[0][1][1][0] ^ x[1][0][1][1][1]; + x[0][0][1][1][1] = x0[0][1][1][1] ^ x[1][0][1][1][0]; + x[0][1][0][0][0] = x0[1][0][0][0] ^ x[1][1][0][0][1]; + x[0][1][0][0][1] = x0[1][0][0][1] ^ x[1][1][0][0][0]; + x[0][1][0][1][0] = x0[1][0][1][0] ^ x[1][1][0][1][1]; + x[0][1][0][1][1] = x0[1][0][1][1] ^ x[1][1][0][1][0]; + x[0][1][1][0][0] = x0[1][1][0][0] ^ x[1][1][1][0][1]; + x[0][1][1][0][1] = x0[1][1][0][1] ^ x[1][1][1][0][0]; + x[0][1][1][1][0] = x0[1][1][1][0] ^ x[1][1][1][1][1]; + x[0][1][1][1][1] = x0[1][1][1][1] ^ x[1][1][1][1][0]; + } } __device__ __forceinline__ @@ -185,27 +253,44 @@ void Final(uint32_t x[2][2][2][2][2], uint32_t *hashval) x[1][1][1][1][1] ^= 1U; /* "the state is then transformed invertibly through 10r identical rounds" */ - #pragma unroll 2 for (int i = 0; i < 10; ++i) rrounds(x); /* "output the first h/8 bytes of the state" */ - hash_fromx(hashval, x); + hashval[0] = x[0][0][0][0][0]; + hashval[1] = x[0][0][0][0][1]; + hashval[2] = x[0][0][0][1][0]; + hashval[3] = x[0][0][0][1][1]; + hashval[4] = x[0][0][1][0][0]; + hashval[5] = x[0][0][1][0][1]; + hashval[6] = x[0][0][1][1][0]; + hashval[7] = x[0][0][1][1][1]; } #if __CUDA_ARCH__ >= 500 - -__global__ __launch_bounds__(TPB, 1) +__global__ __launch_bounds__(TPB50, 1) +#else +__global__ __launch_bounds__(TPB35, 1) +#endif void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { +#if __CUDA_ARCH__ >= 500 uint2 Hash[4]; Hash[0] = __ldg(&g_hash[thread]); Hash[1] = __ldg(&g_hash[thread + 1 * threads]); Hash[2] = __ldg(&g_hash[thread + 2 * threads]); Hash[3] = __ldg(&g_hash[thread + 3 * threads]); +#else + uint32_t Hash[8]; + + LOHI(Hash[0], Hash[1], __ldg(&((uint64_t*)g_hash)[thread])); + LOHI(Hash[2], Hash[3], __ldg(&((uint64_t*)g_hash)[thread + 1 * threads])); + LOHI(Hash[4], Hash[5], __ldg(&((uint64_t*)g_hash)[thread + 2 * threads])); + LOHI(Hash[6], Hash[7], __ldg(&((uint64_t*)g_hash)[thread + 3 * threads])); +#endif uint32_t x[2][2][2][2][2] = { @@ -219,6 +304,7 @@ void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_ha 0x15815AEB, 0x4AB6AAD6, 0x9CDAF8AF, 0xD6032C0A }; +#if __CUDA_ARCH__ >= 500 x[0][0][0][0][0] ^= Hash[0].x; x[0][0][0][0][1] ^= Hash[0].y; x[0][0][0][1][0] ^= Hash[1].x; @@ -227,48 +313,7 @@ void cubehash256_gpu_hash_32(uint32_t 
threads, uint32_t startNounce, uint2 *g_ha x[0][0][1][0][1] ^= Hash[2].y; x[0][0][1][1][0] ^= Hash[3].x; x[0][0][1][1][1] ^= Hash[3].y; - - rrounds(x); - x[0][0][0][0][0] ^= 0x80U; - rrounds(x); - - Final(x, (uint32_t*) Hash); - - g_hash[thread] = Hash[0]; - g_hash[1 * threads + thread] = Hash[1]; - g_hash[2 * threads + thread] = Hash[2]; - g_hash[3 * threads + thread] = Hash[3]; - } -} - #else - -__global__ __launch_bounds__(TPB, 1) -void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *d_hash) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t Hash[8]; - uint64_t* g_hash = (uint64_t*) d_hash; - - LOHI(Hash[0], Hash[1], __ldg(&g_hash[thread])); - LOHI(Hash[2], Hash[3], __ldg(&g_hash[thread + 1 * threads])); - LOHI(Hash[4], Hash[5], __ldg(&g_hash[thread + 2 * threads])); - LOHI(Hash[6], Hash[7], __ldg(&g_hash[thread + 3 * threads])); - - uint32_t x[2][2][2][2][2] = - { - 0xEA2BD4B4, 0xCCD6F29F, 0x63117E71, 0x35481EAE, - 0x22512D5B, 0xE5D94E63, 0x7E624131, 0xF4CC12BE, - 0xC2D0B696, 0x42AF2070, 0xD0720C35, 0x3361DA8C, - 0x28CCECA4, 0x8EF8AD83, 0x4680AC00, 0x40E5FBAB, - 0xD89041C3, 0x6107FBD5, 0x6C859D41, 0xF0B26679, - 0x09392549, 0x5FA25603, 0x65C892FD, 0x93CB6285, - 0x2AF2B5AE, 0x9E4B4E60, 0x774ABFDD, 0x85254725, - 0x15815AEB, 0x4AB6AAD6, 0x9CDAF8AF, 0xD6032C0A - }; - x[0][0][0][0][0] ^= Hash[0]; x[0][0][0][0][1] ^= Hash[1]; x[0][0][0][1][0] ^= Hash[2]; @@ -277,29 +322,48 @@ void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *d_ha x[0][0][1][0][1] ^= Hash[5]; x[0][0][1][1][0] ^= Hash[6]; x[0][0][1][1][1] ^= Hash[7]; - +#endif rrounds(x); x[0][0][0][0][0] ^= 0x80U; rrounds(x); +#if __CUDA_ARCH__ >= 500 + Final(x, (uint32_t*)Hash); + + g_hash[thread] = Hash[0]; + g_hash[1 * threads + thread] = Hash[1]; + g_hash[2 * threads + thread] = Hash[2]; + g_hash[3 * threads + thread] = Hash[3]; +#else Final(x, Hash); - g_hash[thread] = ((uint64_t*)Hash)[0]; - g_hash[1 * threads + thread] = ((uint64_t*)Hash)[1]; - g_hash[2 * threads + thread] = ((uint64_t*)Hash)[2]; - g_hash[3 * threads + thread] = ((uint64_t*)Hash)[3]; + ((uint64_t*)g_hash)[thread] = ((uint64_t*)Hash)[0]; + ((uint64_t*)g_hash)[1 * threads + thread] = ((uint64_t*)Hash)[1]; + ((uint64_t*)g_hash)[2 * threads + thread] = ((uint64_t*)Hash)[2]; + ((uint64_t*)g_hash)[3 * threads + thread] = ((uint64_t*)Hash)[3]; +#endif } } -#endif - __host__ void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order) { - uint32_t tpb = TPB; + uint32_t tpb = TPB35; + if (cuda_arch[thr_id] >= 500) tpb = TPB50; + + dim3 grid((threads + tpb - 1) / tpb); + dim3 block(tpb); + + cubehash256_gpu_hash_32 << > > (threads, startNounce, (uint2*)d_hash); +} +__host__ +void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order, cudaStream_t stream) +{ + uint32_t tpb = TPB35; + if (cuda_arch[thr_id] >= 500) tpb = TPB50; - dim3 grid((threads + tpb-1)/tpb); + dim3 grid((threads + tpb - 1) / tpb); dim3 block(tpb); - cubehash256_gpu_hash_32 <<>> (threads, startNounce, (uint2*) d_hash); + cubehash256_gpu_hash_32 << > > (threads, startNounce, (uint2*)d_hash); } diff --git a/Algo256/cuda_skein256.cu b/Algo256/cuda_skein256.cu index cbeb660..44b3dad 100644 --- a/Algo256/cuda_skein256.cu +++ b/Algo256/cuda_skein256.cu @@ -13,40 +13,296 @@ void Round512v35(uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p } __forceinline__ __device__ -void Round_8_512v35(const uint2 *const 
__restrict__ ks, const uint2 *const __restrict__ ts, - uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, int R) +void Round_8_512v35_1(const uint2 ks[9], const uint2 ts[3], +uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7) { Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); - Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); - p0 += ks[(R+0) % 9]; - p1 += ks[(R+1) % 9]; - p2 += ks[(R+2) % 9]; - p3 += ks[(R+3) % 9]; - p4 += ks[(R+4) % 9]; - p5 += ks[(R+5) % 9] + ts[(R+0) % 3]; - p6 += ks[(R+6) % 9] + ts[(R+1) % 3]; - p7 += ks[(R+7) % 9] + make_uint2(R, 0); + p0 += ks[1]; + p1 += ks[2]; + p2 += ks[3]; + p3 += ks[4]; + p4 += ks[5]; + p5 += ks[6] + ts[1]; + p6 += ks[7] + ts[2]; + p7 += ks[8] + make_uint2(1, 0); Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); - Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); + + p0 += ks[2]; + p1 += ks[3]; + p2 += ks[4]; + p3 += ks[5]; + p4 += ks[6]; + p5 += ks[7] + ts[2]; + p6 += ks[8] + ts[0]; + p7 += ks[0] + make_uint2(2, 0); +} +__forceinline__ __device__ +void Round_8_512v35_3(const uint2 ks[9], const uint2 ts[3], +uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7) +{ + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); - p0 += ks[(R+1) % 9]; - p1 += ks[(R+2) % 9]; - p2 += ks[(R+3) % 9]; - p3 += ks[(R+4) % 9]; - p4 += ks[(R+5) % 9]; - p5 += ks[(R+6) % 9] + ts[(R+1) % 3]; - p6 += ks[(R+7) % 9] + ts[(R+2) % 3]; - p7 += ks[(R+8) % 9] + make_uint2(R+1, 0); + p0 += ks[3]; + p1 += ks[4]; + p2 += ks[5]; + p3 += ks[6]; + p4 += ks[7]; + p5 += ks[8] + ts[0]; + p6 += ks[0] + ts[1]; + p7 += ks[1] + make_uint2(3, 0); + + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); + + p0 += ks[4]; + p1 += ks[5]; + p2 += ks[6]; + p3 += ks[7]; + p4 += ks[8]; + p5 += ks[0] + ts[1]; + p6 += ks[1] + ts[2]; + p7 += ks[2] + make_uint2(4, 0); +} +__forceinline__ __device__ +void Round_8_512v35_5(const uint2 ks[9], const uint2 ts[3], +uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7) +{ + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); + + p0 += ks[5]; + p1 += ks[6]; + p2 += ks[7]; + p3 += ks[8]; + p4 += ks[0]; + p5 += ks[1] + ts[2]; + p6 += ks[2] + ts[0]; + p7 += ks[3] + make_uint2(5, 0); + + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); + + p0 += 
ks[6]; + p1 += ks[7]; + p2 += ks[8]; + p3 += ks[0]; + p4 += ks[1]; + p5 += ks[2] + ts[0]; + p6 += ks[3] + ts[1]; + p7 += ks[4] + make_uint2(6, 0); +} +__forceinline__ __device__ +void Round_8_512v35_7(const uint2 ks[9], const uint2 ts[3], +uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7) +{ + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); + + p0 += ks[7]; + p1 += ks[8]; + p2 += ks[0]; + p3 += ks[1]; + p4 += ks[2]; + p5 += ks[3] + ts[1]; + p6 += ks[4] + ts[2]; + p7 += ks[5] + make_uint2(7, 0); + + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); + + p0 += ks[8]; + p1 += ks[0]; + p2 += ks[1]; + p3 += ks[2]; + p4 += ks[3]; + p5 += ks[4] + ts[2]; + p6 += ks[5] + ts[0]; + p7 += ks[6] + make_uint2(8, 0); } +__forceinline__ __device__ +void Round_8_512v35_9(const uint2 ks[9], const uint2 ts[3], +uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7) +{ + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); + + p0 += ks[0]; + p1 += ks[1]; + p2 += ks[2]; + p3 += ks[3]; + p4 += ks[4]; + p5 += ks[5] + ts[0]; + p6 += ks[6] + ts[1]; + p7 += ks[7] + make_uint2(9, 0); + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); + + p0 += ks[1]; + p1 += ks[2]; + p2 += ks[3]; + p3 += ks[4]; + p4 += ks[5]; + p5 += ks[6] + ts[1]; + p6 += ks[7] + ts[2]; + p7 += ks[8] + make_uint2(10, 0); +} __forceinline__ __device__ -void Round_8_512v35_final(const uint2 *const __restrict__ ks, const uint2 *const __restrict__ ts, +void Round_8_512v35_11(const uint2 ks[9], const uint2 ts[3], +uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7) +{ + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); + + p0 += ks[2]; + p1 += ks[3]; + p2 += ks[4]; + p3 += ks[5]; + p4 += ks[6]; + p5 += ks[7] + ts[2]; + p6 += ks[8] + ts[0]; + p7 += ks[0] + make_uint2(11, 0); + + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); + + p0 += ks[3]; + p1 += ks[4]; + p2 += ks[5]; + p3 += ks[6]; + p4 += ks[7]; + p5 += ks[8] + ts[0]; + p6 += ks[0] + ts[1]; + p7 += ks[1] + make_uint2(12, 0); +} +__forceinline__ __device__ +void Round_8_512v35_13(const uint2 ks[9], const uint2 ts[3], +uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7) +{ + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 
14, 42); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); + + p0 += ks[4]; + p1 += ks[5]; + p2 += ks[6]; + p3 += ks[7]; + p4 += ks[8]; + p5 += ks[0] + ts[1]; + p6 += ks[1] + ts[2]; + p7 += ks[2] + make_uint2(13, 0); + + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); + + p0 += ks[5]; + p1 += ks[6]; + p2 += ks[7]; + p3 += ks[8]; + p4 += ks[0]; + p5 += ks[1] + ts[2]; + p6 += ks[2] + ts[0]; + p7 += ks[3] + make_uint2(14, 0); +} +__forceinline__ __device__ +void Round_8_512v35_15(const uint2 ks[9], const uint2 ts[3], +uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7) +{ + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); + + p0 += ks[6]; + p1 += ks[7]; + p2 += ks[8]; + p3 += ks[0]; + p4 += ks[1]; + p5 += ks[2] + ts[0]; + p6 += ks[3] + ts[1]; + p7 += ks[4] + make_uint2(15, 0); + + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); + + p0 += ks[7]; + p1 += ks[8]; + p2 += ks[0]; + p3 += ks[1]; + p4 += ks[2]; + p5 += ks[3] + ts[1]; + p6 += ks[4] + ts[2]; + p7 += ks[5] + make_uint2(16, 0); +} +__forceinline__ __device__ +void Round_8_512v35_17(const uint2 ks[9], const uint2 ts[3], +uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7) +{ + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); + + p0 += ks[8]; + p1 += ks[0]; + p2 += ks[1]; + p3 += ks[2]; + p4 += ks[3]; + p5 += ks[4] + ts[2]; + p6 += ks[5] + ts[0]; + p7 += ks[6] + make_uint2(17, 0); + + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); + + p0 += ks[0]; + p1 += ks[1]; + p2 += ks[2]; + p3 += ks[3]; + p4 += ks[4]; + p5 += ks[5] + ts[0]; + p6 += ks[6] + ts[1]; + p7 += ks[7] + make_uint2(18, 0); +} + +__forceinline__ __device__ +void Round_8_512v35_final(const uint2 ks[9], const uint2 ts[3], uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7) { Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); @@ -74,96 +330,88 @@ void Round_8_512v35_final(const uint2 *const __restrict__ ks, const uint2 *const p3 += ks[3]; } -__global__ __launch_bounds__(256,3) -void skein256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash) +__global__ __launch_bounds__(256, 4) +void skein256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHash) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint2 skein_ks_parity = { 0xA9FC1A22, 0x1BD11BDA }; + + const uint2 h2[9] = { + { 0x2FDB3E13, 0xCCD044A1 }, + { 0x1A79A9EB, 0xE8359030 }, + { 
0x4F816E6F, 0x55AEA061 }, + { 0xAE9B94DB, 0x2A2767A4 }, + { 0x74DD7683, 0xEC06025E }, + { 0xC4746251, 0xE7A436CD }, + { 0x393AD185, 0xC36FBAF9 }, + { 0x33EDFC13, 0x3EEDBA18 }, + { 0xC73A4E2A, 0xB69D3CFC } + }; + const uint2 t12[2][3] = { + { { 0x20, 0 }, + { 0, 0xf0000000 }, + { 0x20, 0xf0000000 } }, + { { 0x08, 0 }, + { 0, 0xff000000 }, + { 0x08, 0xff000000 } } + }; if (thread < threads) { - const uint2 skein_ks_parity = { 0xA9FC1A22, 0x1BD11BDA }; - const uint2 t12[6] = { - { 0x20, 0 }, - { 0, 0xf0000000 }, - { 0x20, 0xf0000000 }, - { 0x08, 0 }, - { 0, 0xff000000 }, - { 0x08, 0xff000000 } - }; - uint2 h[9] = { - { 0x2FDB3E13, 0xCCD044A1 }, - { 0x1A79A9EB, 0xE8359030 }, - { 0x4F816E6F, 0x55AEA061 }, - { 0xAE9B94DB, 0x2A2767A4 }, - { 0x74DD7683, 0xEC06025E }, - { 0xC4746251, 0xE7A436CD }, - { 0x393AD185, 0xC36FBAF9 }, - { 0x33EDFC13, 0x3EEDBA18 }, - { 0xC73A4E2A, 0xB69D3CFC } - }; uint2 dt0,dt1,dt2,dt3; uint2 p0, p1, p2, p3, p4, p5, p6, p7; - LOHI(dt0.x,dt0.y,outputHash[thread]); - LOHI(dt1.x,dt1.y,outputHash[threads+thread]); - LOHI(dt2.x,dt2.y,outputHash[2*threads+thread]); - LOHI(dt3.x,dt3.y,outputHash[3*threads+thread]); + dt0 = __ldg(&outputHash[0 * threads + thread]); + dt1 = __ldg(&outputHash[1 * threads + thread]); + dt2 = __ldg(&outputHash[2 * threads + thread]); + dt3 = __ldg(&outputHash[3 * threads + thread]); - p0 = h[0] + dt0; - p1 = h[1] + dt1; - p2 = h[2] + dt2; - p3 = h[3] + dt3; - p4 = h[4]; - p5 = h[5] + t12[0]; - p6 = h[6] + t12[1]; - p7 = h[7]; + p0 = h2[0] + dt0; + p1 = h2[1] + dt1; + p2 = h2[2] + dt2; + p3 = h2[3] + dt3; + p4 = h2[4]; + p5 = h2[5] + t12[0][0]; + p6 = h2[6] + t12[0][1]; + p7 = h2[7]; // forced unroll required - Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 1); - Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 3); - Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 5); - Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 7); - Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 9); - Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 11); - Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 13); - Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 15); - Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 17); + Round_8_512v35_1(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_3(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_5(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_7(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_9(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_11(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_13(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_15(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_17(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7); p0 ^= dt0; p1 ^= dt1; p2 ^= dt2; p3 ^= dt3; - h[0] = p0; - h[1] = p1; - h[2] = p2; - h[3] = p3; - h[4] = p4; - h[5] = p5; - h[6] = p6; - h[7] = p7; - h[8] = skein_ks_parity ^ h[0] ^ h[1] ^ h[2] ^ h[3] ^ h[4] ^ h[5] ^ h[6] ^ h[7]; + const uint2 h[9] = { p0, p1, p2, p3, p4, p5, p6, p7, skein_ks_parity ^ h[0] ^ h[1] ^ h[2] ^ h[3] ^ h[4] ^ h[5] ^ h[6] ^ h[7] }; - const uint2 *t = t12+3; - p5 += t12[3]; //p5 already equal h[5] - p6 += t12[4]; + p5 += t12[1][0]; //p5 already equal h[5] + p6 += t12[1][1]; // forced unroll - Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 1); - Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 3); - Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 5); - Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, 
p6, p7, 7); - Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 9); - Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 11); - Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 13); - Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 15); - Round_8_512v35_final(h, t, p0, p1, p2, p3, p4, p5, p6, p7); - - outputHash[thread] = devectorize(p0); - outputHash[threads+thread] = devectorize(p1); - outputHash[2*threads+thread] = devectorize(p2); - outputHash[3*threads+thread] = devectorize(p3); + Round_8_512v35_1(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_3(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_5(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_7(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_9(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_11(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_13(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_15(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_final(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7); + + outputHash[0 * threads + thread] = p0; + outputHash[1 * threads + thread] = p1; + outputHash[2 * threads + thread] = p2; + outputHash[3 * threads + thread] = p3; } } @@ -304,10 +552,27 @@ void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, ui // only 1kH/s perf change between kernels on a 960... if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300) - skein256_gpu_hash_32<<>>(threads, startNounce, d_outputHash); + skein256_gpu_hash_32 << > >(threads, startNounce, (uint2*)d_outputHash); else - skein256_gpu_hash_32_v30<<>>(threads, startNounce, d_outputHash); + skein256_gpu_hash_32_v30 << > >(threads, startNounce, d_outputHash); - MyStreamSynchronize(NULL, order, thr_id); + //MyStreamSynchronize(NULL, order, thr_id); } +__host__ +void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order, cudaStream_t stream) +{ + const uint32_t threadsperblock = 256; + int dev_id = device_map[thr_id]; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + // only 1kH/s perf change between kernels on a 960... 
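	// A minimal illustration of the dispatch this stream-aware wrapper is expected
	// to perform, assuming the standard CUDA triple-chevron launch syntax; passing
	// the stream argument and zero bytes of dynamic shared memory is an assumption:
	//
	//   if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300)
	//       skein256_gpu_hash_32 <<< grid, block, 0, stream >>> (threads, startNounce, (uint2*)d_outputHash);
	//   else
	//       skein256_gpu_hash_32_v30 <<< grid, block, 0, stream >>> (threads, startNounce, d_outputHash);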
+ if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300) + skein256_gpu_hash_32 << > >(threads, startNounce, (uint2*)d_outputHash); + else + skein256_gpu_hash_32_v30 << > >(threads, startNounce, d_outputHash); + + //MyStreamSynchronize(NULL, order, thr_id); +} diff --git a/ccminer.cpp b/ccminer.cpp index ec35a77..9123c64 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -83,6 +83,7 @@ bool opt_debug_threads = false; bool opt_protocol = false; bool opt_benchmark = false; bool opt_showdiff = false; +bool opt_eco_mode = false; // todo: limit use of these flags, // prefer the pools[] attributes @@ -91,6 +92,7 @@ bool have_longpoll = false; bool want_stratum = true; bool have_stratum = false; bool allow_gbt = true; +bool allow_getwork = true; bool allow_mininginfo = true; bool check_dups = false; bool check_stratum_jobs = false; @@ -165,6 +167,8 @@ char *short_url = NULL; struct stratum_ctx stratum = { 0 }; pthread_mutex_t stratum_sock_lock; pthread_mutex_t stratum_work_lock; +static unsigned char pk_script[25] = { 0 }; +static size_t pk_script_size = 0; char *opt_cert; char *opt_proxy; @@ -185,6 +189,7 @@ pthread_mutex_t stats_lock; double thr_hashrates[MAX_GPUS] = { 0 }; uint64_t global_hashrate = 0; double stratum_diff = 0.0; +static char *lp_id; double net_diff = 0; uint64_t net_hashrate = 0; uint64_t net_blocks = 0; @@ -226,8 +231,8 @@ Options:\n\ jackpot Jackpot\n\ keccak Keccak-256 (Maxcoin)\n\ luffa Joincoin\n\ - lyra2 LyraBar\n\ - lyra2v2 VertCoin\n\ + lyra2 Lyra2RE(Crypto)\n\ + lyra2v2 Lyra2REv2(VertCoin)\n\ mjollnir Mjollnircoin\n\ myr-gr Myriad-Groestl\n\ neoscrypt FeatherCoin, Phoenix, UFO...\n\ @@ -256,6 +261,8 @@ Options:\n\ (matching 2nd gt640 in the PC)\n\ -i --intensity=N[,N] GPU intensity 8.0-25.0 (default: auto) \n\ Decimals are allowed for fine tuning \n\ + --eco Use Eco mode\n\ + Auto tuning for low energy (Lyra2REv2 only)\n\ --cuda-schedule Set device threads scheduling mode (default: auto)\n\ -f, --diff-factor Divide difficulty by this factor (default 1.0) \n\ -m, --diff-multiplier Multiply difficulty by this value (default 1.0) \n\ @@ -278,6 +285,8 @@ Options:\n\ long polling is unavailable, in seconds (default: 10)\n\ -n, --ndevs list cuda devices\n\ -N, --statsavg number of samples used to compute hashrate (default: 30)\n\ + --coinbase-addr=ADDR payout address for solo mining\n\ + --no-getwork disable getwork support\n\ --no-gbt disable getblocktemplate support (height check in solo)\n\ --no-longpoll disable X-Long-Polling support\n\ --no-stratum disable X-Stratum support\n\ @@ -329,6 +338,7 @@ struct option options[] = { { "background", 0, NULL, 'B' }, { "benchmark", 0, NULL, 1005 }, { "cert", 1, NULL, 1001 }, + { "coinbase-addr", 1, NULL, 1016 }, { "config", 1, NULL, 'c' }, { "cputest", 0, NULL, 1006 }, { "cpu-affinity", 1, NULL, 1020 }, @@ -341,6 +351,7 @@ struct option options[] = { { "no-color", 0, NULL, 1002 }, { "no-extranonce", 0, NULL, 1012 }, { "no-gbt", 0, NULL, 1011 }, + { "no-getwork", 0, NULL, 1010 }, { "no-longpoll", 0, NULL, 1003 }, { "no-stratum", 0, NULL, 1007 }, { "no-autotune", 0, NULL, 1004 }, // scrypt @@ -394,6 +405,7 @@ struct option options[] = { { "diff-multiplier", 1, NULL, 'm' }, { "diff-factor", 1, NULL, 'f' }, { "diff", 1, NULL, 'f' }, // compat + { "eco", 0, NULL, 1080 }, { 0, 0, 0, 0 } }; @@ -892,7 +904,65 @@ static bool submit_upstream_work(CURL *curl, struct work *work) if (check_dups) hashlog_remember_submit(work, nonce); - } else { + } + else if (work->txs2) + { + + char data_str[2 * sizeof(work->data) + 1]; + char *req; + + for (int i 
= 0; i < ARRAY_SIZE(work->data); i++) + be32enc(work->data + i, work->data[i]); + cbin2hex(data_str, (char *)work->data, 80); + if (work->workid) { + char *params; + val = json_object(); + json_object_set_new(val, "workid", json_string(work->workid)); + params = json_dumps(val, 0); + json_decref(val); + req = (char*)malloc(128 + 2 * 80 + strlen(work->txs2) + strlen(params)); + sprintf(req, + "{\"method\": \"submitblock\", \"params\": [\"%s%s\", %s], \"id\":4}\r\n", + data_str, work->txs2, params); + free(params); + } + else { + req = (char*)malloc(128 + 2 * 80 + strlen(work->txs2)); + sprintf(req, + "{\"method\": \"submitblock\", \"params\": [\"%s%s\"], \"id\":4}\r\n", + data_str, work->txs2); + } + + val = json_rpc_call_pool(curl, pool, req, false, false, NULL); + free(req); + if (unlikely(!val)) { + applog(LOG_ERR, "submit_upstream_work json_rpc_call failed"); + return false; + } + + res = json_object_get(val, "result"); + if (json_is_object(res)) { + char *res_str; + bool sumres = false; + void *iter = json_object_iter(res); + while (iter) { + if (json_is_null(json_object_iter_value(iter))) { + sumres = true; + break; + } + iter = json_object_iter_next(res, iter); + } + res_str = json_dumps(res, 0); + share_result(sumres, work->pooln, work->sharediff, res_str); + free(res_str); + } + else + share_result(json_is_null(res), work->pooln, work->sharediff, json_string_value(res)); + + json_decref(val); + + } + else { int data_size = 128; int adata_sz = data_size / sizeof(uint32_t); @@ -924,6 +994,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work) /* issue JSON-RPC request */ val = json_rpc_call_pool(curl, pool, s, false, false, NULL); + free(str); if (unlikely(!val)) { applog(LOG_ERR, "submit_upstream_work json_rpc_call failed"); return false; @@ -940,12 +1011,15 @@ static bool submit_upstream_work(CURL *curl, struct work *work) json_decref(val); - free(str); } return true; } +#ifndef ORG +#define BLOCK_VERSION_CURRENT 7 +#endif + /* simplified method to only get some extra infos in solo mode */ static bool gbt_work_decode(const json_t *val, struct work *work) { @@ -985,8 +1059,311 @@ static bool gbt_work_decode(const json_t *val, struct work *work) return true; } +#ifndef ORG +int varint_encode(unsigned char *p, uint64_t n) +{ + int i; + if (n < 0xfd) { + p[0] = (uchar)n; + return 1; + } + if (n <= 0xffff) { + p[0] = 0xfd; + p[1] = n & 0xff; + p[2] = (uchar)(n >> 8); + return 3; + } + if (n <= 0xffffffff) { + p[0] = 0xfe; + for (i = 1; i < 5; i++) { + p[i] = n & 0xff; + n >>= 8; + } + return 5; + } + p[0] = 0xff; + for (i = 1; i < 9; i++) { + p[i] = n & 0xff; + n >>= 8; + } + return 9; +} + +static bool gbt_work_decode_full(const json_t *val, struct work *work) +{ + int i, n; + uint32_t version, curtime, bits; + uint32_t prevhash[8]; + uint32_t target[8]; + int cbtx_size; + uchar *cbtx = NULL; + int tx_count, tx_size; + uchar txc_vi[9]; + uchar(*merkle_tree)[32] = NULL; + bool coinbase_append = false; + bool submit_coinbase = false; + bool version_force = false; + bool version_reduce = false; + json_t *tmp, *txa; + bool rc = false; + + tmp = json_object_get(val, "mutable"); + if (tmp && json_is_array(tmp)) { + n = (int)json_array_size(tmp); + for (i = 0; i < n; i++) { + const char *s = json_string_value(json_array_get(tmp, i)); + if (!s) + continue; + if (!strcmp(s, "coinbase/append")) + coinbase_append = true; + else if (!strcmp(s, "submit/coinbase")) + submit_coinbase = true; + else if (!strcmp(s, "version/force")) + version_force = true; + else if (!strcmp(s, 
"version/reduce")) + version_reduce = true; + } + } + + tmp = json_object_get(val, "height"); + if (!tmp || !json_is_integer(tmp)) { + applog(LOG_ERR, "JSON invalid height"); + goto out; + } + work->height = (int)json_integer_value(tmp); + applog(LOG_BLUE, "Current block is %d", work->height); + + tmp = json_object_get(val, "version"); + if (!tmp || !json_is_integer(tmp)) { + applog(LOG_ERR, "JSON invalid version"); + goto out; + } + version = (uint32_t)json_integer_value(tmp); + if ((version & 0xffU) > BLOCK_VERSION_CURRENT) { + if (version_reduce) { + version = (version & ~0xffU) | BLOCK_VERSION_CURRENT; + } + else if (allow_gbt && allow_getwork && !version_force) { + applog(LOG_DEBUG, "Switching to getwork, gbt version %d", version); + allow_gbt = false; + goto out; + } + else if (!version_force) { + applog(LOG_ERR, "Unrecognized block version: %u", version); + goto out; + } + } + + if (unlikely(!jobj_binary(val, "previousblockhash", prevhash, sizeof(prevhash)))) { + applog(LOG_ERR, "JSON invalid previousblockhash"); + goto out; + } + + tmp = json_object_get(val, "curtime"); + if (!tmp || !json_is_integer(tmp)) { + applog(LOG_ERR, "JSON invalid curtime"); + goto out; + } + curtime = (uint32_t)json_integer_value(tmp); + + if (unlikely(!jobj_binary(val, "bits", &bits, sizeof(bits)))) { + applog(LOG_ERR, "JSON invalid bits"); + goto out; + } + + /* find count and size of transactions */ + txa = json_object_get(val, "transactions"); + if (!txa || !json_is_array(txa)) { + applog(LOG_ERR, "JSON invalid transactions"); + goto out; + } + tx_count = (int)json_array_size(txa); + tx_size = 0; + for (i = 0; i < tx_count; i++) { + const json_t *tx = json_array_get(txa, i); + const char *tx_hex = json_string_value(json_object_get(tx, "data")); + if (!tx_hex) { + applog(LOG_ERR, "JSON invalid transactions"); + goto out; + } + tx_size += (int)(strlen(tx_hex) / 2); + } + + /* build coinbase transaction */ + tmp = json_object_get(val, "coinbasetxn"); + if (tmp) { + const char *cbtx_hex = json_string_value(json_object_get(tmp, "data")); + cbtx_size = cbtx_hex ? (int)strlen(cbtx_hex) / 2 : 0; + cbtx = (uchar*)malloc(cbtx_size + 100); + if (cbtx_size < 60 || !hex2bin(cbtx, cbtx_hex, cbtx_size)) { + applog(LOG_ERR, "JSON invalid coinbasetxn"); + goto out; + } + } + else { + int64_t cbvalue; + if (!pk_script_size) { + if (allow_getwork) { + applog(LOG_INFO, "No payout address provided, switching to getwork"); + allow_gbt = false; + } + else + applog(LOG_ERR, "No payout address provided"); + goto out; + } + tmp = json_object_get(val, "coinbasevalue"); + if (!tmp || !json_is_number(tmp)) { + applog(LOG_ERR, "JSON invalid coinbasevalue"); + goto out; + } + cbvalue = (int64_t)(json_is_integer(tmp) ? 
json_integer_value(tmp) : json_number_value(tmp)); + cbtx = (uchar*)malloc(256); + le32enc((uint32_t *)cbtx, 1); /* version */ + cbtx[4] = 1; /* in-counter */ + memset(cbtx + 5, 0x00, 32); /* prev txout hash */ + le32enc((uint32_t *)(cbtx + 37), 0xffffffff); /* prev txout index */ + cbtx_size = 43; + /* BIP 34: height in coinbase */ + for (n = work->height; n; n >>= 8) + cbtx[cbtx_size++] = n & 0xff; + cbtx[42] = cbtx_size - 43; + cbtx[41] = cbtx_size - 42; /* scriptsig length */ + le32enc((uint32_t *)(cbtx + cbtx_size), 0xffffffff); /* sequence */ + cbtx_size += 4; + cbtx[cbtx_size++] = 1; /* out-counter */ + le32enc((uint32_t *)(cbtx + cbtx_size), (uint32_t)cbvalue); /* value */ + le32enc((uint32_t *)(cbtx + cbtx_size + 4), cbvalue >> 32); + cbtx_size += 8; + cbtx[cbtx_size++] = (uint8_t)pk_script_size; /* txout-script length */ + memcpy(cbtx + cbtx_size, pk_script, pk_script_size); + cbtx_size += (int)pk_script_size; + le32enc((uint32_t *)(cbtx + cbtx_size), 0); /* lock time */ + cbtx_size += 4; + coinbase_append = true; + } + if (coinbase_append) { + unsigned char xsig[100]; + int xsig_len = 0; + tmp = json_object_get(val, "coinbaseaux"); + if (tmp && json_is_object(tmp)) { + void *iter = json_object_iter(tmp); + while (iter) { + unsigned char buf[100]; + const char *s = json_string_value(json_object_iter_value(iter)); + n = s ? (int)(strlen(s) / 2) : 0; + if (!s || n > 100 || !hex2bin(buf, s, n)) { + applog(LOG_ERR, "JSON invalid coinbaseaux"); + break; + } + if (cbtx[41] + xsig_len + n <= 100) { + memcpy(xsig + xsig_len, buf, n); + xsig_len += n; + } + iter = json_object_iter_next(tmp, iter); + } + } + if (xsig_len) { + unsigned char *ssig_end = cbtx + 42 + cbtx[41]; + int push_len = cbtx[41] + xsig_len < 76 ? 1 : + cbtx[41] + 2 + xsig_len > 100 ? 0 : 2; + n = xsig_len + push_len; + memmove(ssig_end + n, ssig_end, cbtx_size - 42 - cbtx[41]); + cbtx[41] += n; + if (push_len == 2) + *(ssig_end++) = 0x4c; /* OP_PUSHDATA1 */ + if (push_len) + *(ssig_end++) = xsig_len; + memcpy(ssig_end, xsig, xsig_len); + cbtx_size += n; + } + } + + n = varint_encode(txc_vi, 1 + tx_count); + + work->txs2 = (char*)malloc(2 * (n + cbtx_size + tx_size) + 1); + cbin2hex(work->txs2, (char *)txc_vi, n); + cbin2hex(work->txs2 + 2 * n, (char *)cbtx, cbtx_size); + + /* generate merkle root */ + merkle_tree = (uchar(*)[32]) calloc(((1 + tx_count + 1) & ~1), 32); + sha256d(merkle_tree[0], cbtx, cbtx_size); + + for (i = 0; i < tx_count; i++) { + tmp = json_array_get(txa, i); + const char *tx_hex = json_string_value(json_object_get(tmp, "data")); + const int tx_size = tx_hex ? 
(int)(strlen(tx_hex) / 2) : 0; + unsigned char *tx = (uchar*)malloc(tx_size); + if (!tx_hex || !hex2bin(tx, tx_hex, tx_size)) { + applog(LOG_ERR, "JSON invalid transactions"); + free(tx); + goto out; + } + sha256d(merkle_tree[1 + i], tx, tx_size); + if (!submit_coinbase) + strcat(work->txs2, tx_hex); + } + n = 1 + tx_count; + while (n > 1) { + if (n % 2) { + memcpy(merkle_tree[n], merkle_tree[n - 1], 32); + ++n; + } + n /= 2; + for (i = 0; i < n; i++) + sha256d(merkle_tree[i], merkle_tree[2 * i], 64); + } + + /* assemble block header */ + work->data[0] = swab32(version); + for (i = 0; i < 8; i++) + work->data[8 - i] = le32dec(prevhash + i); + for (i = 0; i < 8; i++) + work->data[9 + i] = be32dec((uint32_t *)merkle_tree[0] + i); + work->data[17] = swab32(curtime); + work->data[18] = le32dec(&bits); + memset(work->data + 19, 0x00, 52); + work->data[20] = 0x80000000; + work->data[31] = 0x00000280; + + if (unlikely(!jobj_binary(val, "target", target, sizeof(target)))) { + applog(LOG_ERR, "JSON invalid target"); + goto out; + } + for (i = 0; i < ARRAY_SIZE(work->target); i++) + work->target[7 - i] = be32dec(target + i); + tmp = json_object_get(val, "workid"); + if (tmp) { + if (!json_is_string(tmp)) { + applog(LOG_ERR, "JSON invalid workid"); + goto out; + } + work->workid = strdup(json_string_value(tmp)); + } + + rc = true; +out: + /* Long polling */ + tmp = json_object_get(val, "longpollid"); + if (want_longpoll && json_is_string(tmp)) { + free(lp_id); + lp_id = strdup(json_string_value(tmp)); + if (!have_longpoll) { + char *lp_uri; + tmp = json_object_get(val, "longpolluri"); + lp_uri = json_is_string(tmp) ? strdup(json_string_value(tmp)) : rpc_url; + have_longpoll = true; + tq_push(thr_info[longpoll_thr_id].q, lp_uri); + } + } + + free(merkle_tree); + free(cbtx); + return rc; +} +#endif + #define GBT_CAPABILITIES "[\"coinbasetxn\", \"coinbasevalue\", \"longpoll\", \"workid\"]" -static const char *gbt_req = +static const char *gbt_req_ = "{\"method\": \"getblocktemplate\", \"params\": [{" // "\"capabilities\": " GBT_CAPABILITIES "" "}], \"id\":9}\r\n"; @@ -998,7 +1375,7 @@ static bool get_blocktemplate(CURL *curl, struct work *work) return false; int curl_err = 0; - json_t *val = json_rpc_call_pool(curl, pool, gbt_req, false, false, &curl_err); + json_t *val = json_rpc_call_pool(curl, pool, gbt_req_, false, false, &curl_err); if (!val && curl_err == -1) { // when getblocktemplate is not supported, disable it @@ -1068,8 +1445,19 @@ static bool get_mininginfo(CURL *curl, struct work *work) return true; } +#ifdef ORG static const char *rpc_req = "{\"method\": \"getwork\", \"params\": [], \"id\":0}\r\n"; +#else +static const char *getwork_req = +"{\"method\": \"getwork\", \"params\": [], \"id\":0}\r\n"; +static const char *gbt_req = +"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": " +GBT_CAPABILITIES "}], \"id\":0}\r\n"; +#endif +static const char *gbt_lp_req = +"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": " +GBT_CAPABILITIES ", \"longpollid\": \"%s\"}], \"id\":0}\r\n"; static bool get_upstream_work(CURL *curl, struct work *work) { @@ -1082,9 +1470,18 @@ static bool get_upstream_work(CURL *curl, struct work *work) applog(LOG_DEBUG, "%s: want_longpoll=%d have_longpoll=%d", __func__, want_longpoll, have_longpoll); +#ifndef ORG + int err; +start: +#endif gettimeofday(&tv_start, NULL); /* want_longpoll/have_longpoll required here to init/unlock the lp thread */ +#ifdef ORG val = json_rpc_call_pool(curl, pool, rpc_req, want_longpoll, have_longpoll, NULL); 
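	/*
	 * Sketch of the retry flow implemented by the non-ORG branch below (simplified,
	 * for illustration only; identifiers are the ones defined earlier in this file):
	 *
	 *   val = json_rpc_call_pool(curl, pool, allow_gbt ? gbt_req : getwork_req,
	 *                            want_longpoll, have_longpoll, &err);
	 *   if (allow_gbt && allow_getwork && !val && err == CURLE_OK) {
	 *       allow_gbt = false;   // pool answered but rejected getblocktemplate
	 *       goto start;          // retry the same pool over plain getwork
	 *   }
	 *
	 * gbt_work_decode_full() can also clear allow_gbt (unrecognized block version,
	 * or no --coinbase-addr supplied), which triggers the same getwork fallback.
	 */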
+#else + val = json_rpc_call_pool(curl, pool, allow_gbt ? gbt_req : getwork_req, want_longpoll, have_longpoll, &err); + +#endif gettimeofday(&tv_end, NULL); if (have_stratum || unlikely(work->pooln != cur_pooln)) { @@ -1093,10 +1490,39 @@ static bool get_upstream_work(CURL *curl, struct work *work) return false; } +#ifndef ORG + if (!allow_gbt && !allow_getwork) { + applog(LOG_ERR, "No usable protocol"); + if (val) + json_decref(val); + return false; + } + + if (allow_gbt && allow_getwork && !val && err == CURLE_OK) { + applog(LOG_NOTICE, "getblocktemplate failed, falling back to getwork"); + allow_gbt = false; + goto start; + } + +#endif + if (!val) return false; - rc = work_decode(json_object_get(val, "result"), work); +#ifndef ORG + if (allow_gbt) { + rc = gbt_work_decode_full(json_object_get(val, "result"), work); + if (!allow_gbt) { + json_decref(val); + goto start; + } + } + else { +#endif + rc = work_decode(json_object_get(val, "result"), work); +#ifndef ORG + } +#endif if (opt_protocol && rc) { timeval_subtract(&diff, &tv_end, &tv_start); @@ -1393,7 +1819,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) else sha256d(merkle_root, merkle_root, 64); } - + /* Increment extranonce2 */ for (i = 0; i < (int)sctx->xnonce2_size && !++sctx->job.xnonce2[i]; i++); @@ -1720,8 +2146,8 @@ static void *miner_thread(void *userdata) #endif memcpy(&work, &g_work, sizeof(struct work)); nonceptr[0] = (UINT32_MAX / opt_n_threads) * thr_id; // 0 if single thr - } else - nonceptr[0]++; //?? + } + //else nonceptr[0]++; //?? if (opt_algo == ALGO_DECRED) { // suprnova job_id check without data/target/height change... @@ -2136,10 +2562,15 @@ static void *miner_thread(void *userdata) } } - if (rc > 0) + + +/* if (rc > 0) work.scanned_to = work.nonces[0]; if (rc > 1) work.scanned_to = max(work.nonces[0], work.nonces[1]); +*/ + if (rc > 0) + work.scanned_to = start_nonce + hashes_done; else { work.scanned_to = max_nonce; if (opt_debug && opt_benchmark) { @@ -2209,6 +2640,7 @@ static void *miner_thread(void *userdata) break; } } + nonceptr[0] = start_nonce + hashes_done; } out: @@ -2278,6 +2710,7 @@ longpoll_retry: while (!abort_flag) { json_t *val = NULL, *soval; + char *req = NULL; int err = 0; if (opt_debug_threads) @@ -2288,7 +2721,12 @@ longpoll_retry: if (switchn != pool_switch_count) goto need_reinit; - val = json_rpc_longpoll(curl, lp_url, pool, rpc_req, &err); + if (allow_gbt) { + req = (char*)malloc(strlen(gbt_lp_req) + strlen(lp_id) + 1); + sprintf(req, gbt_lp_req, lp_id); + } + val = json_rpc_longpoll(curl, lp_url, pool, req ? 
req : getwork_req, &err); + if (allow_gbt) free(req); if (have_stratum || switchn != pool_switch_count) { if (val) json_decref(val); @@ -2486,7 +2924,7 @@ wait_stratum_url: } pthread_mutex_unlock(&g_work_lock); } - + // check we are on the right pool if (switchn != pool_switch_count) goto pool_switched; @@ -2552,6 +2990,109 @@ static void show_usage_and_exit(int status) } proper_exit(status); } +static const char b58digits[] = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"; + +static bool b58dec(unsigned char *bin, size_t binsz, const char *b58) +{ + size_t i, j; + uint64_t t; + uint32_t c; + uint32_t *outi; + size_t outisz = (binsz + 3) / 4; + int rem = binsz % 4; + uint32_t remmask = 0xffffffff << (8 * rem); + size_t b58sz = strlen(b58); + bool rc = false; + + outi = (uint32_t *)calloc(outisz, sizeof(*outi)); + + for (i = 0; i < b58sz; ++i) { + for (c = 0; b58digits[c] != b58[i]; c++) + if (!b58digits[c]) + goto out; + for (j = outisz; j--;) { + t = (uint64_t)outi[j] * 58 + c; + c = t >> 32; + outi[j] = t & 0xffffffff; + } + if (c || outi[0] & remmask) + goto out; + } + + j = 0; + switch (rem) { + case 3: + *(bin++) = (outi[0] >> 16) & 0xff; + case 2: + *(bin++) = (outi[0] >> 8) & 0xff; + case 1: + *(bin++) = outi[0] & 0xff; + ++j; + default: + break; + } + for (; j < outisz; ++j) { + be32enc((uint32_t *)bin, outi[j]); + bin += sizeof(uint32_t); + } + + rc = true; +out: + free(outi); + return rc; +} + +static int b58check(unsigned char *bin, size_t binsz, const char *b58) +{ + unsigned char buf[32]; + int i; + + sha256d(buf, bin, (int)(binsz - 4)); + if (memcmp(&bin[binsz - 4], buf, 4)) + return -1; + + /* Check number of zeros is correct AFTER verifying checksum + * (to avoid possibility of accessing the string beyond the end) */ + for (i = 0; bin[i] == '\0' && b58[i] == '1'; ++i); + if (bin[i] == '\0' || b58[i] == '1') + return -3; + + return bin[0]; +} + +size_t address_to_script(unsigned char *out, size_t outsz, const char *addr) +{ + unsigned char addrbin[25]; + int addrver; + size_t rv; + + if (!b58dec(addrbin, sizeof(addrbin), addr)) + return 0; + addrver = b58check(addrbin, sizeof(addrbin), addr); + if (addrver < 0) + return 0; + switch (addrver) { + case 5: /* Bitcoin script hash */ + case 196: /* Testnet script hash */ + if (outsz < (rv = 23)) + return rv; + out[0] = 0xa9; /* OP_HASH160 */ + out[1] = 0x14; /* push 20 bytes */ + memcpy(&out[2], &addrbin[1], 20); + out[22] = 0x87; /* OP_EQUAL */ + return rv; + default: + if (outsz < (rv = 25)) + return rv; + out[0] = 0x76; /* OP_DUP */ + out[1] = 0xa9; /* OP_HASH160 */ + out[2] = 0x14; /* push 20 bytes */ + memcpy(&out[3], &addrbin[1], 20); + out[23] = 0x88; /* OP_EQUALVERIFY */ + out[24] = 0xac; /* OP_CHECKSIG */ + return rv; + } +} void parse_arg(int key, char *arg) { @@ -2611,6 +3152,9 @@ void parse_arg(int key, char *arg) case 1030: /* --api-remote */ opt_api_remote = 1; break; + case 1080: + opt_eco_mode = true; + break; case 'B': opt_background = true; break; @@ -2946,9 +3490,19 @@ void parse_arg(int key, char *arg) case 1009: opt_shares_limit = atoi(arg); break; + case 1010: + allow_getwork = false; + break; case 1011: allow_gbt = false; break; + case 1016: /* --coinbase-addr */ + pk_script_size = address_to_script(pk_script, sizeof(pk_script), arg); + if (!pk_script_size) { + fprintf(stderr, "invalid address -- '%s'\n", arg); + show_usage_and_exit(1); + } + break; case 1012: opt_extranonce = false; break; @@ -3186,7 +3740,7 @@ static void parse_cmdline(int argc, char *argv[]) show_usage_and_exit(1); } - if 
(opt_algo == ALGO_DECRED && opt_vote == 9999) { + if (opt_vote == 9999) { opt_vote = 0; // default, don't vote } } diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 10d32f2..7b79951 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -41,10 +41,7 @@ false - - - - + @@ -83,10 +80,10 @@ false true - 80 + 255 true true - compute_50,sm_50 + compute_50,sm_50;compute_52,sm_52;compute_35,sm_35;compute_30,sm_30;compute_20,sm_20 $(NVTOOLSEXT_PATH)\include;..\..\..\Common\C99 @@ -115,15 +112,16 @@ false true - 80 + 255 true true - compute_50,sm_50 + compute_50,sm_50;compute_52,sm_52;compute_35,sm_35;compute_30,sm_30;compute_20,sm_20 $(NVTOOLSEXT_PATH)\include;..\..\..\Common\C99 64 false + O3 @@ -158,16 +156,16 @@ false - 80 + 255 true true - compute_50,sm_50;compute_52,sm_52;compute_30,sm_30;compute_20,sm_21 + compute_50,sm_50;compute_52,sm_52;compute_35,sm_35;compute_30,sm_30;compute_20,sm_20 --ptxas-options="-O2" %(AdditionalOptions) - O2 + O3 false - O3 + O2 @@ -201,10 +199,10 @@ false - 80 + 255 true true - compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30;compute_20,sm_21 + compute_50,sm_50;compute_52,sm_52;compute_35,sm_35;compute_30,sm_30;compute_20,sm_20 $(NVTOOLSEXT_PATH)\include;..\..\..\Common\C99 O3 64 @@ -250,6 +248,7 @@ + @@ -347,7 +346,6 @@ - @@ -527,10 +525,7 @@ - - - - + @@ -540,4 +535,4 @@ - + \ No newline at end of file diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index ed942f3..c13b63f 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -437,9 +437,6 @@ Header Files - - Source Files\CUDA\lyra2 - Source Files\CUDA\lyra2 @@ -455,6 +452,9 @@ Source Files\CUDA\x11 + + Source Files\CUDA\lyra2 + @@ -728,4 +728,4 @@ Ressources - + \ No newline at end of file diff --git a/configure.ac b/configure.ac index aec2fe6..4381840 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [1.7.6], [], [ccminer], [http://github.com/tpruvot/ccminer]) +AC_INIT([ccminer], [1.7.6-r10], [], [ccminer], [http://github.com/tpruvot/ccminer]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cuda_helper.h b/cuda_helper.h index 1358892..2b0bd73 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -96,7 +96,6 @@ __device__ __forceinline__ uint64_t REPLACE_LODWORD(const uint64_t &x, const uin return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y); } -// Endian Drehung für 32 Bit Typen #ifdef __CUDA_ARCH__ __device__ __forceinline__ uint32_t cuda_swab32(uint32_t x) { @@ -471,6 +470,15 @@ static __host__ __device__ __forceinline__ uint64_t devectorize(uint2 v) { #endif } +static __device__ __forceinline__ uint2 eorswap32(uint2 u, uint2 v) +{ + uint2 result; + result.y = u.x ^ v.x; + result.x = u.y ^ v.y; + return result; +} + + /** * uint2 direct ops by c++ operator definitions */ @@ -561,11 +569,9 @@ uint2 ROR2(const uint2 a, const int offset) return result; } -__device__ __forceinline__ -uint2 ROL2(const uint2 a, const int offset) -{ +#if __CUDA_ARCH__ >= 350 +__inline__ __device__ uint2 ROL2(const uint2 a, const int offset) { uint2 result; -#if __CUDA_ARCH__ > 300 if (offset >= 32) { asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset)); asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); @@ -574,14 +580,20 @@ uint2 ROL2(const uint2 a, const int offset) asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); } + return result; +} #else - if 
(!offset) - result = a; +__inline__ __device__ uint2 ROL2(const uint2 v, const int n) +{ + uint2 result; + if (!n) + result = v; else - result = ROR2(a, 64 - offset); -#endif + result = ROR2(v, 64 - n); + return result; } +#endif __device__ __forceinline__ uint2 SWAPUINT2(uint2 value) diff --git a/lyra2/cuda_lyra2.cu b/lyra2/cuda_lyra2.cu index 88d4bce..67e81fc 100644 --- a/lyra2/cuda_lyra2.cu +++ b/lyra2/cuda_lyra2.cu @@ -1,41 +1,211 @@ /** - * Lyra2 (v1) cuda implementation based on djm34 work - SM 5/5.2 - * tpruvot@github 2015 - */ +* Lyra2 (v1) cuda implementation based on djm34 work - SM 5/5.2 +* tpruvot@github 2015 +*/ #include #include -#define TPB50 16 -#define TPB52 8 +#define TPB52 32 #include "cuda_lyra2_sm2.cuh" +#include "cuda_lyra2_sm5.cuh" #ifdef __INTELLISENSE__ /* just for vstudio code colors */ -#define __CUDA_ARCH__ 500 +#define __CUDA_ARCH__ 520 #endif -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 500 +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ > 500 -#include "cuda_vector_uint2x4.h" +#include "cuda_lyra2_vectors.h" -#define memshift 3 +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +__device__ uint32_t __shfl(uint32_t a, uint32_t b, uint32_t c); +#endif +#define Nrow 8 #define Ncol 8 -#define NcolMask 0x7 +#define memshift 3 + +#define BUF_COUNT 0 + +__device__ uint2 *DMatrix; + +__device__ __forceinline__ void LD4S(uint2 res[3], const int row, const int col, const int thread, const int threads) +{ +#if BUF_COUNT != 8 + extern __shared__ uint2 shared_mem[]; + const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift; +#endif +#if BUF_COUNT != 0 + const int d0 = (memshift *(Ncol * row + col) * threads + thread)*blockDim.x + threadIdx.x; +#endif + +#if BUF_COUNT == 8 +#pragma unroll + for (int j = 0; j < 3; j++) + res[j] = *(DMatrix + d0 + j * threads * blockDim.x); +#elif BUF_COUNT == 0 +#pragma unroll + for (int j = 0; j < 3; j++) + res[j] = shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x]; +#else + if (row < BUF_COUNT) + { +#pragma unroll + for (int j = 0; j < 3; j++) + res[j] = *(DMatrix + d0 + j * threads * blockDim.x); + } + else + { +#pragma unroll + for (int j = 0; j < 3; j++) + res[j] = shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x]; + } +#endif +} -__device__ uint2x4* DMatrix; +__device__ __forceinline__ void ST4S(const int row, const int col, const uint2 data[3], const int thread, const int threads) +{ +#if BUF_COUNT != 8 + extern __shared__ uint2 shared_mem[]; + const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift; +#endif +#if BUF_COUNT != 0 + const int d0 = (memshift *(Ncol * row + col) * threads + thread)*blockDim.x + threadIdx.x; +#endif + +#if BUF_COUNT == 8 +#pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + d0 + j * threads * blockDim.x) = data[j]; +#elif BUF_COUNT == 0 +#pragma unroll + for (int j = 0; j < 3; j++) + shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data[j]; +#else + if (row < BUF_COUNT) + { +#pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + d0 + j * threads * blockDim.x) = data[j]; + } + else + { +#pragma unroll + for (int j = 0; j < 3; j++) + shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data[j]; + } +#endif +} + +#if __CUDA_ARCH__ >= 300 +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + return __shfl(a, b, c); +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + return make_uint2(__shfl(a.x, b, c), 
__shfl(a.y, b, c)); +} + +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + a1 = WarpShuffle(a1, b1, c); + a2 = WarpShuffle(a2, b2, c); + a3 = WarpShuffle(a3, b3, c); +} + +#else +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + uint32_t *_ptr = (uint32_t*)shared_mem; + + __threadfence_block(); + uint32_t buf = _ptr[thread]; + + _ptr[thread] = a; + __threadfence_block(); + uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + _ptr[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a; + __threadfence_block(); + uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a1; + __threadfence_block(); + a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a2; + __threadfence_block(); + a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a3; + __threadfence_block(); + a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + __threadfence_block(); +} + +#endif static __device__ __forceinline__ void Gfunc(uint2 &a, uint2 &b, uint2 &c, uint2 &d) { - a += b; d ^= a; d = SWAPUINT2(d); - c += d; b ^= c; b = ROR2(b, 24); - a += b; d ^= a; d = ROR2(d, 16); + a += b; d = eorswap32(a, d); + c += d; b ^= c; b = ROR24(b); + a += b; d ^= a; d = ROR16(d); c += d; b ^= c; b = ROR2(b, 63); } +__device__ __forceinline__ void round_lyra(uint2 s[4]) +{ + Gfunc(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3, 4); + Gfunc(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1, 4); +} + static __device__ __forceinline__ void round_lyra(uint2x4* s) { @@ -50,21 +220,24 @@ void round_lyra(uint2x4* s) } static __device__ __forceinline__ -void reduceDuplex(uint2x4 state[4], uint32_t thread) +void reduceDuplex(uint2 state[4], uint32_t thread, const uint32_t threads) { - uint2x4 state1[3]; - - const uint32_t ps1 = (256 * thread); - const uint32_t ps2 = (memshift * 7 + memshift * 8 + 256 * thread); + uint2 state1[3]; - #pragma unroll 4 - for (int i = 0; i < 8; i++) +#if __CUDA_ARCH__ > 500 +#pragma unroll +#endif + for (int i = 0; i < Nrow; i++) { - const uint32_t s1 = ps1 + i*memshift; - const uint32_t s2 = ps2 - i*memshift; + ST4S(0, Ncol - i - 1, state, thread, threads); - for (int j = 0; j < 3; j++) - state1[j] = __ldg4(&(DMatrix+s1)[j]); + round_lyra(state); + } + +#pragma unroll 4 + for (int i = 0; i < Nrow; i++) + { + LD4S(state1, 0, i, thread, threads); for (int j = 0; j < 3; j++) state[j] ^= 
state1[j]; @@ -72,208 +245,342 @@ void reduceDuplex(uint2x4 state[4], uint32_t thread) for (int j = 0; j < 3; j++) state1[j] ^= state[j]; - for (int j = 0; j < 3; j++) - (DMatrix + s2)[j] = state1[j]; + ST4S(1, Ncol - i - 1, state1, thread, threads); } } static __device__ __forceinline__ -void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2x4 state[4], uint32_t thread) +void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], uint32_t thread, const uint32_t threads) { - uint2x4 state1[3], state2[3]; - - const uint32_t ps1 = ( memshift*8 * rowIn + 256 * thread); - const uint32_t ps2 = ( memshift*8 * rowInOut + 256 * thread); - const uint32_t ps3 = (memshift*7 + memshift*8 * rowOut + 256 * thread); + uint2 state1[3], state2[3]; - #pragma unroll 1 - for (int i = 0; i < 8; i++) +#pragma unroll 1 + for (int i = 0; i < Nrow; i++) { - const uint32_t s1 = ps1 + i*memshift; - const uint32_t s2 = ps2 + i*memshift; + LD4S(state1, rowIn, i, thread, threads); + LD4S(state2, rowInOut, i, thread, threads); for (int j = 0; j < 3; j++) - state1[j]= __ldg4(&(DMatrix + s1)[j]); + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + +#pragma unroll for (int j = 0; j < 3; j++) - state2[j]= __ldg4(&(DMatrix + s2)[j]); - for (int j = 0; j < 3; j++) { - uint2x4 tmp = state1[j] + state2[j]; - state[j] ^= tmp; + state1[j] ^= state[j]; + + ST4S(rowOut, Ncol - i - 1, state1, thread, threads); + + //一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + ST4S(rowInOut, i, state2, thread, threads); + } +} + +static __device__ __forceinline__ +void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads) +{ + for (int i = 0; i < Nrow; i++) + { + uint2 state1[3], state2[3]; + + LD4S(state1, rowIn, i, thread, threads); + LD4S(state2, rowInOut, i, thread, threads); + +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; round_lyra(state); - for (int j = 0; j < 3; j++) { - const uint32_t s3 = ps3 - i*memshift; - state1[j] ^= state[j]; - (DMatrix + s3)[j] = state1[j]; + //一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; } - ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + ST4S(rowInOut, i, state2, thread, threads); - for (int j = 0; j < 11; j++) - ((uint2*)state2)[j+1] ^= ((uint2*)state)[j]; + LD4S(state1, rowOut, i, thread, threads); +#pragma unroll for (int j = 0; j < 3; j++) - (DMatrix + s2)[j] = state2[j]; + state1[j] ^= state[j]; + + ST4S(rowOut, i, state1, thread, threads); } } static __device__ __forceinline__ -void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uint2x4* state, const uint32_t thread) +void reduceDuplexRowt_8(const int rowInOut, 
uint2* state, const uint32_t thread, const uint32_t threads) { - const uint32_t ps1 = (memshift * 8 * rowIn + 256 * thread); - const uint32_t ps2 = (memshift * 8 * rowInOut + 256 * thread); - const uint32_t ps3 = (memshift * 8 * rowOut + 256 * thread); - #pragma unroll 1 - for (int i = 0; i < 8; i++) + uint2 state1[3], state2[3], last[3]; + + LD4S(state1, 2, 0, thread, threads); + LD4S(last, rowInOut, 0, thread, threads); + +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + last[j]; + + round_lyra(state); + + //一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + last[0] ^= Data2; + last[1] ^= Data0; + last[2] ^= Data1; + } + else { - uint2x4 state1[3], state2[3]; + last[0] ^= Data0; + last[1] ^= Data1; + last[2] ^= Data2; + } - const uint32_t s1 = ps1 + i*memshift; - const uint32_t s2 = ps2 + i*memshift; + if (rowInOut == 5) + { +#pragma unroll + for (int j = 0; j < 3; j++) + last[j] ^= state[j]; + } - for (int j = 0; j < 3; j++) { - state1[j] = __ldg4(&(DMatrix + s1)[j]); - state2[j] = __ldg4(&(DMatrix + s2)[j]); - } + for (int i = 1; i < Nrow; i++) + { + LD4S(state1, 2, i, thread, threads); + LD4S(state2, rowInOut, i, thread, threads); - #pragma unroll - for (int j = 0; j < 3; j++) { - state1[j] += state2[j]; - state[j] ^= state1[j]; - } +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; round_lyra(state); - - ((uint2*)state2)[0] ^= ((uint2*)state)[11]; - - for (int j = 0; j < 11; j++) - ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; - - if (rowInOut == rowOut) { - for (int j = 0; j < 3; j++) { - state2[j] ^= state[j]; - (DMatrix + s2)[j]=state2[j]; - } - } else { - const uint32_t s3 = ps3 + i*memshift; - for (int j = 0; j < 3; j++) { - (DMatrix + s2)[j] = state2[j]; - (DMatrix + s3)[j] ^= state[j]; - } - } } + + +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= last[j]; } -#if __CUDA_ARCH__ == 500 -__global__ __launch_bounds__(TPB50, 1) -#else -__global__ __launch_bounds__(TPB52, 2) -#endif -void lyra2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +__constant__ uint2x4 blake2b_IV[2] = { + 0xf3bcc908lu, 0x6a09e667lu, + 0x84caa73blu, 0xbb67ae85lu, + 0xfe94f82blu, 0x3c6ef372lu, + 0x5f1d36f1lu, 0xa54ff53alu, + 0xade682d1lu, 0x510e527flu, + 0x2b3e6c1flu, 0x9b05688clu, + 0xfb41bd6blu, 0x1f83d9ablu, + 0x137e2179lu, 0x5be0cd19lu +}; + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - const uint2x4 blake2b_IV[2] = { - {{ 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a }}, - {{ 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 }} - }; - if (thread < threads) { uint2x4 state[4]; - ((uint2*)state)[0] = __ldg(&g_hash[thread]); - ((uint2*)state)[1] = __ldg(&g_hash[thread + threads]); - ((uint2*)state)[2] = __ldg(&g_hash[thread + threads*2]); - ((uint2*)state)[3] = __ldg(&g_hash[thread + threads*3]); - - state[1] = state[0]; + state[0].x = state[1].x = __ldg(&g_hash[thread + threads * 0]); + state[0].y = state[1].y = __ldg(&g_hash[thread + threads * 1]); + state[0].z = state[1].z = __ldg(&g_hash[thread + threads * 2]); + state[0].w = 
state[1].w = __ldg(&g_hash[thread + threads * 3]); state[2] = blake2b_IV[0]; state[3] = blake2b_IV[1]; for (int i = 0; i<24; i++) round_lyra(state); //because 12 is not enough - const uint32_t ps1 = (memshift * 7 + 256 * thread); - for (int i = 0; i < 8; i++) - { - const uint32_t s1 = ps1 - memshift * i; - for (int j = 0; j < 3; j++) - (DMatrix + s1)[j] = (state)[j]; - round_lyra(state); - } + ((uint2x4*)DMatrix)[threads * 0 + thread] = state[0]; + ((uint2x4*)DMatrix)[threads * 1 + thread] = state[1]; + ((uint2x4*)DMatrix)[threads * 2 + thread] = state[2]; + ((uint2x4*)DMatrix)[threads * 3 + thread] = state[3]; + } +} - reduceDuplex(state, thread); - - reduceDuplexRowSetup(1, 0, 2, state, thread); - reduceDuplexRowSetup(2, 1, 3, state, thread); - reduceDuplexRowSetup(3, 0, 4, state, thread); - reduceDuplexRowSetup(4, 3, 5, state, thread); - reduceDuplexRowSetup(5, 2, 6, state, thread); - reduceDuplexRowSetup(6, 1, 7, state, thread); - - uint32_t rowa = state[0].x.x & 7; - reduceDuplexRowt(7, rowa, 0, state, thread); - rowa = state[0].x.x & 7; - reduceDuplexRowt(0, rowa, 3, state, thread); - rowa = state[0].x.x & 7; - reduceDuplexRowt(3, rowa, 6, state, thread); - rowa = state[0].x.x & 7; - reduceDuplexRowt(6, rowa, 1, state, thread); - rowa = state[0].x.x & 7; - reduceDuplexRowt(1, rowa, 4, state, thread); - rowa = state[0].x.x & 7; - reduceDuplexRowt(4, rowa, 7, state, thread); - rowa = state[0].x.x & 7; - reduceDuplexRowt(7, rowa, 2, state, thread); - rowa = state[0].x.x & 7; - reduceDuplexRowt(2, rowa, 5, state, thread); - - const int32_t shift = (memshift * 8 * rowa + 256 * thread); - - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= __ldg4(&(DMatrix + shift)[j]); +#if __CUDA_ARCH__ < 300 +__global__ __launch_bounds__(TPB20, 1) +#elif __CUDA_ARCH__ < 500 +__global__ __launch_bounds__(TPB30, 1) +#elif __CUDA_ARCH__ == 500 +__global__ __launch_bounds__(TPB50, 1) +#else +__global__ __launch_bounds__(TPB52, 1) +#endif +void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) +{ + const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y; + + if (thread < threads) + { + uint2 state[4]; + state[0] = __ldg(&DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x]); + state[1] = __ldg(&DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x]); + state[2] = __ldg(&DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x]); + state[3] = __ldg(&DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x]); + + reduceDuplex(state, thread, threads); + + reduceDuplexRowSetup(1, 0, 2, state, thread, threads); + reduceDuplexRowSetup(2, 1, 3, state, thread, threads); + reduceDuplexRowSetup(3, 0, 4, state, thread, threads); + reduceDuplexRowSetup(4, 3, 5, state, thread, threads); + reduceDuplexRowSetup(5, 2, 6, state, thread, threads); + reduceDuplexRowSetup(6, 1, 7, state, thread, threads); + + uint32_t rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(7, rowa, 0, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(0, rowa, 3, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(3, rowa, 6, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(6, rowa, 1, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(1, rowa, 4, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(4, rowa, 7, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(7, 
rowa, 2, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt_8(rowa, state, thread, threads); + + DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x] = state[0]; + DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x] = state[1]; + DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x] = state[2]; + DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x] = state[3]; + } +} + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +{ + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + + uint28 state[4]; + + if (thread < threads) + { + state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]); + state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]); + state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]); + state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]); for (int i = 0; i < 12; i++) round_lyra(state); - g_hash[thread] = ((uint2*)state)[0]; - g_hash[thread + threads] = ((uint2*)state)[1]; - g_hash[thread + threads*2] = ((uint2*)state)[2]; - g_hash[thread + threads*3] = ((uint2*)state)[3]; - } + g_hash[thread + threads * 0] = state[0].x; + g_hash[thread + threads * 1] = state[0].y; + g_hash[thread + threads * 2] = state[0].z; + g_hash[thread + threads * 3] = state[0].w; + + } //thread } #else +#if __CUDA_ARCH__ < 500 + /* for unsupported SM arch */ __device__ void* DMatrix; -__global__ void lyra2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} +#endif +__global__ void lyra2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {} +__global__ void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} #endif __host__ -void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t* d_matrix) +void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix) { - cuda_get_arch(thr_id); + int dev_id = device_map[thr_id % MAX_GPUS]; + // just assign the device pointer allocated in main loop cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice); } __host__ -void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order) +void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, bool gtx750ti) { int dev_id = device_map[thr_id % MAX_GPUS]; + uint32_t tpb = TPB52; - if (device_sm[dev_id] == 500) tpb = TPB50; - if (device_sm[dev_id] == 350) tpb = TPB30; // to enhance (or not) - if (device_sm[dev_id] <= 300) tpb = TPB30; - dim3 grid((threads + tpb - 1) / tpb); - dim3 block(tpb); + if (cuda_arch[dev_id] >= 520) tpb = TPB52; + else if (cuda_arch[dev_id] >= 500) tpb = TPB50; + else if (cuda_arch[dev_id] >= 200) tpb = TPB20; - if (device_sm[dev_id] >= 500) - lyra2_gpu_hash_32 <<< grid, block >>> (threads, startNounce, (uint2*)d_hash); - else - lyra2_gpu_hash_32_sm2 <<< grid, block >>> (threads, startNounce, d_hash); + dim3 grid1((threads * 4 + tpb - 1) / tpb); + dim3 block1(4, tpb >> 2); + dim3 grid2((threads + 64 - 1) / 64); + dim3 block2(64); + + dim3 grid3((threads + tpb - 1) / tpb); + dim3 block3(tpb); + + size_t shared_mem = 0; + + //if (cuda_arch[dev_id] < 500) cudaFuncSetCacheConfig(lyra2_gpu_hash_32_2, cudaFuncCachePreferShared); + + if (cuda_arch[dev_id] >= 520) + { + lyra2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash); + + lyra2_gpu_hash_32_2 <<< grid1, block1, 24 * 8 * sizeof(uint2) * tpb >>> (threads,
startNounce, d_hash); + + lyra2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash); + } + else if (cuda_arch[dev_id] >= 500) + { + if (gtx750ti) + // reserve 8192 bytes to adjust for 8 warps + shared_mem = 8192; + else + // reserve 6144 bytes to adjust for 10 warps + shared_mem = 6144; + + + lyra2_gpu_hash_32_1_sm5 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash); + + lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, startNounce, (uint2*)d_hash); + + lyra2_gpu_hash_32_3_sm5 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash); + } + else + lyra2_gpu_hash_32_sm2 <<< grid3, block3 >>> (threads, startNounce, d_hash); } diff --git a/lyra2/cuda_lyra2_sm2.cuh b/lyra2/cuda_lyra2_sm2.cuh index 7998d17..94e8756 100644 --- a/lyra2/cuda_lyra2_sm2.cuh +++ b/lyra2/cuda_lyra2_sm2.cuh @@ -3,15 +3,16 @@ #ifdef __INTELLISENSE__ /* just for vstudio code colors */ #undef __CUDA_ARCH__ -#define __CUDA_ARCH__ 300 +#define __CUDA_ARCH__ 500 #endif #include "cuda_helper.h" #define TPB30 160 +#define TPB20 160 #if (__CUDA_ARCH__ >= 200 && __CUDA_ARCH__ <= 350) || !defined(__CUDA_ARCH__) -__constant__ static uint2 blake2b_IV[8] = { +__constant__ static uint2 blake2b_IV_sm2[8] = { { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, @@ -149,7 +150,7 @@ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_h #pragma unroll for (int i = 0; i<8; i++) { - state[i + 8] = blake2b_IV[i]; + state[i + 8] = blake2b_IV_sm2[i]; } // blake2blyra x2 diff --git a/lyra2/cuda_lyra2_sm5.cuh b/lyra2/cuda_lyra2_sm5.cuh new file mode 100644 index 0000000..1db4e63 --- /dev/null +++ b/lyra2/cuda_lyra2_sm5.cuh @@ -0,0 +1,701 @@ +#include <memory.h> + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#undef __CUDA_ARCH__ +#define __CUDA_ARCH__ 500 +#endif + +#include "cuda_helper.h" + +#define TPB50 32 + +#if __CUDA_ARCH__ == 500 +#include "cuda_lyra2_vectors.h" + +#define Nrow 8 +#define Ncol 8 +#define memshift 3 + +__device__ uint2 *DMatrix; + +__device__ __forceinline__ uint2 LD4S(const int index) +{ + extern __shared__ uint2 shared_mem[]; + + return shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x]; +} + +__device__ __forceinline__ void ST4S(const int index, const uint2 data) +{ + extern __shared__ uint2 shared_mem[]; + + shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data; +} + +#if __CUDA_ARCH__ >= 300 +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + return __shfl(a, b, c); +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c)); +} + +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + a1 = WarpShuffle(a1, b1, c); + a2 = WarpShuffle(a2, b2, c); + a3 = WarpShuffle(a3, b3, c); +} + +#else +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + uint32_t *_ptr = (uint32_t*)shared_mem; + + __threadfence_block(); + uint32_t buf = _ptr[thread]; + + _ptr[thread] = a; + __threadfence_block(); + uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + _ptr[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t
thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a; + __threadfence_block(); + uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a1; + __threadfence_block(); + a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a2; + __threadfence_block(); + a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a3; + __threadfence_block(); + a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + __threadfence_block(); +} + +#endif + +static __device__ __forceinline__ +void Gfunc(uint2 &a, uint2 &b, uint2 &c, uint2 &d) +{ + a += b; d ^= a; d = SWAPUINT2(d); + c += d; b ^= c; b = ROR2(b, 24); + a += b; d ^= a; d = ROR2(d, 16); + c += d; b ^= c; b = ROR2(b, 63); +} + +__device__ __forceinline__ void round_lyra(uint2 s[4]) +{ + Gfunc(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3, 4); + Gfunc(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1, 4); +} + +static __device__ __forceinline__ +void round_lyra(uint2x4* s) +{ + Gfunc(s[0].x, s[1].x, s[2].x, s[3].x); + Gfunc(s[0].y, s[1].y, s[2].y, s[3].y); + Gfunc(s[0].z, s[1].z, s[2].z, s[3].z); + Gfunc(s[0].w, s[1].w, s[2].w, s[3].w); + Gfunc(s[0].x, s[1].y, s[2].z, s[3].w); + Gfunc(s[0].y, s[1].z, s[2].w, s[3].x); + Gfunc(s[0].z, s[1].w, s[2].x, s[3].y); + Gfunc(s[0].w, s[1].x, s[2].y, s[3].z); +} + +static __device__ __forceinline__ +void reduceDuplexV5(uint2 state[4], const uint32_t thread, const uint32_t threads) +{ + uint2 state1[3], state2[3]; + + const uint32_t ps0 = (memshift * Ncol * 0 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps1 = (memshift * Ncol * 1 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps2 = (memshift * Ncol * 2 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps3 = (memshift * Ncol * 3 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps4 = (memshift * Ncol * 4 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps5 = (memshift * Ncol * 5 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps6 = (memshift * Ncol * 6 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps7 = (memshift * Ncol * 7 * threads + thread)*blockDim.x + threadIdx.x; + + for (int i = 0; i < 8; i++) + { + const uint32_t s0 = memshift * Ncol * 0 + (Ncol - 1 - i) * memshift; +#pragma unroll + for (int j = 0; j < 3; j++) + ST4S(s0 + j, state[j]); + round_lyra(state); + } + + for (int i = 0; i < 8; i++) + { + const uint32_t s0 = memshift * Ncol * 0 + i * memshift; + const uint32_t s1 = ps1 + (7 - i)*memshift* threads*blockDim.x; +#pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = LD4S(s0 + j); +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + + round_lyra(state); + +#pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s1 + j*threads*blockDim.x) = state1[j] ^ state[j]; 
+ } + + // 1, 0, 2 + for (int i = 0; i < 8; i++) + { + const uint32_t s0 = memshift * Ncol * 0 + i * memshift; + const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x; + const uint32_t s2 = ps2 + (7 - i)*memshift* threads*blockDim.x; +#pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s1 + j*threads*blockDim.x); +#pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = LD4S(s0 + j); +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + +#pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s2 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + //一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + +#pragma unroll + for (int j = 0; j < 3; j++) + ST4S(s0 + j, state2[j]); + } + + // 2, 1, 3 + for (int i = 0; i < 8; i++) + { + const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x; + const uint32_t s2 = ps2 + i * memshift* threads*blockDim.x; + const uint32_t s3 = ps3 + (7 - i)*memshift* threads*blockDim.x; +#pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s2 + j*threads*blockDim.x); +#pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = *(DMatrix + s1 + j*threads*blockDim.x); +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + +#pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s3 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + //一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + +#pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s1 + j*threads*blockDim.x) = state2[j]; + } + + // 3, 0, 4 + for (int i = 0; i < 8; i++) + { + const uint32_t ls0 = memshift * Ncol * 0 + i * memshift; + const uint32_t s0 = ps0 + i * memshift* threads*blockDim.x; + const uint32_t s3 = ps3 + i * memshift* threads*blockDim.x; + const uint32_t s4 = ps4 + (7 - i)*memshift* threads*blockDim.x; +#pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s3 + j*threads*blockDim.x); +#pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = LD4S(ls0 + j); +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + +#pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s4 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + //一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + +#pragma unroll + for (int j = 0; j < 
3; j++) + *(DMatrix + s0 + j*threads*blockDim.x) = state2[j]; + } + + // 4, 3, 5 + for (int i = 0; i < 8; i++) + { + const uint32_t s3 = ps3 + i * memshift* threads*blockDim.x; + const uint32_t s4 = ps4 + i * memshift* threads*blockDim.x; + const uint32_t s5 = ps5 + (7 - i)*memshift* threads*blockDim.x; +#pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s4 + j*threads*blockDim.x); +#pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = *(DMatrix + s3 + j*threads*blockDim.x); +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + +#pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s5 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + //一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + +#pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s3 + j*threads*blockDim.x) = state2[j]; + } + + // 5, 2, 6 + for (int i = 0; i < 8; i++) + { + const uint32_t s2 = ps2 + i * memshift* threads*blockDim.x; + const uint32_t s5 = ps5 + i * memshift* threads*blockDim.x; + const uint32_t s6 = ps6 + (7 - i)*memshift* threads*blockDim.x; +#pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s5 + j*threads*blockDim.x); +#pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = *(DMatrix + s2 + j*threads*blockDim.x); +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + +#pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s6 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + //一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + +#pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s2 + j*threads*blockDim.x) = state2[j]; + } + + // 6, 1, 7 + for (int i = 0; i < 8; i++) + { + const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x; + const uint32_t s6 = ps6 + i * memshift* threads*blockDim.x; + const uint32_t s7 = ps7 + (7 - i)*memshift* threads*blockDim.x; +#pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s6 + j*threads*blockDim.x); +#pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = *(DMatrix + s1 + j*threads*blockDim.x); +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + +#pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s7 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + //一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + 
state2[1] ^= Data1; + state2[2] ^= Data2; + } + +#pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s1 + j*threads*blockDim.x) = state2[j]; + } +} + +static __device__ __forceinline__ +void reduceDuplexRowV50(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads) +{ + const uint32_t ps1 = (memshift * Ncol * rowIn*threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps2 = (memshift * Ncol * rowInOut *threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps3 = (memshift * Ncol * rowOut*threads + thread)*blockDim.x + threadIdx.x; + +#pragma unroll 1 + for (int i = 0; i < 8; i++) + { + uint2 state1[3], state2[3]; + + const uint32_t s1 = ps1 + i*memshift*threads *blockDim.x; + const uint32_t s2 = ps2 + i*memshift*threads *blockDim.x; + const uint32_t s3 = ps3 + i*memshift*threads *blockDim.x; + +#pragma unroll + for (int j = 0; j < 3; j++) { + state1[j] = *(DMatrix + s1 + j*threads*blockDim.x); + state2[j] = *(DMatrix + s2 + j*threads*blockDim.x); + } + +#pragma unroll + for (int j = 0; j < 3; j++) { + state1[j] += state2[j]; + state[j] ^= state1[j]; + } + + round_lyra(state); + + //一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + +#pragma unroll + for (int j = 0; j < 3; j++) + { + *(DMatrix + s2 + j*threads*blockDim.x) = state2[j]; + *(DMatrix + s3 + j*threads*blockDim.x) ^= state[j]; + } + } +} + +static __device__ __forceinline__ +void reduceDuplexRowV50_8(const int rowInOut, uint2 state[4], const uint32_t thread, const uint32_t threads) +{ + const uint32_t ps1 = (memshift * Ncol * 2*threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps2 = (memshift * Ncol * rowInOut *threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps3 = (memshift * Ncol * 5*threads + thread)*blockDim.x + threadIdx.x; + + uint2 state1[3], last[3]; + +#pragma unroll + for (int j = 0; j < 3; j++) { + state1[j] = *(DMatrix + ps1 + j*threads*blockDim.x); + last[j] = *(DMatrix + ps2 + j*threads*blockDim.x); + } + +#pragma unroll + for (int j = 0; j < 3; j++) { + state1[j] += last[j]; + state[j] ^= state1[j]; + } + + round_lyra(state); + + //一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + last[0] ^= Data2; + last[1] ^= Data0; + last[2] ^= Data1; + } + else + { + last[0] ^= Data0; + last[1] ^= Data1; + last[2] ^= Data2; + } + + if (rowInOut == 5) + { +#pragma unroll + for (int j = 0; j < 3; j++) + last[j] ^= state[j]; + } + + for (int i = 1; i < 8; i++) + { + const uint32_t s1 = ps1 + i*memshift*threads *blockDim.x; + const uint32_t s2 = ps2 + i*memshift*threads *blockDim.x; + +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= *(DMatrix + s1 + j*threads*blockDim.x) + *(DMatrix + s2 + j*threads*blockDim.x); + + round_lyra(state); + } + + +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= last[j]; + +} + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_32_1_sm5(uint32_t 
threads, uint32_t startNounce, uint2 *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + const uint2x4 blake2b_IV[2] = { + { { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } }, + { { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } } + }; + + if (thread < threads) + { + uint2x4 state[4]; + + ((uint2*)state)[0] = __ldg(&g_hash[thread]); + ((uint2*)state)[1] = __ldg(&g_hash[thread + threads]); + ((uint2*)state)[2] = __ldg(&g_hash[thread + threads * 2]); + ((uint2*)state)[3] = __ldg(&g_hash[thread + threads * 3]); + + state[1] = state[0]; + state[2] = blake2b_IV[0]; + state[3] = blake2b_IV[1]; + + for (int i = 0; i < 24; i++) + round_lyra(state); //because 12 is not enough + + ((uint2x4*)DMatrix)[0 * threads + thread] = state[0]; + ((uint2x4*)DMatrix)[1 * threads + thread] = state[1]; + ((uint2x4*)DMatrix)[2 * threads + thread] = state[2]; + ((uint2x4*)DMatrix)[3 * threads + thread] = state[3]; + } +} + +__global__ __launch_bounds__(TPB50, 1) +void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +{ + const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y); + + if (thread < threads) + { + uint2 state[4]; + + state[0] = __ldg(&DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x]); + state[1] = __ldg(&DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x]); + state[2] = __ldg(&DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x]); + state[3] = __ldg(&DMatrix[(3 * threads + thread)*blockDim.x + threadIdx.x]); + + reduceDuplexV5(state, thread, threads); + + uint32_t rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(7, rowa, 0, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(0, rowa, 3, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(3, rowa, 6, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(6, rowa, 1, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(1, rowa, 4, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(4, rowa, 7, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(7, rowa, 2, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50_8(rowa, state, thread, threads); + + DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x] = state[0]; + DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x] = state[1]; + DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x] = state[2]; + DMatrix[(3 * threads + thread)*blockDim.x + threadIdx.x] = state[3]; + } +} + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + if (thread < threads) + { + uint2x4 state[4]; + + state[0] = __ldg4(&((uint2x4*)DMatrix)[0 * threads + thread]); + state[1] = __ldg4(&((uint2x4*)DMatrix)[1 * threads + thread]); + state[2] = __ldg4(&((uint2x4*)DMatrix)[2 * threads + thread]); + state[3] = __ldg4(&((uint2x4*)DMatrix)[3 * threads + thread]); + + for (int i = 0; i < 12; i++) + round_lyra(state); + + g_hash[thread] = ((uint2*)state)[0]; + g_hash[thread + threads] = ((uint2*)state)[1]; + g_hash[thread + threads * 2] = ((uint2*)state)[2]; + g_hash[thread + threads * 3] = ((uint2*)state)[3]; + } +} + 
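[Editorial note, not part of the original patch] The 6144/8192-byte dynamic shared memory sizes that the host launcher in cuda_lyra2.cu passes to these *_sm5 kernels follow from the LD4S/ST4S layout defined at the top of this file: only matrix row 0 is cached in shared memory, i.e. memshift * Ncol = 24 uint2 words per thread, for a 32-thread (TPB50) block. A minimal sketch of that arithmetic, using a hypothetical helper name for illustration only:

static size_t lyra2_sm5_shared_bytes(uint32_t tpb)   /* hypothetical helper, editorial sketch */
{
	const size_t memshift_ = 3, ncol_ = 8;                 /* matches the #defines above */
	const size_t entries_per_thread = memshift_ * ncol_;   /* 24 uint2 words: one matrix row per lane */
	return entries_per_thread * sizeof(uint2) * tpb;       /* 24 * 8 B * 32 threads = 6144 bytes */
}

6144 bytes is the minimum the kernels need; rounding up to 8192 bytes on the GTX 750 Ti pads each block so that at most 8 warps stay resident per SM instead of 10, which is what the comments next to those constants in the launcher refer to.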
+#else +/* if __CUDA_ARCH__ != 500 .. host */ +__global__ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} +#endif diff --git a/lyra2/cuda_lyra2v2.cu b/lyra2/cuda_lyra2v2.cu index c6c4d1a..265d433 100644 --- a/lyra2/cuda_lyra2v2.cu +++ b/lyra2/cuda_lyra2v2.cu @@ -2,35 +2,152 @@ #include #include -#define TPB52 8 -#define TPB50 16 - -#include "cuda_lyra2v2_sm3.cuh" +#define TPB52 32 +#define TPB50 32 +#define TPB30 32 +#define TPB20 32 #ifdef __INTELLISENSE__ /* just for vstudio code colors */ -#define __CUDA_ARCH__ 500 +#define __CUDA_ARCH__ 200 #endif -#if __CUDA_ARCH__ >= 500 - #include "cuda_lyra2_vectors.h" +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#if __CUDA_ARCH__ >= 300 +__device__ uint32_t __shfl(uint32_t a, uint32_t b, uint32_t c); +#endif +#endif + #define Nrow 4 #define Ncol 4 #define memshift 3 -__device__ uint2x4 *DMatrix; +__device__ uint2x4 *DState; + +__device__ __forceinline__ uint2 LD4S(const int index) +{ + extern __shared__ uint2 shared_mem[]; + + return shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x]; +} + +__device__ __forceinline__ void ST4S(const int index, const uint2 data) +{ + extern __shared__ uint2 shared_mem[]; + + shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data; +} __device__ __forceinline__ void Gfunc_v5(uint2 &a, uint2 &b, uint2 &c, uint2 &d) { - a += b; d ^= a; d = SWAPUINT2(d); - c += d; b ^= c; b = ROR2(b, 24); - a += b; d ^= a; d = ROR2(d, 16); + a += b; d = eorswap32(a, d); + c += d; b ^= c; b = ROR24(b); + a += b; d ^= a; d = ROR16(d); c += d; b ^= c; b = ROR2(b, 63); } + +#if __CUDA_ARCH__ >= 300 +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + return __shfl(a, b, c); +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c)); +} + +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + a1 = WarpShuffle(a1, b1, c); + a2 = WarpShuffle(a2, b2, c); + a3 = WarpShuffle(a3, b3, c); +} + +#else +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + uint32_t *_ptr = (uint32_t*)shared_mem; + + __threadfence_block(); + uint32_t buf = _ptr[thread]; + + _ptr[thread] = a; + __threadfence_block(); + uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + _ptr[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a; + __threadfence_block(); + uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = 
blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a1; + __threadfence_block(); + a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a2; + __threadfence_block(); + a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a3; + __threadfence_block(); + a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + __threadfence_block(); +} + +#endif + + +__device__ __forceinline__ void round_lyra_v35(uint2 s[4]) +{ + Gfunc_v5(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3, 4); + Gfunc_v5(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1, 4); +} + __device__ __forceinline__ void round_lyra_v5(uint2x4* s) { @@ -45,145 +162,142 @@ void round_lyra_v5(uint2x4* s) Gfunc_v5(s[0].w, s[1].x, s[2].y, s[3].z); } -__device__ __forceinline__ -void reduceDuplex(uint2x4 state[4], const uint32_t thread) + +__device__ __forceinline__ void reduceDuplexRowSetupV2(uint2 state[4]) { - uint2x4 state1[3]; - const uint32_t ps1 = (Nrow * Ncol * memshift * thread); - const uint32_t ps2 = (memshift * (Ncol-1) + memshift * Ncol + Nrow * Ncol * memshift * thread); + int i, j; + uint2 state1[Ncol][3], state0[Ncol][3], state2[3]; - #pragma unroll 4 +#if __CUDA_ARCH__ > 500 +#pragma unroll +#endif for (int i = 0; i < Ncol; i++) { - uint32_t s1 = ps1 + i*memshift; - uint32_t s2 = ps2 - i*memshift; - - #pragma unroll - for (int j = 0; j < 3; j++) - state1[j] = __ldg4(&(DMatrix+s1)[j]); - - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state1[j]; - - round_lyra_v5(state); - - #pragma unroll - for (int j = 0; j < 3; j++) - state1[j] ^= state[j]; - - #pragma unroll - for (int j = 0; j < 3; j++) - (DMatrix + s2)[j] = state1[j]; +#pragma unroll + for (j = 0; j < 3; j++) + state0[Ncol - i - 1][j] = state[j]; + round_lyra_v35(state); } -} - -__device__ __forceinline__ -void reduceDuplex50(uint2x4 state[4], const uint32_t thread) -{ - const uint32_t ps1 = (Nrow * Ncol * memshift * thread); - const uint32_t ps2 = (memshift * (Ncol - 1) + memshift * Ncol + Nrow * Ncol * memshift * thread); - #pragma unroll 4 - for (int i = 0; i < Ncol; i++) + //#pragma unroll 4 + for (i = 0; i < Ncol; i++) { - const uint32_t s1 = ps1 + i*memshift; - const int32_t s2 = ps2 - i*memshift; +#pragma unroll + for (j = 0; j < 3; j++) + state[j] ^= state0[i][j]; - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= __ldg4(&(DMatrix + s1)[j]); + round_lyra_v35(state); - round_lyra_v5(state); +#pragma unroll + for (j = 0; j < 3; j++) + state1[Ncol - i - 1][j] = state0[i][j]; - #pragma unroll - for (int j = 0; j < 3; j++) - (DMatrix + s2)[j] = __ldg4(&(DMatrix + s1)[j]) ^ state[j]; +#pragma unroll + for (j = 0; j < 3; j++) + state1[Ncol - i - 1][j] ^= state[j]; } -} -__device__ __forceinline__ -void reduceDuplexRowSetupV2(const int rowIn, const int rowInOut, const int rowOut, uint2x4 state[4], const uint32_t thread) -{ - uint2x4 state2[3], state1[3]; - - const uint32_t ps1 = (memshift * Ncol * rowIn + Nrow * Ncol * memshift * thread); - const uint32_t ps2 = (memshift * Ncol * rowInOut + Nrow * Ncol * memshift * thread); - const uint32_t ps3 = (memshift * (Ncol-1) + memshift * Ncol * rowOut + Nrow * Ncol * memshift * thread); - - for (int i = 0; i < Ncol; i++) + for (i = 0; i < Ncol; i++) { - const uint32_t s1 = ps1 + 
i*memshift; - const uint32_t s2 = ps2 + i*memshift; - const uint32_t s3 = ps3 - i*memshift; + const uint32_t s0 = memshift * Ncol * 0 + i * memshift; + const uint32_t s2 = memshift * Ncol * 2 + memshift * (Ncol - 1) - i*memshift; +#pragma unroll + for (j = 0; j < 3; j++) + state[j] ^= state1[i][j] + state0[i][j]; -#if __CUDA_ARCH__ == 500 + round_lyra_v35(state); - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] = state[j] ^ (__ldg4(&(DMatrix + s1)[j]) + __ldg4(&(DMatrix + s2)[j])); +#pragma unroll + for (j = 0; j < 3; j++) + state2[j] = state1[i][j]; - round_lyra_v5(state); - #pragma unroll - for (int j = 0; j < 3; j++) - state1[j] = __ldg4(&(DMatrix + s1)[j]); +#pragma unroll + for (j = 0; j < 3; j++) + state2[j] ^= state[j]; - #pragma unroll - for (int j = 0; j < 3; j++) - state2[j] = __ldg4(&(DMatrix + s2)[j]); +#pragma unroll + for (j = 0; j < 3; j++) + ST4S(s2 + j, state2[j]); - #pragma unroll - for (int j = 0; j < 3; j++) + //一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) { - state1[j] ^= state[j]; - (DMatrix + s3)[j] = state1[j]; + state0[i][0] ^= Data2; + state0[i][1] ^= Data0; + state0[i][2] ^= Data1; } - -#else /* 5.2 */ - - #pragma unroll - for (int j = 0; j < 3; j++) - state1[j] = __ldg4(&(DMatrix + s1)[j]); - #pragma unroll - for (int j = 0; j < 3; j++) - state2[j] = __ldg4(&(DMatrix + s2)[j]); - #pragma unroll - for (int j = 0; j < 3; j++) + else { - uint2x4 tmp = state1[j] + state2[j]; - state[j] ^= tmp; + state0[i][0] ^= Data0; + state0[i][1] ^= Data1; + state0[i][2] ^= Data2; } - round_lyra_v5(state); +#pragma unroll + for (j = 0; j < 3; j++) + ST4S(s0 + j, state0[i][j]); - #pragma unroll - for (int j = 0; j < 3; j++) +#pragma unroll + for (j = 0; j < 3; j++) + state0[i][j] = state2[j]; + + } + + for (i = 0; i < Ncol; i++) + { + const uint32_t s1 = memshift * Ncol * 1 + i*memshift; + const uint32_t s3 = memshift * Ncol * 3 + memshift * (Ncol - 1) - i*memshift; +#pragma unroll + for (j = 0; j < 3; j++) + state[j] ^= state1[i][j] + state0[Ncol - i - 1][j]; + + round_lyra_v35(state); + +#pragma unroll + for (j = 0; j < 3; j++) + state0[Ncol - i - 1][j] ^= state[j]; +#pragma unroll + for (j = 0; j < 3; j++) + ST4S(s3 + j, state0[Ncol - i - 1][j]); + + //一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) { - state1[j] ^= state[j]; - (DMatrix + s3)[j] = state1[j]; + state1[i][0] ^= Data2; + state1[i][1] ^= Data0; + state1[i][2] ^= Data1; } + else + { + state1[i][0] ^= Data0; + state1[i][1] ^= Data1; + state1[i][2] ^= Data2; + } -#endif - ((uint2*)state2)[0] ^= ((uint2*)state)[11]; +#pragma unroll + for (j = 0; j < 3; j++) + ST4S(s1 + j, state1[i][j]); - #pragma unroll - for (int j = 0; j < 11; j++) - ((uint2*)state2)[j+1] ^= ((uint2*)state)[j]; - #pragma unroll - for (int j = 0; j < 3; j++) - (DMatrix + s2)[j] = state2[j]; } } - -__device__ __forceinline__ -void reduceDuplexRowtV2(const int rowIn, const int rowInOut, const int rowOut, uint2x4* state, const uint32_t thread) +__device__ void reduceDuplexRowtV2(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4]) { - uint2x4 state1[3], 
state2[3]; - const uint32_t ps1 = (memshift * Ncol * rowIn + Nrow * Ncol * memshift * thread); - const uint32_t ps2 = (memshift * Ncol * rowInOut + Nrow * Ncol * memshift * thread); - const uint32_t ps3 = (memshift * Ncol * rowOut + Nrow * Ncol * memshift * thread); + uint2 state1[3], state2[3]; + const uint32_t ps1 = memshift * Ncol * rowIn; + const uint32_t ps2 = memshift * Ncol * rowInOut; + const uint32_t ps3 = memshift * Ncol * rowOut; for (int i = 0; i < Ncol; i++) { @@ -191,190 +305,268 @@ void reduceDuplexRowtV2(const int rowIn, const int rowInOut, const int rowOut, u const uint32_t s2 = ps2 + i*memshift; const uint32_t s3 = ps3 + i*memshift; - #pragma unroll +#pragma unroll for (int j = 0; j < 3; j++) - state1[j] = __ldg4(&(DMatrix + s1)[j]); + state1[j] = LD4S(s1 + j); - - #pragma unroll - for (int j = 0; j < 3; j++) - state2[j] = __ldg4(&(DMatrix + s2)[j]); - - #pragma unroll +#pragma unroll for (int j = 0; j < 3; j++) - state1[j] += state2[j]; + state2[j] = LD4S(s2 + j); - #pragma unroll +#pragma unroll for (int j = 0; j < 3; j++) - state[j] ^= state1[j]; - - round_lyra_v5(state); - - ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + state[j] ^= state1[j] + state2[j]; - #pragma unroll - for (int j = 0; j < 11; j++) - ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; + round_lyra_v35(state); -#if __CUDA_ARCH__ == 500 - if (rowInOut != rowOut) - { - #pragma unroll - for (int j = 0; j < 3; j++) - (DMatrix + s3)[j] ^= state[j]; + //一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); - } - if (rowInOut == rowOut) + if (threadIdx.x == 0) { - #pragma unroll - for (int j = 0; j < 3; j++) - state2[j] ^= state[j]; + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; } -#else - if (rowInOut != rowOut) + else { - #pragma unroll - for (int j = 0; j < 3; j++) - (DMatrix + s3)[j] ^= state[j]; - } else { - #pragma unroll - for (int j = 0; j < 3; j++) - state2[j] ^= state[j]; + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; } -#endif - #pragma unroll + +#pragma unroll + for (int j = 0; j < 3; j++) + ST4S(s2 + j, state2[j]); +#pragma unroll for (int j = 0; j < 3; j++) - (DMatrix + s2)[j] = state2[j]; + ST4S(s3 + j, LD4S(s3 + j) ^ state[j]); } } - -#if __CUDA_ARCH__ == 500 -__global__ __launch_bounds__(TPB50, 1) -#else -__global__ __launch_bounds__(TPB52, 1) -#endif -void lyra2v2_gpu_hash_32(const uint32_t threads, uint32_t startNounce, uint2 *g_hash) +__device__ void reduceDuplexRowtV2_4(const int rowInOut, uint2 state[4]) { - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const int rowIn = 2; + const int rowOut = 3; - uint2x4 blake2b_IV[2]; + int i, j; + uint2 state2[3], state1[3], last[3]; + const uint32_t ps1 = memshift * Ncol * rowIn; + const uint32_t ps2 = memshift * Ncol * rowInOut; + const uint32_t ps3 = memshift * Ncol * rowOut; - if (threadIdx.x == 0) { +#pragma unroll + for (int j = 0; j < 3; j++) + last[j] = LD4S(ps2 + j); - ((uint16*)blake2b_IV)[0] = make_uint16( - 0xf3bcc908, 0x6a09e667, 0x84caa73b, 0xbb67ae85, - 0xfe94f82b, 0x3c6ef372, 0x5f1d36f1, 0xa54ff53a, - 0xade682d1, 0x510e527f, 0x2b3e6c1f, 0x9b05688c, - 0xfb41bd6b, 0x1f83d9ab, 0x137e2179, 0x5be0cd19 - ); +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= LD4S(ps1 + j) + last[j]; + + round_lyra_v35(state); + + 
//一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + last[0] ^= Data2; + last[1] ^= Data0; + last[2] ^= Data1; + } + else + { + last[0] ^= Data0; + last[1] ^= Data1; + last[2] ^= Data2; } - if (thread < threads) + if (rowInOut == rowOut) + { +#pragma unroll + for (j = 0; j < 3; j++) + last[j] ^= state[j]; + } + + for (i = 1; i < Ncol; i++) { - uint2x4 state[4]; + const uint32_t s1 = ps1 + i*memshift; + const uint32_t s2 = ps2 + i*memshift; - ((uint2*)state)[0] = __ldg(&g_hash[thread]); - ((uint2*)state)[1] = __ldg(&g_hash[thread + threads]); - ((uint2*)state)[2] = __ldg(&g_hash[thread + threads*2]); - ((uint2*)state)[3] = __ldg(&g_hash[thread + threads*3]); +#pragma unroll + for (j = 0; j < 3; j++) + state[j] ^= LD4S(s1 + j) + LD4S(s2 + j); - state[1] = state[0]; + round_lyra_v35(state); + } + +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= last[j]; +} - state[2] = ((blake2b_IV)[0]); - state[3] = ((blake2b_IV)[1]); +__constant__ uint28 blake2b_IV[2] = { + 0xf3bcc908lu, 0x6a09e667lu, + 0x84caa73blu, 0xbb67ae85lu, + 0xfe94f82blu, 0x3c6ef372lu, + 0x5f1d36f1lu, 0xa54ff53alu, + 0xade682d1lu, 0x510e527flu, + 0x2b3e6c1flu, 0x9b05688clu, + 0xfb41bd6blu, 0x1f83d9ablu, + 0x137e2179lu, 0x5be0cd19lu +}; + +__constant__ uint28 Mask[2] = { + 0x00000020lu, 0x00000000lu, + 0x00000020lu, 0x00000000lu, + 0x00000020lu, 0x00000000lu, + 0x00000001lu, 0x00000000lu, + 0x00000004lu, 0x00000000lu, + 0x00000004lu, 0x00000000lu, + 0x00000080lu, 0x00000000lu, + 0x00000000lu, 0x01000000lu +}; + +__global__ __launch_bounds__(64, 1) +void lyra2v2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *outputHash) +{ + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + + uint28 state[4]; + + if (thread < threads) + { + state[0].x = state[1].x = __ldg(&outputHash[thread + threads * 0]); + state[0].y = state[1].y = __ldg(&outputHash[thread + threads * 1]); + state[0].z = state[1].z = __ldg(&outputHash[thread + threads * 2]); + state[0].w = state[1].w = __ldg(&outputHash[thread + threads * 3]); + state[2] = blake2b_IV[0]; + state[3] = blake2b_IV[1]; for (int i = 0; i<12; i++) round_lyra_v5(state); - ((uint2*)state)[0].x ^= 0x20; - ((uint2*)state)[1].x ^= 0x20; - ((uint2*)state)[2].x ^= 0x20; - ((uint2*)state)[3].x ^= 0x01; - ((uint2*)state)[4].x ^= 0x04; - ((uint2*)state)[5].x ^= 0x04; - ((uint2*)state)[6].x ^= 0x80; - ((uint2*)state)[7].y ^= 0x01000000; + state[0] ^= Mask[0]; + state[1] ^= Mask[1]; for (int i = 0; i<12; i++) round_lyra_v5(state); - const uint32_t ps1 = (memshift * (Ncol - 1) + Nrow * Ncol * memshift * thread); + DState[blockDim.x * gridDim.x * 0 + blockDim.x * blockIdx.x + threadIdx.x] = state[0]; + DState[blockDim.x * gridDim.x * 1 + blockDim.x * blockIdx.x + threadIdx.x] = state[1]; + DState[blockDim.x * gridDim.x * 2 + blockDim.x * blockIdx.x + threadIdx.x] = state[2]; + DState[blockDim.x * gridDim.x * 3 + blockDim.x * blockIdx.x + threadIdx.x] = state[3]; - for (int i = 0; i < Ncol; i++) - { - const uint32_t s1 = ps1 - memshift * i; - DMatrix[s1] = state[0]; - DMatrix[s1+1] = state[1]; - DMatrix[s1+2] = state[2]; - round_lyra_v5(state); - } + } //thread +} + +#if __CUDA_ARCH__ < 300 +__global__ __launch_bounds__(TPB20, 1) +#elif __CUDA_ARCH__ < 500 +__global__ __launch_bounds__(TPB30, 1) +#elif __CUDA_ARCH__ == 500 +__global__ 
__launch_bounds__(TPB50, 1) +#else +__global__ __launch_bounds__(TPB52, 1) +#endif +void lyra2v2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *outputHash) +{ + const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y; - reduceDuplex50(state, thread); + if (thread < threads) + { + uint2 state[4]; + state[0] = ((uint2*)DState)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[1] = ((uint2*)DState)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[2] = ((uint2*)DState)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[3] = ((uint2*)DState)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; - reduceDuplexRowSetupV2(1, 0, 2, state, thread); - reduceDuplexRowSetupV2(2, 1, 3, state, thread); + reduceDuplexRowSetupV2(state); uint32_t rowa; - int prev=3; + int prev = 3; - for (int i = 0; i < 4; i++) + for (int i = 0; i < 3; i++) { - rowa = ((uint2*)state)[0].x & 3; - reduceDuplexRowtV2(prev, rowa, i, state, thread); + rowa = WarpShuffle(state[0].x, 0, 4) & 3; + reduceDuplexRowtV2(prev, rowa, i, state); prev = i; } - const uint32_t shift = (memshift * Ncol * rowa + Nrow * Ncol * memshift * thread); + rowa = WarpShuffle(state[0].x, 0, 4) & 3; + reduceDuplexRowtV2_4(rowa, state); - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= __ldg4(&(DMatrix + shift)[j]); + ((uint2*)DState)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[0]; + ((uint2*)DState)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[1]; + ((uint2*)DState)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[2]; + ((uint2*)DState)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[3]; + } //thread +} + +__global__ __launch_bounds__(64, 1) +void lyra2v2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) +{ + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + + uint28 state[4]; + + if (thread < threads) + { + state[0] = __ldg4(&DState[blockDim.x * gridDim.x * 0 + blockDim.x * blockIdx.x + threadIdx.x]); + state[1] = __ldg4(&DState[blockDim.x * gridDim.x * 1 + blockDim.x * blockIdx.x + threadIdx.x]); + state[2] = __ldg4(&DState[blockDim.x * gridDim.x * 2 + blockDim.x * blockIdx.x + threadIdx.x]); + state[3] = __ldg4(&DState[blockDim.x * gridDim.x * 3 + blockDim.x * blockIdx.x + threadIdx.x]); for (int i = 0; i < 12; i++) round_lyra_v5(state); - g_hash[thread] = ((uint2*)state)[0]; - g_hash[thread + threads] = ((uint2*)state)[1]; - g_hash[thread + threads*2] = ((uint2*)state)[2]; - g_hash[thread + threads*3] = ((uint2*)state)[3]; - } + outputHash[thread + threads * 0] = state[0].x; + outputHash[thread + threads * 1] = state[0].y; + outputHash[thread + threads * 2] = state[0].z; + outputHash[thread + threads * 3] = state[0].w; + + } //thread } -#else -#include "cuda_helper.h" -#if __CUDA_ARCH__ < 200 -__device__ void* DMatrix; -#endif -__global__ void lyra2v2_gpu_hash_32(const uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} -#endif __host__ void lyra2v2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix) { - cuda_get_arch(thr_id); + int dev_id = device_map[thr_id % MAX_GPUS]; // just assign the device pointer allocated in main loop - cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(DState, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice); } __host__ void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t 
startNounce, uint64_t *g_hash, int order) { int dev_id = device_map[thr_id % MAX_GPUS]; + uint32_t tpb = TPB52; if (cuda_arch[dev_id] > 500) tpb = TPB52; else if (cuda_arch[dev_id] == 500) tpb = TPB50; - else if (cuda_arch[dev_id] >= 350) tpb = TPB35; else if (cuda_arch[dev_id] >= 300) tpb = TPB30; else if (cuda_arch[dev_id] >= 200) tpb = TPB20; - dim3 grid((threads + tpb - 1) / tpb); - dim3 block(tpb); + dim3 grid1((threads * 4 + tpb - 1) / tpb); + dim3 block1(4, tpb >> 2); - if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500) - lyra2v2_gpu_hash_32 <<< grid, block >>> (threads, startNounce, (uint2*)g_hash); - else - lyra2v2_gpu_hash_32_v3 <<< grid, block >>> (threads, startNounce, (uint2*)g_hash); + dim3 grid2((threads + 64 - 1) / 64); + dim3 block2(64); + + if (cuda_arch[dev_id] < 500) + cudaFuncSetCacheConfig(lyra2v2_gpu_hash_32_2, cudaFuncCachePreferShared); + + lyra2v2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, startNounce, (uint2*)g_hash); + + lyra2v2_gpu_hash_32_2 <<< grid1, block1, 48 * sizeof(uint2) * tpb >>> (threads, startNounce, g_hash); + lyra2v2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, startNounce, (uint2*)g_hash); //MyStreamSynchronize(NULL, order, thr_id); } diff --git a/lyra2/cuda_lyra2v2_sm3.cuh b/lyra2/cuda_lyra2v2_sm3.cuh deleted file mode 100644 index 1b20485..0000000 --- a/lyra2/cuda_lyra2v2_sm3.cuh +++ /dev/null @@ -1,338 +0,0 @@ -/* SM 2/3/3.5 Variant for lyra2REv2 */ - -#ifdef __INTELLISENSE__ -/* just for vstudio code colors */ -#undef __CUDA_ARCH__ -#define __CUDA_ARCH__ 350 -#endif - -#define TPB20 64 -#define TPB30 64 -#define TPB35 64 - -#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ < 500 - -#include "cuda_lyra2_vectors.h" - -#define Nrow 4 -#define Ncol 4 - -#define vectype ulonglong4 -#define memshift 4 - -__device__ vectype *DMatrix; - -static __device__ __forceinline__ -void Gfunc_v35(unsigned long long &a, unsigned long long &b, unsigned long long &c, unsigned long long &d) -{ - a += b; d ^= a; d = ROTR64(d, 32); - c += d; b ^= c; b = ROTR64(b, 24); - a += b; d ^= a; d = ROTR64(d, 16); - c += d; b ^= c; b = ROTR64(b, 63); -} - -static __device__ __forceinline__ -void round_lyra_v35(vectype* s) -{ - Gfunc_v35(s[0].x, s[1].x, s[2].x, s[3].x); - Gfunc_v35(s[0].y, s[1].y, s[2].y, s[3].y); - Gfunc_v35(s[0].z, s[1].z, s[2].z, s[3].z); - Gfunc_v35(s[0].w, s[1].w, s[2].w, s[3].w); - - Gfunc_v35(s[0].x, s[1].y, s[2].z, s[3].w); - Gfunc_v35(s[0].y, s[1].z, s[2].w, s[3].x); - Gfunc_v35(s[0].z, s[1].w, s[2].x, s[3].y); - Gfunc_v35(s[0].w, s[1].x, s[2].y, s[3].z); -} - -static __device__ __forceinline__ -void reduceDuplexV3(vectype state[4], uint32_t thread) -{ - vectype state1[3]; - uint32_t ps1 = (Nrow * Ncol * memshift * thread); - uint32_t ps2 = (memshift * (Ncol - 1) * Nrow + memshift * 1 + Nrow * Ncol * memshift * thread); - - #pragma unroll 4 - for (int i = 0; i < Ncol; i++) - { - uint32_t s1 = ps1 + Nrow * i *memshift; - uint32_t s2 = ps2 - Nrow * i *memshift; - - for (int j = 0; j < 3; j++) - state1[j] = __ldg4(&(DMatrix + s1)[j]); - - for (int j = 0; j < 3; j++) - state[j] ^= state1[j]; - round_lyra_v35(state); - - for (int j = 0; j < 3; j++) - state1[j] ^= state[j]; - - for (int j = 0; j < 3; j++) - (DMatrix + s2)[j] = state1[j]; - } -} - -static __device__ __forceinline__ -void reduceDuplexRowSetupV3(const int rowIn, const int rowInOut, const int rowOut, vectype state[4], uint32_t thread) -{ - vectype state2[3], state1[3]; - - uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread); - uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread); - uint32_t ps3 = (Nrow * memshift * (Ncol - 1) + memshift * rowOut + Nrow *
Ncol * memshift * thread); - - for (int i = 0; i < Ncol; i++) - { - uint32_t s1 = ps1 + Nrow*i*memshift; - uint32_t s2 = ps2 + Nrow*i*memshift; - uint32_t s3 = ps3 - Nrow*i*memshift; - - for (int j = 0; j < 3; j++) - state1[j] = __ldg4(&(DMatrix + s1 )[j]); - for (int j = 0; j < 3; j++) - state2[j] = __ldg4(&(DMatrix + s2 )[j]); - for (int j = 0; j < 3; j++) { - vectype tmp = state1[j] + state2[j]; - state[j] ^= tmp; - } - - round_lyra_v35(state); - - for (int j = 0; j < 3; j++) { - state1[j] ^= state[j]; - (DMatrix + s3)[j] = state1[j]; - } - - ((uint2*)state2)[0] ^= ((uint2*)state)[11]; - for (int j = 0; j < 11; j++) - ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; - - for (int j = 0; j < 3; j++) - (DMatrix + s2)[j] = state2[j]; - } -} - -static __device__ __forceinline__ -void reduceDuplexRowtV3(const int rowIn, const int rowInOut, const int rowOut, vectype* state, uint32_t thread) -{ - vectype state1[3], state2[3]; - uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread); - uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread); - uint32_t ps3 = (memshift * rowOut + Nrow * Ncol * memshift * thread); - - #pragma nounroll - for (int i = 0; i < Ncol; i++) - { - uint32_t s1 = ps1 + Nrow * i*memshift; - uint32_t s2 = ps2 + Nrow * i*memshift; - uint32_t s3 = ps3 + Nrow * i*memshift; - - for (int j = 0; j < 3; j++) - state1[j] = __ldg4(&(DMatrix + s1)[j]); - - for (int j = 0; j < 3; j++) - state2[j] = __ldg4(&(DMatrix + s2)[j]); - - for (int j = 0; j < 3; j++) - state1[j] += state2[j]; - - for (int j = 0; j < 3; j++) - state[j] ^= state1[j]; - - round_lyra_v35(state); - - ((uint2*)state2)[0] ^= ((uint2*)state)[11]; - - for (int j = 0; j < 11; j++) - ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; - - if (rowInOut != rowOut) { - - for (int j = 0; j < 3; j++) - (DMatrix + s2)[j] = state2[j]; - - for (int j = 0; j < 3; j++) - (DMatrix + s3)[j] ^= state[j]; - - } else { - - for (int j = 0; j < 3; j++) - state2[j] ^= state[j]; - - for (int j = 0; j < 3; j++) - (DMatrix + s2)[j] = state2[j]; - } - } -} - -#if __CUDA_ARCH__ >= 300 -__global__ __launch_bounds__(TPB35, 1) -void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - - vectype state[4]; - vectype blake2b_IV[2]; - vectype padding[2]; - - if (threadIdx.x == 0) { - - ((uint16*)blake2b_IV)[0] = make_uint16( - 0xf3bcc908, 0x6a09e667 , 0x84caa73b, 0xbb67ae85, - 0xfe94f82b, 0x3c6ef372 , 0x5f1d36f1, 0xa54ff53a, - 0xade682d1, 0x510e527f , 0x2b3e6c1f, 0x9b05688c, - 0xfb41bd6b, 0x1f83d9ab , 0x137e2179, 0x5be0cd19 - ); - ((uint16*)padding)[0] = make_uint16( - 0x20, 0x0 , 0x20, 0x0 , 0x20, 0x0 , 0x01, 0x0, - 0x04, 0x0 , 0x04, 0x0 , 0x80, 0x0 , 0x0, 0x01000000 - ); - } - - if (thread < threads) - { - ((uint2*)state)[0] = __ldg(&outputHash[thread]); - ((uint2*)state)[1] = __ldg(&outputHash[thread + threads]); - ((uint2*)state)[2] = __ldg(&outputHash[thread + 2 * threads]); - ((uint2*)state)[3] = __ldg(&outputHash[thread + 3 * threads]); - - state[1] = state[0]; - state[2] = shuffle4(((vectype*)blake2b_IV)[0], 0); - state[3] = shuffle4(((vectype*)blake2b_IV)[1], 0); - - for (int i = 0; i<12; i++) - round_lyra_v35(state); - - state[0] ^= shuffle4(((vectype*)padding)[0], 0); - state[1] ^= shuffle4(((vectype*)padding)[1], 0); - - for (int i = 0; i<12; i++) - round_lyra_v35(state); - - uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread); - - //#pragma unroll 4 - for (int i = 0; i < 4; i++) - { - uint32_t s1 = ps1 - 4 * memshift * 
i; - for (int j = 0; j < 3; j++) - (DMatrix + s1)[j] = (state)[j]; - - round_lyra_v35(state); - } - - reduceDuplexV3(state, thread); - reduceDuplexRowSetupV3(1, 0, 2, state, thread); - reduceDuplexRowSetupV3(2, 1, 3, state, thread); - - uint32_t rowa; - int prev = 3; - for (int i = 0; i < 4; i++) - { - rowa = ((uint2*)state)[0].x & 3; reduceDuplexRowtV3(prev, rowa, i, state, thread); - prev = i; - } - - uint32_t shift = (memshift * rowa + 16 * memshift * thread); - - for (int j = 0; j < 3; j++) - state[j] ^= __ldg4(&(DMatrix + shift)[j]); - - for (int i = 0; i < 12; i++) - round_lyra_v35(state); - - outputHash[thread] = ((uint2*)state)[0]; - outputHash[thread + threads] = ((uint2*)state)[1]; - outputHash[thread + 2 * threads] = ((uint2*)state)[2]; - outputHash[thread + 3 * threads] = ((uint2*)state)[3]; - - } //thread -} -#elif __CUDA_ARCH__ >= 200 -__global__ __launch_bounds__(TPB20, 1) -void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - - vectype state[4]; - vectype blake2b_IV[2]; - vectype padding[2]; - - ((uint16*)blake2b_IV)[0] = make_uint16( - 0xf3bcc908, 0x6a09e667, 0x84caa73b, 0xbb67ae85, - 0xfe94f82b, 0x3c6ef372, 0x5f1d36f1, 0xa54ff53a, - 0xade682d1, 0x510e527f, 0x2b3e6c1f, 0x9b05688c, - 0xfb41bd6b, 0x1f83d9ab, 0x137e2179, 0x5be0cd19 - ); - ((uint16*)padding)[0] = make_uint16( - 0x20, 0x0, 0x20, 0x0, 0x20, 0x0, 0x01, 0x0, - 0x04, 0x0, 0x04, 0x0, 0x80, 0x0, 0x0, 0x01000000 - ); - - if (thread < threads) - { - - ((uint2*)state)[0] = outputHash[thread]; - ((uint2*)state)[1] = outputHash[thread + threads]; - ((uint2*)state)[2] = outputHash[thread + 2 * threads]; - ((uint2*)state)[3] = outputHash[thread + 3 * threads]; - - state[1] = state[0]; - state[2] = ((vectype*)blake2b_IV)[0]; - state[3] = ((vectype*)blake2b_IV)[1]; - - for (int i = 0; i<12; i++) - round_lyra_v35(state); - - state[0] ^= ((vectype*)padding)[0]; - state[1] ^= ((vectype*)padding)[1]; - - for (int i = 0; i<12; i++) - round_lyra_v35(state); - - uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread); - - //#pragma unroll 4 - for (int i = 0; i < 4; i++) - { - uint32_t s1 = ps1 - 4 * memshift * i; - for (int j = 0; j < 3; j++) - (DMatrix + s1)[j] = (state)[j]; - - round_lyra_v35(state); - } - - reduceDuplexV3(state, thread); - reduceDuplexRowSetupV3(1, 0, 2, state, thread); - reduceDuplexRowSetupV3(2, 1, 3, state, thread); - - uint32_t rowa; - int prev = 3; - for (int i = 0; i < 4; i++) - { - rowa = ((uint2*)state)[0].x & 3; reduceDuplexRowtV3(prev, rowa, i, state, thread); - prev = i; - } - - uint32_t shift = (memshift * rowa + 16 * memshift * thread); - - for (int j = 0; j < 3; j++) - state[j] ^= __ldg4(&(DMatrix + shift)[j]); - - for (int i = 0; i < 12; i++) - round_lyra_v35(state); - - outputHash[thread] = ((uint2*)state)[0]; - outputHash[thread + threads] = ((uint2*)state)[1]; - outputHash[thread + 2 * threads] = ((uint2*)state)[2]; - outputHash[thread + 3 * threads] = ((uint2*)state)[3]; - - } //thread -} -#endif - -#else -/* host & sm5+ */ -__global__ void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) {} -#endif diff --git a/lyra2/lyra2RE.cu b/lyra2/lyra2RE.cu index d74bb16..463fa4a 100644 --- a/lyra2/lyra2RE.cu +++ b/lyra2/lyra2RE.cu @@ -23,7 +23,7 @@ extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNon extern void skein256_cpu_init(int thr_id, uint32_t threads); extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix); 
-extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); +extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, bool gtx750ti); extern void groestl256_cpu_init(int thr_id, uint32_t threads); extern void groestl256_cpu_free(int thr_id); @@ -85,30 +85,49 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; - int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 17 : 16; - uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4; - if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) - ptarget[7] = 0x000f; + ptarget[7] = 0x00ff; + static bool gtx750ti; + static uint32_t throughput[MAX_GPUS]; if (!init[thr_id]) { - cudaSetDevice(device_map[thr_id]); + int dev_id = device_map[thr_id]; + cudaSetDevice(dev_id); CUDA_LOG_ERROR(); - blake256_cpu_init(thr_id, throughput); - keccak256_cpu_init(thr_id,throughput); - skein256_cpu_init(thr_id, throughput); - groestl256_cpu_init(thr_id, throughput); + int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 17 : 16; + if (device_sm[device_map[thr_id]] == 500) intensity = 15; + int temp = intensity; + throughput[thr_id] = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4; + if (init[thr_id]) throughput[thr_id] = min(throughput[thr_id], max_nonce - first_nonce); - // DMatrix - cudaMalloc(&d_matrix[thr_id], (size_t)16 * 8 * 8 * sizeof(uint64_t) * throughput); - lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]); + cudaDeviceProp props; + cudaGetDeviceProperties(&props, dev_id); - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput)); + if (strstr(props.name, "750 Ti")) gtx750ti = true; + else gtx750ti = false; + + //blake256_cpu_init(thr_id, throughput); + keccak256_cpu_init(thr_id, throughput[thr_id]); + skein256_cpu_init(thr_id, throughput[thr_id]); + groestl256_cpu_init(thr_id, throughput[thr_id]); + + if (device_sm[dev_id] >= 500) + { + size_t matrix_sz = device_sm[dev_id] > 500 ? 
sizeof(uint64_t) * 4 * 4 : sizeof(uint64_t) * 8 * 8 * 3 * 4; + CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput[thr_id])); + lyra2_cpu_init(thr_id, throughput[thr_id], d_matrix[thr_id]); + } + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput[thr_id])); init[thr_id] = true; + if (temp != intensity){ + gpulog(LOG_INFO, thr_id, "Intensity set to %u, %u cuda threads", + intensity, throughput[thr_id]); + } } uint32_t _ALIGN(128) endiandata[20]; @@ -122,15 +141,15 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, int order = 0; uint32_t foundNonce; - blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + blake256_cpu_hash_80(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++); + keccak256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++); + lyra2_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], gtx750ti); + skein256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++); TRACE("S") - *hashes_done = pdata[19] - first_nonce + throughput; + *hashes_done = pdata[19] - first_nonce + throughput[thr_id]; - foundNonce = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + foundNonce = groestl256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++); if (foundNonce != UINT32_MAX) { uint32_t _ALIGN(64) vhash64[8]; @@ -162,11 +181,11 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, } } - if ((uint64_t)throughput + pdata[19] >= max_nonce) { + if ((uint64_t)throughput[thr_id] + pdata[19] >= max_nonce) { pdata[19] = max_nonce; break; } - pdata[19] += throughput; + pdata[19] += throughput[thr_id]; } while (!work_restart[thr_id].restart); diff --git a/lyra2/lyra2REv2.cu b/lyra2/lyra2REv2.cu index 2308d0c..9a14c4d 100644 --- a/lyra2/lyra2REv2.cu +++ b/lyra2/lyra2REv2.cu @@ -10,6 +10,7 @@ extern "C" { #include "miner.h" #include "cuda_helper.h" +#include static uint64_t *d_hash[MAX_GPUS]; static uint64_t* d_matrix[MAX_GPUS]; @@ -20,6 +21,9 @@ extern void blake256_cpu_setBlock_80(uint32_t *pdata); extern void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); extern void keccak256_cpu_init(int thr_id, uint32_t threads); extern void keccak256_cpu_free(int thr_id); +extern void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); +extern void blakeKeccakcube256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); + extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); extern void skein256_cpu_init(int thr_id, uint32_t threads); extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order); @@ -27,10 +31,11 @@ extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t start extern void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); extern void lyra2v2_cpu_init(int thr_id, uint32_t threads, uint64_t* d_matrix); -extern void bmw256_setTarget(const void 
*ptarget); +//extern void bmw256_setTarget(const void *ptarget); extern void bmw256_cpu_init(int thr_id, uint32_t threads); extern void bmw256_cpu_free(int thr_id); -extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces); +extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces, uint32_t Target); +extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces, uint32_t Target, uint32_t **result); void lyra2v2_hash(void *state, const void *input) { @@ -79,7 +84,7 @@ void lyra2v2_hash(void *state, const void *input) uint32_t* debugbuf = NULL; \ cudaMallocHost(&debugbuf, 32); \ cudaMemcpy(debugbuf, d_hash[thr_id], 32, cudaMemcpyDeviceToHost); \ - printf("lyra2 %s %08x %08x %08x %08x...%08x... \n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \ + printf("lyra2 %s %08x %08x %08x %08x...%08x... \n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \ swab32(debugbuf[2]), swab32(debugbuf[3]), swab32(debugbuf[7])); \ cudaFreeHost(debugbuf); \ } \ @@ -89,23 +94,96 @@ void lyra2v2_hash(void *state, const void *input) #endif static bool init[MAX_GPUS] = { 0 }; +static uint32_t throughput[MAX_GPUS] = { 0 }; extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) { uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; - int dev_id = device_map[thr_id]; - int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 18; - uint32_t throughput = cuda_default_throughput(dev_id, 1UL << intensity); - if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); - if (opt_benchmark) ptarget[7] = 0x000f; if (!init[thr_id]) { - size_t matrix_sz = 16 * sizeof(uint64_t) * 4 * 3; + int dev_id = device_map[thr_id]; + cudaDeviceProp props; + cudaGetDeviceProperties(&props, dev_id); + + int intensity = 0; + // Pascal + if (strstr(props.name, "1080")) intensity = 22; + else if (strstr(props.name, "1070")) intensity = 21; + // Maxwell + else if (strstr(props.name, "TITAN X")) intensity = 21; + else if (strstr(props.name, "980")) intensity = 21; + else if (strstr(props.name, "970")) intensity = 20; + else if (strstr(props.name, "960")) intensity = 20; + else if (strstr(props.name, "950")) intensity = 19; + else if (strstr(props.name, "750 Ti")) intensity = 19; + else if (strstr(props.name, "750")) intensity = 18; + // Kepler〜Fermi + else if (strstr(props.name, "TITAN Z")) intensity = 20; + else if (strstr(props.name, "TITAN")) intensity = 19; + else if (strstr(props.name, "780")) intensity = 19; + else if (strstr(props.name, "760")) intensity = 18; + else if (strstr(props.name, "730")) intensity = 16; + else if (strstr(props.name, "720")) intensity = 15; + else if (strstr(props.name, "710")) intensity = 16; + else if (strstr(props.name, "690")) intensity = 20; + else if (strstr(props.name, "680")) intensity = 19; + else if (strstr(props.name, "660")) intensity = 18; + else if (strstr(props.name, "650 Ti")) intensity = 18; + else if (strstr(props.name, "640")) intensity = 17; + else if (strstr(props.name, "630")) intensity = 16; + else if (strstr(props.name, "620")) intensity = 15; + + else if (strstr(props.name, "90")) intensity = 18; //590 + else if (strstr(props.name, "80")) intensity = 18; //480 580 + else if (strstr(props.name, "70")) intensity = 18; //470 570 670 770 + else if (strstr(props.name, 
"65")) intensity = 17; //465 + else if (strstr(props.name, "60")) intensity = 17; //460 560 + else if (strstr(props.name, "55")) intensity = 17; //555 + else if (strstr(props.name, "50")) intensity = 17; //450 550Ti 650 + else if (strstr(props.name, "45")) intensity = 16; //545 + else if (strstr(props.name, "40")) intensity = 15; //440 + else if (strstr(props.name, "30")) intensity = 15; //430 530 + else if (strstr(props.name, "20")) intensity = 14; //420 520 + else if (strstr(props.name, "10")) intensity = 14; //510 610 + + if (intensity != 0 && opt_eco_mode) intensity -= 3.0; + + if (intensity == 0) + { + intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 18; + throughput[thr_id] = cuda_default_throughput(dev_id, 1UL << (int)intensity); + } + else + { + //uint32_t adds = 0; + // double d = floor(intensity); + + /* if ((intensity - d) > 0.0) { + adds = (uint32_t)floor((intensity - d) * (1 << (int)(d - 10.0)) * 1024; + throughput = (1 << (int)d) + adds; + gpulog(LOG_INFO, thr_id, "Adding %u threads to intensity %u, %u cuda threads", + adds, (int)d, throughput); + } + else if (gpus_intensity[n] != (1 << (int)intensity)) { + throughput = (1 << (int)intensity); + applog(LOG_INFO, "Intensity set to %u, %u cuda threads", + v, gpus_intensity[n]); + } + */ + uint32_t temp = 1UL << intensity; + throughput[thr_id] = cuda_default_throughput(dev_id, temp); + + if (temp == throughput[thr_id]) + { + gpulog(LOG_INFO, thr_id, "Intensity set to %u, %u cuda threads", + intensity, throughput[thr_id]); + } + } cudaSetDevice(dev_id); if (opt_cudaschedule == -1 && gpu_threads == 1) { cudaDeviceReset(); @@ -113,52 +191,84 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); CUDA_LOG_ERROR(); } + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); - blake256_cpu_init(thr_id, throughput); - keccak256_cpu_init(thr_id,throughput); - skein256_cpu_init(thr_id, throughput); - bmw256_cpu_init(thr_id, throughput); + //blake256_cpu_init(thr_id, throughput); + //keccak256_cpu_init(thr_id,throughput); + skein256_cpu_init(thr_id, throughput[thr_id]); + bmw256_cpu_init(thr_id, throughput[thr_id]); // SM 3 implentation requires a bit more memory - if (device_sm[dev_id] < 500 || cuda_arch[dev_id] < 500) - matrix_sz = 16 * sizeof(uint64_t) * 4 * 4; - - CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput)); - lyra2v2_cpu_init(thr_id, throughput, d_matrix[thr_id]); + //if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) + // matrix_sz = 16 * sizeof(uint64_t) * 4 * 4; + //else + size_t matrix_sz = sizeof(uint64_t) * 4 * 4; + CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput[thr_id])); + lyra2v2_cpu_init(thr_id, throughput[thr_id], d_matrix[thr_id]); - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput)); + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput[thr_id])); - api_set_throughput(thr_id, throughput); + api_set_throughput(thr_id, throughput[thr_id]); init[thr_id] = true; } + else throughput[thr_id] = min(throughput[thr_id], max_nonce - first_nonce); uint32_t endiandata[20]; - for (int k=0; k < 20; k++) + for (int k = 0; k < 20; k++) be32enc(&endiandata[k], pdata[k]); blake256_cpu_setBlock_80(pdata); - bmw256_setTarget(ptarget); + //bmw256_setTarget(ptarget); + //uint32_t *vhash64[2]; do { int order = 0; uint32_t foundNonces[2] = { 0, 0 }; - blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + blakeKeccak256_cpu_hash_80(thr_id, 
throughput[thr_id], pdata[19], d_hash[thr_id], order++); + //blakeKeccakcube256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); TRACE("blake :"); - keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + //keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); TRACE("keccak :"); - cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + cubehash256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++); TRACE("cube :"); - lyra2v2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + lyra2v2_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++); TRACE("lyra2 :"); - skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + skein256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++); TRACE("skein :"); - cubehash256_cpu_hash_32(thr_id, throughput,pdata[19], d_hash[thr_id], order++); + cubehash256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++); TRACE("cube :"); - bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonces); + bmw256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], foundNonces, ptarget[7]); + //bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonces, ptarget[7], vhash64); + + *hashes_done = pdata[19] - first_nonce + throughput[thr_id]; - *hashes_done = pdata[19] - first_nonce + throughput; + + /*if (foundNonces[1] != 0) + { + if (fulltest(vhash64[0], ptarget)) + { + gpulog(LOG_WARNING, thr_id, "result two foundNonces!"); + pdata[19] = foundNonces[1]; + pdata[21] = foundNonces[0]; + work_set_target_ratio(work, vhash64[0]); + if (bn_hash_target_ratio(vhash64[1], ptarget) > work->shareratio) { + work_set_target_ratio(work, vhash64[1]); + } + return 2; + } + } + if (foundNonces[0] != 0) + { + if (fulltest(vhash64[0], ptarget)) + { + gpulog(LOG_WARNING, thr_id, "result one foundNonce!"); + pdata[19] = foundNonces[0]; + work_set_target_ratio(work, vhash64[0]); + return 1; + } + }*/ if (foundNonces[0] != 0) { @@ -176,25 +286,25 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc be32enc(&endiandata[19], foundNonces[1]); lyra2v2_hash(vhash64, endiandata); pdata[21] = foundNonces[1]; + xchg(pdata[19], pdata[21]); if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio) { work_set_target_ratio(work, vhash64); - xchg(pdata[19], pdata[21]); } res++; } return res; } - else + else if (vhash64[7] > ptarget[7]) { gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonces[0]); } } - if ((uint64_t)throughput + pdata[19] >= max_nonce) { + if ((uint64_t)throughput[thr_id] + pdata[19] >= max_nonce) { pdata[19] = max_nonce; break; } - pdata[19] += throughput; + pdata[19] += throughput[thr_id]; } while (!work_restart[thr_id].restart && !abort_flag); @@ -214,7 +324,7 @@ extern "C" void free_lyra2v2(int thr_id) cudaFree(d_matrix[thr_id]); bmw256_cpu_free(thr_id); - keccak256_cpu_free(thr_id); + //keccak256_cpu_free(thr_id); init[thr_id] = false; diff --git a/miner.h b/miner.h index 1efc579..146aa56 100644 --- a/miner.h +++ b/miner.h @@ -445,6 +445,7 @@ struct option { #endif extern int options_count(); +extern bool opt_eco_mode; extern bool opt_benchmark; extern bool opt_debug; extern bool opt_quiet; @@ -646,6 +647,9 @@ struct work { /* pok getwork txs */ uint32_t tx_count; struct tx txs[POK_MAX_TXS]; + + char *txs2; + char *workid; }; #define 
POK_BOOL_MASK 0x00008000 diff --git a/neoscrypt/cuda_neoscrypt.cu b/neoscrypt/cuda_neoscrypt.cu index e7f4b21..f163c03 100644 --- a/neoscrypt/cuda_neoscrypt.cu +++ b/neoscrypt/cuda_neoscrypt.cu @@ -1,18 +1,33 @@ +// originally from djm34 - github.com/djm34/ccminer-sp-neoscrypt + #include #include -#include "cuda_helper.h" -#include "cuda_vectors.h" /* NOT COMPATIBLE WITH SM 3.0 !!! */ +#include +#include +#include "cuda_vectors.h" + +typedef uint48 uint4x2; + +#include "miner.h" + +#ifdef __INTELLISENSE__ +#define __CUDA_ARCH__ 500 +#define __byte_perm(x,y,c) x +#define atomicExch(p,x) x +#endif + +static uint32_t* d_NNonce[MAX_GPUS]; + +__device__ uint2x4* W; +__device__ uint2x4* Tr; +__device__ uint2x4* Tr2; +__device__ uint2x4* Input; -static uint32_t *d_buffer[MAX_GPUS]; -static uint32_t *d_NNonce[MAX_GPUS]; -__constant__ uint4* W; -__constant__ uint32_t pTarget[8]; +__constant__ uint32_t c_data[64]; +__constant__ uint32_t c_target[2]; __constant__ uint32_t key_init[16]; __constant__ uint32_t input_init[16]; -__constant__ uint32_t c_data[80]; - -/// constants /// static const __constant__ uint8 BLAKE2S_IV_Vec = { 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, @@ -37,26 +52,127 @@ static const uint32_t BLAKE2S_SIGMA_host[10][16] = { { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, }; -static __constant__ uint32_t BLAKE2S_SIGMA[10][16]; +__constant__ uint32_t BLAKE2S_SIGMA[10][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, +}; -#define FASTKDF_BUFFER_SIZE 256U +#define BLOCK_SIZE 64U +#define BLAKE2S_BLOCK_SIZE 64U +#define BLAKE2S_OUT_SIZE 32U -// Blake2S +#define SALSA(a,b,c,d) { \ + t = rotateL(a+d, 7U); b ^= t; \ + t = rotateL(b+a, 9U); c ^= t; \ + t = rotateL(c+b, 13U); d ^= t; \ + t = rotateL(d+c, 18U); a ^= t; \ +} -#define BLAKE2S_BLOCK_SIZE 64U -#define BLAKE2S_OUT_SIZE 32U -#define BLAKE2S_KEY_SIZE 32U +#define shf_r_clamp32(out,a,b,shift) \ + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(out) : "r"(a), "r"(b), "r"(shift)); -#if __CUDA_ARCH__ >= 500 -#define BLAKE_G(idx0, idx1, a, b, c, d, key) { \ - idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ +__device__ __forceinline__ +static void shift256R4(uint32_t* ret, const uint8 &vec4, const uint32_t shift2) +{ +#if __CUDA_ARCH__ >= 320 + uint32_t shift = 32U - shift2; + asm("shf.r.clamp.b32 %0, 0, %1, %2;" : "=r"(ret[0]) : "r"(vec4.s0), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[1]) : "r"(vec4.s0), "r"(vec4.s1), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[2]) : "r"(vec4.s1), "r"(vec4.s2), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[3]) : "r"(vec4.s2), "r"(vec4.s3), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[4]) : "r"(vec4.s3), "r"(vec4.s4), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[5]) : "r"(vec4.s4), "r"(vec4.s5), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[6]) : "r"(vec4.s5), "r"(vec4.s6), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : 
"=r"(ret[7]) : "r"(vec4.s6), "r"(vec4.s7), "r"(shift)); + asm("shr.b32 %0, %1, %2;" : "=r"(ret[8]) : "r"(vec4.s7), "r"(shift)); +#else + // to check + shift256R(ret, vec4, shift2); +#endif +} + +#if __CUDA_ARCH__ >= 300 +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + return __shfl(a, b, c); +} + +__device__ __forceinline__ void WarpShuffle3(uint32_t &a1, uint32_t &a2, uint32_t &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + a1 = WarpShuffle(a1, b1, c); + a2 = WarpShuffle(a2, b2, c); + a3 = WarpShuffle(a3, b3, c); +} + +#else +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + __shared__ uint32_t shared_mem[32]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + shared_mem[thread] = a; + __threadfence_block(); + + uint32_t result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))]; + __threadfence_block(); + + return result; +} + +__device__ __forceinline__ void WarpShuffle3(uint32_t &a1, uint32_t &a2, uint32_t &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + __shared__ uint32_t shared_mem[32]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + shared_mem[thread] = a1; + __threadfence_block(); + + a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))]; + __threadfence_block(); + + shared_mem[thread] = a2; + __threadfence_block(); + + a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))]; + __threadfence_block(); + + shared_mem[thread] = a3; + __threadfence_block(); + + a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))]; + __threadfence_block(); +} + +#endif + +#define CHACHA_STEP(a,b,c,d) { \ a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateL(b^c, 12); \ + a += b; d = __byte_perm(d^a, 0, 0x2103); \ + c += d; b = rotateL(b^c, 7); \ +} + +#if __CUDA_ARCH__ < 500 + +#define BLAKE(a, b, c, d, key1, key2) { \ + a += key1; \ + a += b; d = rotateL(d^a, 16); \ c += d; b = rotateR(b^c, 12); \ - idx = BLAKE2S_SIGMA[idx0][idx1+1]; a += key[idx]; \ - a += b; d = __byte_perm(d^a, 0, 0x0321); \ + a += key2; \ + a += b; d = rotateR(d^a, 8); \ c += d; b = rotateR(b^c, 7); \ } -#else + #define BLAKE_G(idx0, idx1, a, b, c, d, key) { \ idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ a += b; d = rotateL(d^a, 16); \ @@ -65,39 +181,41 @@ static __constant__ uint32_t BLAKE2S_SIGMA[10][16]; a += b; d = rotateR(d^a, 8); \ c += d; b = rotateR(b^c, 7); \ } -#endif -#if __CUDA_ARCH__ >= 500 #define BLAKE_G_PRE(idx0, idx1, a, b, c, d, key) { \ a += key[idx0]; \ - a += b; d = __byte_perm(d^a, 0, 0x1032); \ + a += b; d = rotateL(d^a, 16); \ c += d; b = rotateR(b^c, 12); \ a += key[idx1]; \ - a += b; d = __byte_perm(d^a, 0, 0x0321); \ + a += b; d = rotateR(d^a, 8); \ c += d; b = rotateR(b^c, 7); \ } -#else -#define BLAKE_G_PRE(idx0, idx1, a, b, c, d, key) { \ + +#define BLAKE_G_PRE0(idx0, idx1, a, b, c, d, key) { \ + a += b; d = rotateL(d^a, 16); \ + c += d; b = rotateR(b^c, 12); \ + a += b; d = rotateR(d^a, 8); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE1(idx0, idx1, a, b, c, d, key) { \ a += key[idx0]; \ a += b; d = rotateL(d^a, 16); \ c += d; b = rotateR(b^c, 12); \ - a += key[idx1]; \ a += b; d = rotateR(d^a, 8); \ c += d; b = rotateR(b^c, 7); \ } -#endif -#define BLAKE_Ghost(idx0, idx1, a, b, c, d, key) { \ - idx = BLAKE2S_SIGMA_host[idx0][idx1]; a += key[idx]; \ - a += b; d = ROTR32(d^a,16); \ - c += d; b = ROTR32(b^c, 12); \ - idx = BLAKE2S_SIGMA_host[idx0][idx1+1]; a += key[idx]; \ - a += b; d = ROTR32(d^a,8); \ - c += d; b = ROTR32(b^c, 7); \ +#define 
BLAKE_G_PRE2(idx0, idx1, a, b, c, d, key) { \ + a += b; d = rotateL(d^a, 16); \ + c += d; b = rotateR(b^c, 12); \ + a += key[idx1]; \ + a += b; d = rotateR(d^a, 8); \ + c += d; b = rotateR(b^c, 7); \ } static __forceinline__ __device__ -void Blake2S(uint32_t * inout, const uint32_t * TheKey) +void Blake2S(uint32_t *out, const uint32_t* const __restrict__ inout, const uint32_t * const __restrict__ TheKey) { uint16 V; uint32_t idx; @@ -112,122 +230,98 @@ void Blake2S(uint32_t * inout, const uint32_t * TheKey) V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; -#if 0 - for (int x = 0; x < 10; ++x) - { - BLAKE_G(x, 0x0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G(x, 0x2, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G(x, 0x4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G(x, 0x6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G(x, 0x8, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G(x, 0xA, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G(x, 0xC, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G(x, 0xE, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - } -#else - // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - BLAKE_G_PRE(0x0, 0x1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x2, 0x3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0x4, 0x5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0x6, 0x7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0x8, 0x9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0xA, 0xB, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0xC, 0xD, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0xE, 0xF, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - BLAKE_G_PRE(0xE, 0xA, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x4, 0x8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0x9, 0xF, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0xD, 0x6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0x1, 0xC, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0x0, 0x2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0xB, 0x7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0x5, 0x3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE0(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - BLAKE_G_PRE(0xB, 0x8, V.lo.s0, 
V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0xC, 0x0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0x5, 0x2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0xF, 0xD, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0xA, 0xE, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0x3, 0x6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0x7, 0x1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0x9, 0x4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE0(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - BLAKE_G_PRE(0x7, 0x9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x3, 0x1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0xD, 0xC, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0xB, 0xE, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0x2, 0x6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0x5, 0xA, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0x4, 0x0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0xF, 0x8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE1(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE1(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - BLAKE_G_PRE(0x9, 0x0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x5, 0x7, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0x2, 0x4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0xA, 0xF, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0xE, 0x1, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0xB, 0xC, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0x6, 0x8, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0x3, 0xD, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE2(9, 0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(5, 7, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(10, 15, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(14, 1, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(11, 12, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 8, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(3, 13, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - BLAKE_G_PRE(0x2, 0xC, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x6, 0xA, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - 
BLAKE_G_PRE(0x0, 0xB, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0x8, 0x3, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0x4, 0xD, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0x7, 0x5, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0xF, 0xE, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0x1, 0x9, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE1(2, 12, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 10, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE1(0, 11, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(8, 3, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(4, 13, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(7, 5, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(15, 14, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(1, 9, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - BLAKE_G_PRE(0xC, 0x5, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x1, 0xF, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0xE, 0xD, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0x4, 0xA, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0x0, 0x7, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0x6, 0x3, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0x9, 0x2, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0x8, 0xB, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE2(12, 5, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(1, 15, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 13, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(4, 10, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 7, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(6, 3, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(9, 2, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(8, 11, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - BLAKE_G_PRE(0xD, 0xB, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x7, 0xE, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0xC, 0x1, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0x3, 0x9, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0x5, 0x0, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0xF, 0x4, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0x8, 0x6, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0x2, 0xA, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE0(13, 11, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(7, 14, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(12, 1, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(3, 9, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(5, 0, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(15, 4, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 6, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 10, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - BLAKE_G_PRE(0x6, 0xF, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0xE, 0x9, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0xB, 0x3, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0x0, 0x8, V.lo.s3, V.lo.s7, 
V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0xC, 0x2, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0xD, 0x7, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0x1, 0x4, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0xA, 0x5, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE1(6, 15, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE0(14, 9, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(11, 3, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(0, 8, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(12, 2, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(13, 7, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(1, 4, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(10, 5, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, - BLAKE_G_PRE(0xA, 0x2, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x8, 0x4, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0x7, 0x6, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0x1, 0x5, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0xF, 0xB, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0x9, 0xE, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0x3, 0xC, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0xD, 0x0, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); -#endif + BLAKE_G_PRE2(10, 2, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 4, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(7, 6, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(1, 5, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(15, 11, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(9, 14, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(3, 12, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(13, 0, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - V.lo ^= V.hi; - V.lo ^= tmpblock; + V.lo ^= V.hi ^ tmpblock; V.hi = BLAKE2S_IV_Vec; tmpblock = V.lo; @@ -235,86 +329,121 @@ void Blake2S(uint32_t * inout, const uint32_t * TheKey) V.hi.s4 ^= 128; V.hi.s6 = ~V.hi.s6; -#if 0 - for (int x = 0; x < 10; ++x) - { - BLAKE_G(x, 0x0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); - BLAKE_G(x, 0x2, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); - BLAKE_G(x, 0x4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); - BLAKE_G(x, 0x6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); - BLAKE_G(x, 0x8, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); - BLAKE_G(x, 0xA, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); - BLAKE_G(x, 0xC, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); - BLAKE_G(x, 0xE, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); - } -#else - // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - BLAKE_G_PRE(0x0, 0x1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); - BLAKE_G_PRE(0x2, 0x3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); - BLAKE_G_PRE(0x4, 0x5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); - BLAKE_G_PRE(0x6, 0x7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); - BLAKE_G_PRE(0x8, 0x9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); - BLAKE_G_PRE(0xA, 0xB, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); - BLAKE_G_PRE(0xC, 0xD, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); - BLAKE_G_PRE(0xE, 0xF, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); - + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, 
inout); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - BLAKE_G_PRE(0xE, 0xA, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); - BLAKE_G_PRE(0x4, 0x8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); - BLAKE_G_PRE(0x9, 0xF, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); - BLAKE_G_PRE(0xD, 0x6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); - BLAKE_G_PRE(0x1, 0xC, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); - BLAKE_G_PRE(0x0, 0x2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); - BLAKE_G_PRE(0xB, 0x7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); - BLAKE_G_PRE(0x5, 0x3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); - + BLAKE_G_PRE(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - BLAKE_G_PRE(0xB, 0x8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); - BLAKE_G_PRE(0xC, 0x0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); - BLAKE_G_PRE(0x5, 0x2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); - BLAKE_G_PRE(0xF, 0xD, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); - BLAKE_G_PRE(0xA, 0xE, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); - BLAKE_G_PRE(0x3, 0x6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); - BLAKE_G_PRE(0x7, 0x1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); - BLAKE_G_PRE(0x9, 0x4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); - + BLAKE_G_PRE(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - BLAKE_G_PRE(0x7, 0x9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); - BLAKE_G_PRE(0x3, 0x1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); - BLAKE_G_PRE(0xD, 0xC, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); - BLAKE_G_PRE(0xB, 0xE, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); - BLAKE_G_PRE(0x2, 0x6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); - BLAKE_G_PRE(0x5, 0xA, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); - BLAKE_G_PRE(0x4, 0x0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); - BLAKE_G_PRE(0xF, 0x8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); - - for (int x = 4; x < 10; ++x) + BLAKE_G_PRE(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + 
BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + + for (uint32_t x = 4U; x < 10U; x++) { - BLAKE_G(x, 0x0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); - BLAKE_G(x, 0x2, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); - BLAKE_G(x, 0x4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); - BLAKE_G(x, 0x6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); - BLAKE_G(x, 0x8, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); - BLAKE_G(x, 0xA, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); - BLAKE_G(x, 0xC, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); - BLAKE_G(x, 0xE, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + BLAKE_G(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); } -#endif V.lo ^= V.hi ^ tmpblock; - ((uint8*)inout)[0]=V.lo; + ((uint8*)out)[0] = V.lo; } +#endif -static __forceinline__ __host__ -void Blake2Shost(uint32_t * inout, const uint32_t * inkey) +#if __CUDA_ARCH__ >= 500 + +#define BLAKE_G(idx0, idx1, a, b, c, d, key) { \ + idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + idx = BLAKE2S_SIGMA[idx0][idx1+1]; a += key[idx]; \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE(a, b, c, d, key1,key2) { \ + a += key1; \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += key2; \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE(idx0,idx1, a, b, c, d, key) { \ + a += key[idx0]; \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += key[idx1]; \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE0(idx0,idx1, a, b, c, d, key) { \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE1(idx0,idx1, a, b, c, d, key) { \ + a += key[idx0]; \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE2(idx0,idx1, a, b, c, d, key) { \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += key[idx1]; \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +static __forceinline__ __device__ +void Blake2S_v2(uint32_t *out, const uint32_t* __restrict__ inout, const uint32_t * __restrict__ TheKey) { uint16 V; - uint32_t idx; uint8 tmpblock; - V.hi = BLAKE2S_IV_Vechost; - V.lo = BLAKE2S_IV_Vechost; + V.hi = BLAKE2S_IV_Vec; + V.lo = BLAKE2S_IV_Vec; V.lo.s0 ^= 0x01012020; // Copy input block for later @@ -322,469 +451,1100 @@ void Blake2Shost(uint32_t * inout, const uint32_t * inkey) V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; - for (int x = 0; x < 10; ++x) - { - BLAKE_Ghost(x, 0x0, V.lo.s0, V.lo.s4, 
V.hi.s0, V.hi.s4, inkey); - BLAKE_Ghost(x, 0x2, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inkey); - BLAKE_Ghost(x, 0x4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inkey); - BLAKE_Ghost(x, 0x6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inkey); - BLAKE_Ghost(x, 0x8, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inkey); - BLAKE_Ghost(x, 0xA, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inkey); - BLAKE_Ghost(x, 0xC, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inkey); - BLAKE_Ghost(x, 0xE, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inkey); - } + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + BLAKE_G_PRE0(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + BLAKE_G_PRE0(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + BLAKE_G_PRE1(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE1(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + BLAKE_G_PRE2(9, 0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(5, 7, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(10, 15, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(14, 1, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(11, 12, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 8, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(3, 13, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + BLAKE_G_PRE1(2, 12, 
V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 10, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE1(0, 11, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(8, 3, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(4, 13, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(7, 5, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(15, 14, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(1, 9, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + BLAKE_G_PRE2(12, 5, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(1, 15, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 13, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(4, 10, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 7, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(6, 3, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(9, 2, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(8, 11, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + BLAKE_G_PRE0(13, 11, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(7, 14, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(12, 1, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(3, 9, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(5, 0, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(15, 4, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 6, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 10, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + BLAKE_G_PRE1(6, 15, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE0(14, 9, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(11, 3, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(0, 8, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(12, 2, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(13, 7, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(1, 4, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(10, 5, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + BLAKE_G_PRE2(10, 2, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 4, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(7, 6, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(1, 5, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(15, 11, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(9, 14, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(3, 12, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(13, 0, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); V.lo ^= V.hi; V.lo ^= tmpblock; - V.hi = BLAKE2S_IV_Vechost; + V.hi = BLAKE2S_IV_Vec; tmpblock = V.lo; V.hi.s4 ^= 128; V.hi.s6 = ~V.hi.s6; - for (int x = 0; x < 10; ++x) - { - BLAKE_Ghost(x, 0x0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); - BLAKE_Ghost(x, 0x2, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); - BLAKE_Ghost(x, 0x4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); - BLAKE_Ghost(x, 0x6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); - BLAKE_Ghost(x, 0x8, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); - BLAKE_Ghost(x, 0xA, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); - BLAKE_Ghost(x, 0xC, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); - BLAKE_Ghost(x, 0xE, 
V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + BLAKE_G_PRE(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + BLAKE_G_PRE(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + BLAKE_G_PRE(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[9], inout[0]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[5], inout[7]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[2], inout[4]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[10], inout[15]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[14], inout[1]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[11], inout[12]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[6], inout[8]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[3], inout[13]); + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[2], inout[12]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[6], inout[10]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[0], inout[11]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[8], inout[3]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[4], inout[13]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[7], inout[5]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[15], inout[14]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[1], inout[9]); + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[12], inout[5]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[1], 
inout[15]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[14], inout[13]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[4], inout[10]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[0], inout[7]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[6], inout[3]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[9], inout[2]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[8], inout[11]); + // 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10, + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[13], inout[11]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[7], inout[14]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[12], inout[1]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[3], inout[9]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[5], inout[0]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[15], inout[4]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[8], inout[6]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[2], inout[10]); + // 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5, + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[6], inout[15]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[14], inout[9]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[11], inout[3]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[0], inout[8]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[12], inout[2]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[13], inout[7]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[1], inout[4]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[10], inout[5]); + // 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0, + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[10], inout[2]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[8], inout[4]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[7], inout[6]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[1], inout[5]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[15], inout[11]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[9], inout[14]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[3], inout[12]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[13], inout[0]); + + V.lo ^= V.hi; + V.lo ^= tmpblock; + + ((uint8*)out)[0] = V.lo; +} + +#endif /* __CUDA_ARCH__ >= 500 */ + +#define SALSA_CORE(state) { \ + uint32_t t; \ + SALSA(state.x, state.y, state.z, state.w); \ + WarpShuffle3(state.y, state.z, state.w, threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1,4); \ + SALSA(state.x, state.w, state.z, state.y); \ + WarpShuffle3(state.y, state.z, state.w, threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3,4); \ +} + +#define CHACHA_CORE_PARALLEL(state) { \ + CHACHA_STEP(state.x, state.y, state.z, state.w); \ + WarpShuffle3(state.y, state.z, state.w, threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3,4); \ + CHACHA_STEP(state.x, state.y, state.z, state.w); \ + WarpShuffle3(state.y, state.z, state.w, threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1,4); \ +} + +__forceinline__ __device__ +uint4 salsa_small_scalar_rnd(const uint4 X) +{ + uint4 state = X; + +#pragma nounroll + for (int i = 0; i < 10; i++) { + SALSA_CORE(state); } - V.lo ^= V.hi ^ tmpblock; + return (X + state); +} - ((uint8*)inout)[0] = V.lo; +__device__ __forceinline__ +uint4 chacha_small_parallel_rnd(const uint4 X) +{ + uint4 state = X; + +#pragma nounroll + for (int i = 0; i < 10; i++) { + CHACHA_CORE_PARALLEL(state); + } + return (X + state); +} + +__device__ __forceinline__ +void neoscrypt_chacha(uint4 XV[4]) +{ + uint4 temp; + + XV[0] = chacha_small_parallel_rnd(XV[0] ^ 
XV[3]); + temp = chacha_small_parallel_rnd(XV[1] ^ XV[0]); + XV[1] = chacha_small_parallel_rnd(XV[2] ^ temp); + XV[3] = chacha_small_parallel_rnd(XV[3] ^ XV[1]); + XV[2] = temp; } +__device__ __forceinline__ +void neoscrypt_salsa(uint4 XV[4]) +{ + uint4 temp; + + XV[0] = salsa_small_scalar_rnd(XV[0] ^ XV[3]); + temp = salsa_small_scalar_rnd(XV[1] ^ XV[0]); + XV[1] = salsa_small_scalar_rnd(XV[2] ^ temp); + XV[3] = salsa_small_scalar_rnd(XV[3] ^ XV[1]); + XV[2] = temp; +} + + +#if __CUDA_ARCH__ < 500 static __forceinline__ __device__ -void fastkdf256(const uint32_t* password, uint8_t* output) +void fastkdf256_v1(const uint32_t thread, const uint32_t nonce, uint32_t* const s_data) { - uint8_t bufidx = 0; + uint2x4 output[8]; uchar4 bufhelper; - uint8_t A[320],B[288]; + uint32_t* B = (uint32_t*)&s_data[threadIdx.x * 64U]; + uint32_t qbuf, rbuf, bitbuf; + uint32_t input[BLAKE2S_BLOCK_SIZE / 4]; + uint32_t key[BLAKE2S_BLOCK_SIZE / 4] = { 0 }; - ((uintx64*)A)[0] = ((uintx64*)password)[0]; - ((uint816 *)A)[4] = ((uint816 *)password)[0]; + const uint32_t data18 = c_data[18]; + const uint32_t data20 = c_data[0]; - ((uintx64*)B)[0] = ((uintx64*)password)[0]; - ((uint48 *)B)[8] = ((uint48 *)password)[0]; - - uint32_t input[BLAKE2S_BLOCK_SIZE/4]; uint32_t key[BLAKE2S_BLOCK_SIZE / 4] = { 0 }; + ((uintx64*)(B))[0] = ((uintx64*)c_data)[0]; + ((uint32_t*)B)[19] = nonce; + ((uint32_t*)B)[39] = nonce; + ((uint32_t*)B)[59] = nonce; ((uint816*)input)[0] = ((uint816*)input_init)[0]; - ((uint48*)key)[0] = ((uint48*)key_init)[0]; + ((uint4x2*)key)[0] = ((uint4x2*)key_init)[0]; - for (int i = 0; i < 32; ++i) +#pragma unroll 1 + for (int i = 0; i < 31; i++) { - bufhelper = ((uchar4*)input)[0]; - for (int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) - bufhelper += ((uchar4*)input)[x]; - bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; - - int qbuf = bufidx/4; - int rbuf = bufidx&3; - int bitbuf = rbuf << 3; - uint32_t shifted[9]; + uint32_t bufidx = 0; +#pragma unroll + for (int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; - shift256R(shifted, ((uint8*)input)[0], bitbuf); + uint32_t shifted[9]; + shift256R4(shifted, ((uint8*)input)[0], bitbuf); - for (int k = 0; k < 9; ++k) { - ((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k]; + uint32_t temp[9]; + //#pragma unroll + for (int k = 0; k < 9; k++) + { + uint32_t indice = (k + qbuf) & 0x3f; + temp[k] = B[indice] ^ shifted[k]; + B[indice] = temp[k]; + } +#if __CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__) + uint32_t a = c_data[qbuf & 0x3f], b; + //#pragma unroll + for (int k = 0; k<16; k += 2) + { + b = c_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = c_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); } - if (bufidx < BLAKE2S_KEY_SIZE) {((uint8*)B)[8] = ((uint8*)B)[0];} - else if (bufidx > FASTKDF_BUFFER_SIZE-BLAKE2S_OUT_SIZE) {((uint8*)B)[0] = ((uint8*)B)[8];} - - if (i<31) { - for (int k = 0; k > 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; } - for (int i = 4*qleft; i < 4*qleft+rleft; ++i) { - output[i] = (B + bufidx)[i] ^ A[i]; + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + +#if __CUDA_ARCH__ >= 320 + 
for (int i = 0; i<64; i++) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(((uint32_t*)output)[i]) : "r"(B[(qbuf + i) & 0x3f]), "r"(B[(qbuf + i + 1) & 0x3f]), "r"(bitbuf)); +#endif + + ((ulonglong4*)output)[0] ^= ((ulonglong4*)input)[0]; + ((uintx64*)output)[0] ^= ((uintx64*)c_data)[0]; + ((uint32_t*)output)[19] ^= nonce; + ((uint32_t*)output)[39] ^= nonce; + ((uint32_t*)output)[59] ^= nonce; + + for (int i = 0; i<8; i++) + (Input + 8U * thread)[i] = output[i]; +} +#endif + +#if __CUDA_ARCH__ >= 500 +static __forceinline__ __device__ +void fastkdf256_v2(const uint32_t thread, const uint32_t nonce, uint32_t* const s_data) +{ + const uint32_t data18 = c_data[18]; + const uint32_t data20 = c_data[0]; + uint32_t input[16]; + uint32_t key[16] = { 0 }; + uint32_t qbuf, rbuf, bitbuf; + + uint32_t* B = (uint32_t*)&s_data[threadIdx.x * 64U]; + ((uintx64*)(B))[0] = ((uintx64*)c_data)[0]; + + B[19] = nonce; + B[39] = nonce; + B[59] = nonce; + + { + uint32_t bufidx = 0; +#pragma unroll + for (int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input_init[x] & 0x00ff00ff) + ((input_init[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + uint32_t temp[9]; + + uint32_t shifted; + uint32_t shift = 32U - bitbuf; + asm("shl.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input_init[0]), "r"(bitbuf)); + temp[0] = B[(0 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[0]), "r"(input_init[1]), "r"(shift)); + temp[1] = B[(1 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[1]), "r"(input_init[2]), "r"(shift)); + temp[2] = B[(2 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[2]), "r"(input_init[3]), "r"(shift)); + temp[3] = B[(3 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[3]), "r"(input_init[4]), "r"(shift)); + temp[4] = B[(4 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[4]), "r"(input_init[5]), "r"(shift)); + temp[5] = B[(5 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[5]), "r"(input_init[6]), "r"(shift)); + temp[6] = B[(6 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[6]), "r"(input_init[7]), "r"(shift)); + temp[7] = B[(7 + qbuf) & 0x3f] ^ shifted; + asm("shr.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input_init[7]), "r"(shift)); + temp[8] = B[(8 + qbuf) & 0x3f] ^ shifted; + + uint32_t a = c_data[qbuf & 0x3f], b; + +#pragma unroll + for (int k = 0; k<16; k += 2) + { + b = c_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = c_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19 - qbuf % 20U; + if (noncepos <= 16U && qbuf < 60U) + { + if (noncepos) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if (noncepos != 16U) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : 
"=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + + Blake2S_v2(input, input, key); + +#pragma unroll + for (int k = 0; k < 9; k++) + B[(k + qbuf) & 0x3f] = temp[k]; } - for (int i = qleft*4+rleft; i < (qleft+1)*4; ++i) { - ((uint8_t *)output)[i] = ((uint8_t *)B)[i - left] ^ ((uint8_t *)A)[i]; + + for (int i = 1; i < 31; i++) + { + uint32_t bufidx = 0; +#pragma unroll + for (int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + uint32_t temp[9]; + + uint32_t shifted; + uint32_t shift = 32U - bitbuf; + asm("shl.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input[0]), "r"(bitbuf)); + temp[0] = B[(0 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[0]), "r"(input[1]), "r"(shift)); + temp[1] = B[(1 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[1]), "r"(input[2]), "r"(shift)); + temp[2] = B[(2 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[2]), "r"(input[3]), "r"(shift)); + temp[3] = B[(3 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[3]), "r"(input[4]), "r"(shift)); + temp[4] = B[(4 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[4]), "r"(input[5]), "r"(shift)); + temp[5] = B[(5 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[5]), "r"(input[6]), "r"(shift)); + temp[6] = B[(6 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[6]), "r"(input[7]), "r"(shift)); + temp[7] = B[(7 + qbuf) & 0x3f] ^ shifted; + asm("shr.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input[7]), "r"(shift)); + temp[8] = B[(8 + qbuf) & 0x3f] ^ shifted; + + uint32_t a = c_data[qbuf & 0x3f], b; + +#pragma unroll + for (int k = 0; k<16; k += 2) + { + b = c_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = c_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19 - qbuf % 20U; + if (noncepos <= 16U && qbuf < 60U) + { + if (noncepos) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if (noncepos != 16U) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : 
"r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + + Blake2S_v2(input, input, key); + +#pragma unroll + for (int k = 0; k < 9; k++) + B[(k + qbuf) & 0x3f] = temp[k]; + } + + { + uint32_t bufidx = 0; +#pragma unroll + for (int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; } - for (int i = qleft+1; i < FASTKDF_BUFFER_SIZE/4; ++i) { - ((uchar4 *)output)[i] = make_uchar4(B[4*i - left],B[4*i+1-left], - B[4*i+2-left],B[4*i+3-left]) ^ ((uchar4 *)A)[i]; + + uint2x4 output[8]; + for (int i = 0; i<64; i++) { + const uint32_t a = (qbuf + i) & 0x3f, b = (qbuf + i + 1) & 0x3f; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(((uint32_t*)output)[i]) : "r"(B[a]), "r"(B[b]), "r"(bitbuf)); } + + output[0] ^= ((uint2x4*)input)[0]; +#pragma unroll + for (int i = 0; i<8; i++) + output[i] ^= ((uint2x4*)c_data)[i]; + + ((uint32_t*)output)[19] ^= nonce; + ((uint32_t*)output)[39] ^= nonce; + ((uint32_t*)output)[59] ^= nonce;; + ((ulonglong16 *)(Input + 8U * thread))[0] = ((ulonglong16*)output)[0]; } +#endif +#if __CUDA_ARCH__ < 500 static __forceinline__ __device__ -void fastkdf32(const uint32_t * password, const uint32_t * salt, uint32_t * output) +uint32_t fastkdf32_v1(uint32_t thread, const uint32_t nonce, uint32_t* const salt, uint32_t* const s_data) { - uint8_t bufidx = 0; - uchar4 bufhelper; + const uint32_t cdata7 = c_data[7]; + const uint32_t data18 = c_data[18]; + const uint32_t data20 = c_data[0]; - uint8_t A[320]; - uint8_t B[288]; + uint32_t* B0 = (uint32_t*)&s_data[threadIdx.x * 64U]; + ((uintx64*)B0)[0] = ((uintx64*)salt)[0]; - // Initialize the password buffer - ((uintx64*)A)[0] = ((uintx64*)password)[0]; - ((uint816*)A)[4] = ((uint816*)password)[0]; - ((uintx64*)B)[0] = ((uintx64*)salt)[0]; - ((uintx64*)B)[1] = ((uintx64*)salt)[0]; + uint32_t input[BLAKE2S_BLOCK_SIZE / 4]; + ((uint816*)input)[0] = ((uint816*)c_data)[0]; - uint32_t input[BLAKE2S_BLOCK_SIZE/4]; - uint32_t key[BLAKE2S_BLOCK_SIZE/4] = { 0 }; + uint32_t key[BLAKE2S_BLOCK_SIZE / 4]; + ((uint4x2*)key)[0] = ((uint4x2*)salt)[0]; + ((uint4*)key)[2] = make_uint4(0, 0, 0, 0); + ((uint4*)key)[3] = make_uint4(0, 0, 0, 0); - ((uint816*)input)[0] = ((uint816*)password)[0]; - ((uint48*)key)[0] = ((uint48*)salt)[0]; + uint32_t qbuf, rbuf, bitbuf; + uint32_t temp[9]; - for (int i = 0; i < 32; ++i) +#pragma nounroll + for (int i = 0; i < 31; i++) { - Blake2S((uint32_t*)input, key); + Blake2S(input, input, key); - bufidx = 0; - bufhelper = ((uchar4*)input)[0]; + uint32_t bufidx = 0; +#pragma unroll + for (int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + uint32_t shifted[9]; - for (int x = 1; x < 
BLAKE2S_OUT_SIZE / 4; ++x) - bufhelper += ((uchar4*)input)[x]; + shift256R4(shifted, ((uint8*)input)[0], bitbuf); - bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; - int qbuf = bufidx / 4; - int rbuf = bufidx & 3; - int bitbuf = rbuf << 3; - uint32_t shifted[9]; + for (int k = 0; k < 9; k++) { + temp[k] = B0[(k + qbuf) & 0x3f]; + } - shift256R(shifted, ((uint8*)input)[0], bitbuf); + ((uint2x4*)temp)[0] ^= ((uint2x4*)shifted)[0]; + temp[8] ^= shifted[8]; - for (int k = 0; k < 9; ++k) { - ((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k]; +#if __CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__) + uint32_t a = c_data[qbuf & 0x3f], b; + //#pragma unroll + for (int k = 0; k<16; k += 2) + { + b = c_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = c_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); } - if (i<31) { - if (bufidx < BLAKE2S_KEY_SIZE) {((uint8*)B)[8] = ((uint8*)B)[0];} - else if (bufidx > FASTKDF_BUFFER_SIZE - BLAKE2S_OUT_SIZE) {((uint8*)B)[0] = ((uint8*)B)[8];} - - for (uint8_t k = 0; k < BLAKE2S_BLOCK_SIZE/4; k++) { - ((uchar4*)(input))[k] = make_uchar4( - (A + bufidx)[4 * k], (A + bufidx)[4 * k + 1], - (A + bufidx)[4 * k + 2], (A + bufidx)[4 * k + 3] - ); - } - for (uint8_t k = 0; k < BLAKE2S_KEY_SIZE / 4; k++) { - ((uchar4*)(key))[k] = make_uchar4( - (B + bufidx)[4 * k], (B + bufidx)[4 * k + 1], - (B + bufidx)[4 * k + 2], (B + bufidx)[4 * k + 3] - ); - } + const uint32_t noncepos = 19U - qbuf % 20U; + if (noncepos <= 16U && qbuf < 60U) + { + if (noncepos != 0) asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if (noncepos != 16U) asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); } - } - uchar4 unfucked[1]; - unfucked[0] = make_uchar4(B[28 + bufidx], B[29 + bufidx],B[30 + bufidx], B[31 + bufidx]); - ((uint32_t*)output)[7] = ((uint32_t*)unfucked)[0] ^ ((uint32_t*)A)[7]; -} + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); +#else + //#error SM 3.0 code missing here +#endif + for (int k = 0; k < 9; k++) { + B0[(k + qbuf) & 0x3f] = temp[k]; + } + } + Blake2S(input, input, key); -#define SALSA(a,b,c,d) { \ - t =a+d; b^=rotateL(t, 7); \ - t =b+a; c^=rotateL(t, 9); \ - t =c+b; d^=rotateL(t, 13); \ - t =d+c; a^=rotateL(t, 18); \ -} + uint32_t bufidx = 0; +#pragma unroll + for (int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; -#define SALSA_CORE(state) { \ - 
SALSA(state.s0,state.s4,state.s8,state.sc); \ - SALSA(state.s5,state.s9,state.sd,state.s1); \ - SALSA(state.sa,state.se,state.s2,state.s6); \ - SALSA(state.sf,state.s3,state.s7,state.sb); \ - SALSA(state.s0,state.s1,state.s2,state.s3); \ - SALSA(state.s5,state.s6,state.s7,state.s4); \ - SALSA(state.sa,state.sb,state.s8,state.s9); \ - SALSA(state.sf,state.sc,state.sd,state.se); \ -} + for (int k = 7; k < 9; k++) { + temp[k] = B0[(k + qbuf) & 0x3f]; + } -#if __CUDA_ARCH__ >=500 -#define CHACHA_STEP(a,b,c,d) { \ - a += b; d = __byte_perm(d^a,0,0x1032); \ - c += d; b = rotateL(b^c, 12); \ - a += b; d = __byte_perm(d^a,0,0x2103); \ - c += d; b = rotateL(b^c, 7); \ -} + uint32_t output; +#if __CUDA_ARCH__ >= 320 + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(output) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); #else -#define CHACHA_STEP(a,b,c,d) { \ - a += b; d = rotateL(d^a,16); \ - c += d; b = rotateL(b^c, 12); \ - a += b; d = rotateL(d^a,8); \ - c += d; b = rotateL(b^c, 7); \ + output = (MAKE_ULONGLONG(temp[7], temp[8]) >> bitbuf); // to check maybe 7/8 reversed +#endif + output ^= input[7] ^ cdata7; + return output; } #endif -#define CHACHA_CORE_PARALLEL(state) { \ - CHACHA_STEP(state.lo.s0, state.lo.s4, state.hi.s0, state.hi.s4); \ - CHACHA_STEP(state.lo.s1, state.lo.s5, state.hi.s1, state.hi.s5); \ - CHACHA_STEP(state.lo.s2, state.lo.s6, state.hi.s2, state.hi.s6); \ - CHACHA_STEP(state.lo.s3, state.lo.s7, state.hi.s3, state.hi.s7); \ - CHACHA_STEP(state.lo.s0, state.lo.s5, state.hi.s2, state.hi.s7); \ - CHACHA_STEP(state.lo.s1, state.lo.s6, state.hi.s3, state.hi.s4); \ - CHACHA_STEP(state.lo.s2, state.lo.s7, state.hi.s0, state.hi.s5); \ - CHACHA_STEP(state.lo.s3, state.lo.s4, state.hi.s1, state.hi.s6); \ -} +#if __CUDA_ARCH__ >= 500 +static __forceinline__ __device__ +uint32_t fastkdf32_v3(uint32_t thread, const uint32_t nonce, uint32_t* const salt, uint32_t* const s_data) +{ + const uint32_t cdata7 = c_data[7]; + const uint32_t data18 = c_data[18]; + const uint32_t data20 = c_data[0]; + uint32_t* B0 = (uint32_t*)&s_data[threadIdx.x * 64U]; + ((uintx64*)B0)[0] = ((uintx64*)salt)[0]; -static __forceinline__ __device__ uint16 salsa_small_scalar_rnd(const uint16 &X) -{ - uint16 state = X; - uint32_t t; + uint32_t input[BLAKE2S_BLOCK_SIZE / 4]; + ((uint816*)input)[0] = ((uint816*)c_data)[0]; - for (int i = 0; i < 10; ++i) { SALSA_CORE(state);} + uint32_t key[BLAKE2S_BLOCK_SIZE / 4]; + ((uint4x2*)key)[0] = ((uint4x2*)salt)[0]; + ((uint4*)key)[2] = make_uint4(0, 0, 0, 0); + ((uint4*)key)[3] = make_uint4(0, 0, 0, 0); - return(X + state); -} + uint32_t qbuf, rbuf, bitbuf; + uint32_t temp[9]; -static __device__ __forceinline__ uint16 chacha_small_parallel_rnd(const uint16 &X) -{ - uint16 st = X; +#pragma nounroll + for (int i = 0; i < 31; i++) + { + Blake2S_v2(input, input, key); + + uint32_t bufidx = 0; +#pragma unroll + for (int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + uint32_t shifted; + uint32_t shift = 32U - bitbuf; + asm("shl.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input[0]), "r"(bitbuf)); + temp[0] = B0[(0 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[0]), "r"(input[1]), "r"(shift)); + temp[1] = B0[(1 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[1]), 
"r"(input[2]), "r"(shift)); + temp[2] = B0[(2 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[2]), "r"(input[3]), "r"(shift)); + temp[3] = B0[(3 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[3]), "r"(input[4]), "r"(shift)); + temp[4] = B0[(4 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[4]), "r"(input[5]), "r"(shift)); + temp[5] = B0[(5 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[5]), "r"(input[6]), "r"(shift)); + temp[6] = B0[(6 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[6]), "r"(input[7]), "r"(shift)); + temp[7] = B0[(7 + qbuf) & 0x3f] ^ shifted; + asm("shr.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input[7]), "r"(shift)); + temp[8] = B0[(8 + qbuf) & 0x3f] ^ shifted; + + uint32_t a = c_data[qbuf & 0x3f], b; +#pragma unroll + for (int k = 0; k<16; k += 2) + { + b = c_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = c_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19U - qbuf % 20U; + if (noncepos <= 16U && qbuf < 60U) + { + if (noncepos != 0) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if (noncepos != 16U) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } - for (int i = 0; i < 10; ++i) {CHACHA_CORE_PARALLEL(st);} - return(X + st); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + +#pragma unroll + for (int k = 0; k < 9; k++) + { + B0[(k + qbuf) & 0x3f] = temp[k]; + } + } + + Blake2S_v2(input, input, key); + + uint32_t bufidx = 0; +#pragma unroll + for (int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + temp[7] = B0[(qbuf + 7) & 0x3f]; + temp[8] = B0[(qbuf + 8) & 0x3f]; + + uint32_t output; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(output) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + output ^= input[7] ^ cdata7; + return output; } +#endif -static __device__ __forceinline__ void neoscrypt_chacha(uint16 *XV) -{ - XV[0] ^= XV[3]; - uint16 temp; - XV[0] = chacha_small_parallel_rnd(XV[0]); XV[1] ^= XV[0]; - temp = chacha_small_parallel_rnd(XV[1]); XV[2] ^= temp; - XV[1] = chacha_small_parallel_rnd(XV[2]); XV[3] ^= XV[1]; - XV[3] = chacha_small_parallel_rnd(XV[3]); - XV[2] = temp; 
+#define BLAKE_Ghost(idx0, idx1, a, b, c, d, key) { \ + idx = BLAKE2S_SIGMA_host[idx0][idx1]; a += key[idx]; \ + a += b; d = ROTR32(d^a,16); \ + c += d; b = ROTR32(b^c, 12); \ + idx = BLAKE2S_SIGMA_host[idx0][idx1+1]; a += key[idx]; \ + a += b; d = ROTR32(d^a,8); \ + c += d; b = ROTR32(b^c, 7); \ } -static __device__ __forceinline__ void neoscrypt_salsa(uint16 *XV) +static void Blake2Shost(uint32_t * inout, const uint32_t * inkey) { - XV[0] ^= XV[3]; - uint16 temp; + uint16 V; + uint32_t idx; + uint8 tmpblock; - XV[0] = salsa_small_scalar_rnd(XV[0]); XV[1] ^= XV[0]; - temp = salsa_small_scalar_rnd(XV[1]); XV[2] ^= temp; - XV[1] = salsa_small_scalar_rnd(XV[2]); XV[3] ^= XV[1]; - XV[3] = salsa_small_scalar_rnd(XV[3]); - XV[2] = temp; -} + V.hi = BLAKE2S_IV_Vechost; + V.lo = BLAKE2S_IV_Vechost; + V.lo.s0 ^= 0x01012020; + // Copy input block for later + tmpblock = V.lo; -#define SHIFT 130 + V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; -__global__ __launch_bounds__(128, 1) -void neoscrypt_gpu_hash_k0(uint32_t threads, uint32_t startNonce, int stratum) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); -// if (thread < threads) + for (int x = 0; x < 10; ++x) { - uint32_t data[80]; - uint16 X[4]; - uint32_t shift = thread * SHIFT * 16; - const uint32_t nonce = startNonce + thread; - - for (int i = 0; i<20; i++) { - ((uint4*)data)[i] = ((uint4 *)c_data)[i]; - } //ld.local.v4 - data[19] = (stratum) ? cuda_swab32(nonce) : nonce; //freaking morons !!! - data[39] = data[19]; - data[59] = data[19]; - - fastkdf256(data, (uint8_t*)X); - - ((uintx64 *)(W + shift))[0] = ((uintx64 *)X)[0]; -// ((ulonglong16 *)(W + shift))[0] = ((ulonglong16 *)X)[0]; + BLAKE_Ghost(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inkey); + BLAKE_Ghost(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inkey); + BLAKE_Ghost(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inkey); + BLAKE_Ghost(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inkey); + BLAKE_Ghost(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inkey); + BLAKE_Ghost(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inkey); + BLAKE_Ghost(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inkey); + BLAKE_Ghost(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inkey); } + + V.lo ^= V.hi; + V.lo ^= tmpblock; + + V.hi = BLAKE2S_IV_Vechost; + tmpblock = V.lo; + + V.hi.s4 ^= 128; + V.hi.s6 = ~V.hi.s6; + + for (int x = 0; x < 10; ++x) + { + BLAKE_Ghost(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_Ghost(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_Ghost(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_Ghost(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_Ghost(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_Ghost(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_Ghost(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_Ghost(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + } + + V.lo ^= V.hi ^ tmpblock; + + ((uint8*)inout)[0] = V.lo; } -__global__ __launch_bounds__(128, 1) -void neoscrypt_gpu_hash_k01(uint32_t threads, uint32_t startNonce) + +#define SHIFT 128U +#define TPB 32 +#define TPB2 64 + +__global__ +__launch_bounds__(TPB2, 1) +void neoscrypt_gpu_hash_start(const int stratum, const uint32_t startNonce) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); -// if (thread < threads) - { - uint16 X[4]; - uint32_t shift = thread * SHIFT * 16; - ((uintx64 *)X)[0]= ldg256(&(W + shift)[0]); + __shared__ uint32_t s_data[64 * TPB2]; - //#pragma unroll - for (int i = 0; i < 128; ++i) - { - 
neoscrypt_chacha(X); - ((ulonglong16 *)(W + shift))[i+1] = ((ulonglong16 *)X)[0]; -// ((uintx64 *)(W + shift))[i + 1] = ((uintx64 *)X)[0]; - } - } + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t nonce = startNonce + thread; + const uint32_t ZNonce = (stratum) ? cuda_swab32(nonce) : nonce; //freaking morons !!! + + __syncthreads(); +#if __CUDA_ARCH__ < 500 + fastkdf256_v1(thread, ZNonce, s_data); +#else + fastkdf256_v2(thread, ZNonce, s_data); +#endif } -__global__ __launch_bounds__(128, 1) -void neoscrypt_gpu_hash_k2(uint32_t threads, uint32_t startNonce) +__global__ +__launch_bounds__(TPB, 1) +void neoscrypt_gpu_hash_chacha1() { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); -// if (thread < threads) - { - uint16 X[4]; - uint32_t shift = thread * SHIFT * 16; - ((uintx64 *)X)[0] = ldg256(&(W + shift)[2048]); + const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y); + const uint32_t shift = SHIFT * 8U * (thread & 8191); + const uint32_t shiftTr = 8U * thread; - for (int t = 0; t < 128; t++) - { - int idx = X[3].lo.s0 & 0x7F; - ((uintx64 *)X)[0] ^= ldg256(&(W + shift)[idx << 4]); - neoscrypt_chacha(X); + uint4 X[4]; + for (int i = 0; i < 4; i++) + { + X[i].x = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + 0 * 4 + threadIdx.x); + X[i].y = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + 1 * 4 + threadIdx.x); + X[i].z = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + 2 * 4 + threadIdx.x); + X[i].w = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + 3 * 4 + threadIdx.x); + } - } - ((uintx64 *)(W + shift))[129] = ((uintx64*)X)[0]; // best checked +#pragma nounroll + for (int i = 0; i < 128; i++) + { + uint32_t offset = shift + i * 8U; + for (int j = 0; j < 4; j++) + ((uint4*)(W + offset))[j * 4 + threadIdx.x] = X[j]; + neoscrypt_chacha(X); + } +#pragma nounroll + for (int t = 0; t < 128; t++) + { + uint32_t offset = shift + (WarpShuffle(X[3].x, 0, 4) & 0x7F) * 8U; + for (int j = 0; j < 4; j++) + X[j] ^= ((uint4*)(W + offset))[j * 4 + threadIdx.x]; + neoscrypt_chacha(X); + } +#pragma unroll + for (int i = 0; i < 4; i++) + { + *((uint32_t*)&(Tr + shiftTr)[i * 2] + 0 * 4 + threadIdx.x) = X[i].x; + *((uint32_t*)&(Tr + shiftTr)[i * 2] + 1 * 4 + threadIdx.x) = X[i].y; + *((uint32_t*)&(Tr + shiftTr)[i * 2] + 2 * 4 + threadIdx.x) = X[i].z; + *((uint32_t*)&(Tr + shiftTr)[i * 2] + 3 * 4 + threadIdx.x) = X[i].w; } } -__global__ __launch_bounds__(128, 1) -void neoscrypt_gpu_hash_k3(uint32_t threads, uint32_t startNonce) +__global__ +__launch_bounds__(TPB, 1) +void neoscrypt_gpu_hash_salsa1() { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); -// if (thread < threads) + const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y); + const uint32_t shift = SHIFT * 8U * (thread & 8191); + const uint32_t shiftTr = 8U * thread; + + uint4 Z[4]; + for (int i = 0; i < 4; i++) { - uint32_t shift = thread * SHIFT * 16; - uint16 Z[4]; + Z[i].x = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + ((0 + threadIdx.x) & 3) * 4 + threadIdx.x); + Z[i].y = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + ((1 + threadIdx.x) & 3) * 4 + threadIdx.x); + Z[i].z = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + ((2 + threadIdx.x) & 3) * 4 + threadIdx.x); + Z[i].w = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + ((3 + threadIdx.x) & 3) * 4 + threadIdx.x); + } - ((uintx64*)Z)[0] = ldg256(&(W + shift)[0]); +#pragma nounroll + for (int i = 0; i < 128; i++) + { + uint32_t offset = shift + i * 8U; + for (int j = 0; j < 4; j++) + ((uint4*)(W + offset))[j * 4 + threadIdx.x] = Z[j]; + 
neoscrypt_salsa(Z); + } - //#pragma unroll - for (int i = 0; i < 128; ++i) { - neoscrypt_salsa(Z); - ((ulonglong16 *)(W + shift))[i+1] = ((ulonglong16 *)Z)[0]; -// ((uintx64 *)(W + shift))[i + 1] = ((uintx64 *)Z)[0]; - } +#pragma nounroll + for (int t = 0; t < 128; t++) + { + uint32_t offset = shift + (WarpShuffle(Z[3].x, 0, 4) & 0x7F) * 8U; + for (int j = 0; j < 4; j++) + Z[j] ^= ((uint4*)(W + offset))[j * 4 + threadIdx.x]; + neoscrypt_salsa(Z); + } +#pragma unroll + for (int i = 0; i < 4; i++) + { + *((uint32_t*)&(Tr2 + shiftTr)[i * 2] + ((0 + threadIdx.x) & 3) * 4 + threadIdx.x) = Z[i].x; + *((uint32_t*)&(Tr2 + shiftTr)[i * 2] + ((1 + threadIdx.x) & 3) * 4 + threadIdx.x) = Z[i].y; + *((uint32_t*)&(Tr2 + shiftTr)[i * 2] + ((2 + threadIdx.x) & 3) * 4 + threadIdx.x) = Z[i].z; + *((uint32_t*)&(Tr2 + shiftTr)[i * 2] + ((3 + threadIdx.x) & 3) * 4 + threadIdx.x) = Z[i].w; } } -__global__ __launch_bounds__(128, 1) -void neoscrypt_gpu_hash_k4(uint32_t threads, uint32_t startNonce, uint32_t *nonceRes, int stratum) +__global__ +__launch_bounds__(TPB2, 8) +void neoscrypt_gpu_hash_ending(const int stratum, const uint32_t startNonce, uint32_t *resNonces) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - const uint32_t nonce = startNonce + thread; + __shared__ uint32_t s_data[64 * TPB2]; - uint32_t shift = thread * SHIFT * 16; - uint16 Z[4]; - uint32_t outbuf[8]; - uint32_t data[80]; + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t shiftTr = thread * 8U; + const uint32_t nonce = startNonce + thread; + const uint32_t ZNonce = (stratum) ? cuda_swab32(nonce) : nonce; - for (int i=0; i<20; i++) { - ((uint4*)data)[i] = ((uint4 *)c_data)[i]; - } + __syncthreads(); - data[19] = (stratum) ? cuda_swab32(nonce) : nonce; - data[39] = data[19]; - data[59] = data[19]; - ((uintx64 *)Z)[0] = ldg256(&(W + shift)[2048]); - for (int t = 0; t < 128; t++) - { - int idx = Z[3].lo.s0 & 0x7F; - ((uintx64 *)Z)[0] ^= ldg256(&(W + shift)[idx << 4]); - neoscrypt_salsa(Z); - } - ((uintx64 *)Z)[0] ^= ldg256(&(W + shift)[2064]); - fastkdf32(data, (uint32_t*)Z, outbuf); -#if __CUDA_ARCH__ < 320 - // workaround required when using SM 3.0 shift256R() func (tested on SM 5.0) - if (thread == 0) - printf("", outbuf[7]); + uint2x4 Z[8]; +#pragma unroll + for (int i = 0; i<8; i++) + Z[i] = __ldg4(&(Tr2 + shiftTr)[i]) ^ __ldg4(&(Tr + shiftTr)[i]); + +#if __CUDA_ARCH__ < 500 + uint32_t outbuf = fastkdf32_v1(thread, ZNonce, (uint32_t*)Z, s_data); +#else + uint32_t outbuf = fastkdf32_v3(thread, ZNonce, (uint32_t*)Z, s_data); #endif - if (outbuf[7] <= pTarget[7]) { - atomicMin(nonceRes, nonce); // init val is UINT32_MAX - } + + if (outbuf <= c_target[1]) + { + resNonces[0] = nonce; + //uint32_t tmp = atomicExch(resNonces, nonce); + //if(tmp != UINT32_MAX) + // resNonces[1] = tmp; } } +static __thread uint32_t *hash1 = NULL; +static __thread uint32_t *Trans1 = NULL; +static __thread uint32_t *Trans2 = NULL; // 2 streams +static __thread uint32_t *Trans3 = NULL; // 2 streams + __host__ -void neoscrypt_cpu_init(int thr_id, uint32_t threads) +void neoscrypt_init_2stream(int thr_id, uint32_t threads) { - cuda_get_arch(thr_id); - cudaMalloc(&d_NNonce[thr_id], sizeof(uint32_t)); - CUDA_SAFE_CALL(cudaMalloc(&d_buffer[thr_id], (size_t) 256 * SHIFT * threads)); - cudaMemcpyToSymbol(W, &d_buffer[thr_id], sizeof(uint4*), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(BLAKE2S_SIGMA, BLAKE2S_SIGMA_host, sizeof(BLAKE2S_SIGMA_host), 0, cudaMemcpyHostToDevice); + 
CUDA_SAFE_CALL(cudaMalloc(&d_NNonce[thr_id], 2 * sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMalloc(&hash1, 32 * 128 * sizeof(uint64_t) * min(8192, threads))); + CUDA_SAFE_CALL(cudaMalloc(&Trans1, 32 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&Trans2, 32 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&Trans3, 32 * sizeof(uint64_t) * threads)); + + CUDA_SAFE_CALL(cudaMemcpyToSymbol(W, &hash1, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(Tr, &Trans1, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(Tr2, &Trans2, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(Input, &Trans3, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice)); } __host__ -void neoscrypt_cpu_free(int thr_id) +void neoscrypt_free_2stream(int thr_id) { cudaFree(d_NNonce[thr_id]); - cudaFree(d_buffer[thr_id]); + + cudaFree(hash1); + cudaFree(Trans1); + cudaFree(Trans2); + cudaFree(Trans3); + } __host__ -uint32_t neoscrypt_cpu_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, int have_stratum, int order) +void neoscrypt_hash_k4_2stream(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum) { - uint32_t result[MAX_GPUS]; - memset(result, 0xff, sizeof(result)); - cudaMemset(d_NNonce[thr_id], 0xff, sizeof(uint32_t)); + CUDA_SAFE_CALL(cudaMemset(d_NNonce[thr_id], 0xff, 2 * sizeof(uint32_t))); - const uint32_t threadsperblock = 128; + const int threadsperblock = TPB; dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - neoscrypt_gpu_hash_k0 <<< grid, block >>>(threads, startNounce, have_stratum); - neoscrypt_gpu_hash_k01 <<< grid, block >>>(threads, startNounce); - neoscrypt_gpu_hash_k2 <<< grid, block >>>(threads, startNounce); - neoscrypt_gpu_hash_k3 <<< grid, block >>>(threads, startNounce); - neoscrypt_gpu_hash_k4 <<< grid, block >>>(threads, startNounce, d_NNonce[thr_id], have_stratum); + const int threadsperblock2 = TPB2; + dim3 grid2((threads + threadsperblock2 - 1) / threadsperblock2); + dim3 block2(threadsperblock2); - MyStreamSynchronize(NULL, order, thr_id); - cudaMemcpy(&result[thr_id], d_NNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + dim3 grid3((threads * 4 + threadsperblock - 1) / threadsperblock); + dim3 block3(4, threadsperblock >> 2); - return result[thr_id]; + neoscrypt_gpu_hash_start <<< grid2, block2 >>> (stratum, startNounce); //fastkdf + + neoscrypt_gpu_hash_salsa1 <<< grid3, block3 >>> (); + neoscrypt_gpu_hash_chacha1 <<< grid3, block3 >>> (); + + neoscrypt_gpu_hash_ending <<< grid2, block2 >>> (stratum, startNounce, d_NNonce[thr_id]); //fastkdf+end + + CUDA_SAFE_CALL(cudaMemcpy(resNonces, d_NNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); } __host__ -void neoscrypt_setBlockTarget(uint32_t* pdata, const void *target) +void neoscrypt_setBlockTarget(uint32_t* const pdata, uint32_t* const target) { - unsigned char PaddedMessage[80*4]; //bring balance to the force + uint32_t PaddedMessage[64]; uint32_t input[16], key[16] = { 0 }; - memcpy(PaddedMessage, pdata, 80); - memcpy(PaddedMessage + 80, pdata, 80); - memcpy(PaddedMessage + 160, pdata, 80); - memcpy(PaddedMessage + 240, pdata, 80); + for (int i = 0; i < 19; i++) + { + PaddedMessage[i] = pdata[i]; + PaddedMessage[i + 20] = pdata[i]; + PaddedMessage[i + 40] = pdata[i]; + } + for (int i = 0; i<4; i++) + PaddedMessage[i + 60] = pdata[i]; + + PaddedMessage[19] = 0; + PaddedMessage[39] = 0; + PaddedMessage[59] = 0; ((uint16*)input)[0] = ((uint16*)pdata)[0]; ((uint8*)key)[0] = 
((uint8*)pdata)[0]; - Blake2Shost(input,key); + Blake2Shost(input, key); - cudaMemcpyToSymbol(pTarget, target, 32, 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(input_init, input, sizeof(input), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(key_init, key, sizeof(key), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(input_init, input, 64, 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(key_init, key, 64, 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(c_data, PaddedMessage, 80*4, 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_target, &target[6], 2 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_data, PaddedMessage, 64 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice); + CUDA_SAFE_CALL(cudaGetLastError()); } diff --git a/neoscrypt/cuda_vectors.h b/neoscrypt/cuda_vectors.h index 08fc0ee..3799b74 100644 --- a/neoscrypt/cuda_vectors.h +++ b/neoscrypt/cuda_vectors.h @@ -482,7 +482,7 @@ static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift // require a uint32_t[9] ret array // note: djm neoscrypt implementation is near the limits of gpu capabilities // and weird behaviors can happen when tuning device functions code... -__device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift) +__device__ __forceinline__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift) { uint8_t *v = (uint8_t*) &vec4.s0; uint8_t *r = (uint8_t*) ret; @@ -496,7 +496,7 @@ __device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift) #else // same for SM 3.5+, really faster ? -__device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift) +__device__ __forceinline__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift) { uint32_t truc = 0, truc2 = cuda_swab32(vec4.s7), truc3 = 0; asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); diff --git a/neoscrypt/neoscrypt.cpp b/neoscrypt/neoscrypt.cpp index b0cb1cb..ab59f19 100644 --- a/neoscrypt/neoscrypt.cpp +++ b/neoscrypt/neoscrypt.cpp @@ -1,11 +1,14 @@ #include -#include "miner.h" -#include "neoscrypt/neoscrypt.h" +#include +#include -extern void neoscrypt_setBlockTarget(uint32_t * data, const void *ptarget); -extern void neoscrypt_cpu_init(int thr_id, uint32_t threads); -extern void neoscrypt_cpu_free(int thr_id); -extern uint32_t neoscrypt_cpu_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, int have_stratum, int order); +#include "neoscrypt.h" + +extern void neoscrypt_setBlockTarget(uint32_t* const data, uint32_t* const ptarget); + +extern void neoscrypt_init_2stream(int thr_id, uint32_t threads); +extern void neoscrypt_free_2stream(int thr_id); +extern void neoscrypt_hash_k4_2stream(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum); static bool init[MAX_GPUS] = { 0 }; @@ -18,6 +21,17 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign int dev_id = device_map[thr_id]; int intensity = is_windows() ? 
18 : 19; + // Pascal + if (strstr(device_name[dev_id], "GTX 10")) intensity = 22; + // Maxwell + else if (strstr(device_name[dev_id], "TITAN X")) intensity = 21; + else if (strstr(device_name[dev_id], "980")) intensity = 21; + else if (strstr(device_name[dev_id], "970")) intensity = 20; + else if (strstr(device_name[dev_id], "960")) intensity = 20; + else if (strstr(device_name[dev_id], "950")) intensity = 19; + else if (strstr(device_name[dev_id], "750 Ti")) intensity = 19; + else if (strstr(device_name[dev_id], "750")) intensity = 19; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); throughput = throughput / 32; /* set for max intensity ~= 20 */ api_set_throughput(thr_id, throughput); @@ -31,16 +45,20 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign { cudaDeviceSynchronize(); cudaSetDevice(dev_id); - cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); - cudaGetLastError(); // reset errors if device is not "reset" + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaGetLastError(); // reset errors if device is not "reset" + } if (device_sm[dev_id] <= 300) { - applog(LOG_ERR, "Sorry neoscrypt is not supported on SM 3.0 devices"); + gpulog(LOG_ERR, thr_id, "Sorry neoscrypt is not supported on SM 3.0 devices"); proper_exit(EXIT_CODE_CUDA_ERROR); } - applog(LOG_INFO, "GPU #%d: Using %d cuda threads", dev_id, throughput); - neoscrypt_cpu_init(thr_id, throughput); + gpulog(LOG_INFO, thr_id, "Using %d cuda threads", throughput); + neoscrypt_init_2stream(thr_id, throughput); init[thr_id] = true; } @@ -48,34 +66,39 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign if (have_stratum) { for (int k = 0; k < 20; k++) be32enc(&endiandata[k], pdata[k]); - } else { + } + else { for (int k = 0; k < 20; k++) endiandata[k] = pdata[k]; } - neoscrypt_setBlockTarget(endiandata,ptarget); + neoscrypt_setBlockTarget(endiandata, ptarget); do { - uint32_t foundNonce = neoscrypt_cpu_hash_k4(thr_id, throughput, pdata[19], have_stratum, 0); - if (foundNonce != UINT32_MAX) - { - uint32_t _ALIGN(64) vhash64[8]; + uint32_t foundNonces[2] = { UINT32_MAX, UINT32_MAX }; + neoscrypt_hash_k4_2stream(thr_id, throughput, pdata[19], foundNonces, have_stratum); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce + throughput; + + if (foundNonces[0] != UINT32_MAX) + { + uint32_t _ALIGN(64) vhash[8]; if (have_stratum) { - be32enc(&endiandata[19], foundNonce); - } else { - endiandata[19] = foundNonce; + be32enc(&endiandata[19], foundNonces[0]); + } + else { + endiandata[19] = foundNonces[0]; } - neoscrypt((uchar*)vhash64, (uchar*) endiandata, 0x80000620U); + neoscrypt((uchar*)vhash, (uchar*)endiandata, 0x80000620U); - if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) { - work_set_target_ratio(work, vhash64); - pdata[19] = foundNonce; + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work_set_target_ratio(work, vhash); + pdata[19] = foundNonces[0]; return 1; - } else { - gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce); + } + else { + gpulog(LOG_WARNING, thr_id, "nonce %08x does not validate on CPU!", foundNonces[0]); } } @@ -100,8 +123,9 @@ void free_neoscrypt(int thr_id) cudaThreadSynchronize(); - neoscrypt_cpu_free(thr_id); + neoscrypt_free_2stream(thr_id); init[thr_id] = false; cudaDeviceSynchronize(); } + diff --git a/nvml.cpp b/nvml.cpp index 
74f2994..2d3a12d 100644 --- a/nvml.cpp +++ b/nvml.cpp @@ -49,7 +49,7 @@ uint32_t limit_prev[MAX_GPUS] = { 0 }; static void *wrap_dlopen(const char *filename) { HMODULE h = LoadLibrary(filename); if (!h && opt_debug) { - applog(LOG_DEBUG, "dlopen(%d): failed to load %s", + applog(LOG_DEBUG, "dlopen(%d): failed to load %s", GetLastError(), filename); } return (void*)h; @@ -68,7 +68,7 @@ uint32_t limit_prev[MAX_GPUS] = { 0 }; static void *wrap_dlopen(const char *filename) { void *h = dlopen(filename, RTLD_NOW); if (h == NULL && opt_debug) { - applog(LOG_DEBUG, "dlopen(%d): failed to load %s", + applog(LOG_DEBUG, "dlopen(%d): failed to load %s", errno, filename); } return (void*)h; diff --git a/quark/cuda_quark_blake512_sp.cuh b/quark/cuda_quark_blake512_sp.cuh index 069620a..64f84fa 100644 --- a/quark/cuda_quark_blake512_sp.cuh +++ b/quark/cuda_quark_blake512_sp.cuh @@ -21,12 +21,7 @@ static __device__ __forceinline__ uint2 cuda_swap(uint2 v) { v.y = t; return v; } -static __device__ __forceinline__ uint2 eorswap32(uint2 u, uint2 v) { - uint2 result; - result.y = u.x ^ v.x; - result.x = u.y ^ v.y; - return result; -} + __constant__ uint2 c_512_u2[16] = { diff --git a/util.cpp b/util.cpp index df32394..ae08cd5 100644 --- a/util.cpp +++ b/util.cpp @@ -559,7 +559,7 @@ static json_t *json_rpc_call(CURL *curl, const char *url, res_val = json_object_get(val, "result"); err_val = json_object_get(val, "error"); - if (!res_val || json_is_null(res_val) || + if (!res_val || //json_is_null(res_val) || (err_val && !json_is_null(err_val))) { char *s = NULL;