
nanashi r10 with proper utf8

Tag: 1.7.6-r10-nanashi
Author: Tanguy Pruvot
Commit: f262850270
Changed files (23, with lines changed):

  1. Algo256/cuda_blake256.cu (661)
  2. Algo256/cuda_bmw256.cu (304)
  3. Algo256/cuda_cubehash256.cu (482)
  4. Algo256/cuda_skein256.cu (451)
  5. ccminer.cpp (582)
  6. ccminer.vcxproj (35)
  7. ccminer.vcxproj.filters (8)
  8. configure.ac (2)
  9. cuda_helper.h (30)
  10. lyra2/cuda_lyra2.cu (627)
  11. lyra2/cuda_lyra2_sm2.cuh (7)
  12. lyra2/cuda_lyra2_sm5.cuh (701)
  13. lyra2/cuda_lyra2v2.cu (656)
  14. lyra2/cuda_lyra2v2_sm3.cuh (338)
  15. lyra2/lyra2RE.cu (63)
  16. lyra2/lyra2REv2.cu (180)
  17. miner.h (4)
  18. neoscrypt/cuda_neoscrypt.cu (1834)
  19. neoscrypt/cuda_vectors.h (4)
  20. neoscrypt/neoscrypt.cpp (80)
  21. nvml.cpp (4)
  22. quark/cuda_quark_blake512_sp.cuh (7)
  23. util.cpp (2)

Algo256/cuda_blake256.cu (661 lines changed)

@@ -8,17 +8,28 @@ extern "C" {
}
#include "cuda_helper.h"
#include <memory.h>
static __device__ uint64_t cuda_swab32ll(uint64_t x) {
return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x)));
#define UINT2(x,y) make_uint2(x,y)
__device__ __inline__ uint2 ROR8(const uint2 a)
{
uint2 result;
result.x = __byte_perm(a.y, a.x, 0x0765);
result.y = __byte_perm(a.x, a.y, 0x0765);
return result;
}
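ROR8 above rotates a 64-bit value held as a uint2 (x = low word, y = high word) right by 8 bits using two byte shuffles. A minimal self-test sketch, assuming sm_30+ for __byte_perm; the kernel and harness here are hypothetical scaffolding, not part of this commit:

// Hypothetical self-test: the __byte_perm form of ROR8 must equal a plain
// 64-bit rotate-right by 8 (uint2 convention: x = low word, y = high word).
#include <cstdio>
#include <cstdint>

__device__ uint2 ror8_perm(const uint2 a)
{
	uint2 r;
	r.x = __byte_perm(a.y, a.x, 0x0765); // (a.x >> 8) | (a.y << 24)
	r.y = __byte_perm(a.x, a.y, 0x0765); // (a.y >> 8) | (a.x << 24)
	return r;
}

__global__ void ror8_check(uint64_t v, int *ok)
{
	uint2 a = make_uint2((uint32_t)v, (uint32_t)(v >> 32));
	uint2 r = ror8_perm(a);
	uint64_t got = ((uint64_t)r.y << 32) | r.x;
	uint64_t ref = (v >> 8) | (v << 56);
	*ok = (got == ref);
}

int main()
{
	int *ok;
	cudaMallocManaged(&ok, sizeof(int));
	ror8_check<<<1, 1>>>(0x0123456789abcdefULL, ok);
	cudaDeviceSynchronize();
	printf("ROR8 match: %d\n", *ok);
	return 0;
}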
__constant__ static uint32_t c_data[3+1];
__constant__ static uint32_t sigma[16][16];
static uint32_t c_sigma[16][16] = {
//static __device__ uint64_t cuda_swab32ll(uint64_t x) {
// return MAKE_ULONGLONG(cuda_swab32(_LOWORD(x)), cuda_swab32(_HIWORD(x)));
//}
__constant__ static uint32_t c_data[3];
//__constant__ static uint8_t sigma[16][16];
static uint8_t c_sigma[16][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
@@ -46,7 +57,7 @@ static const uint32_t c_IV256[8] = {
__device__ __constant__ static uint32_t cpu_h[8];
__device__ __constant__ static uint32_t u256[16];
//__device__ __constant__ static uint32_t u256[16];
static const uint32_t c_u256[16] = {
0x243F6A88, 0x85A308D3,
0x13198A2E, 0x03707344,
@@ -59,24 +70,22 @@ static const uint32_t c_u256[16] = {
};
#define GS2(a,b,c,d,x) { \
const uint32_t idx1 = sigma[r][x]; \
const uint32_t idx2 = sigma[r][x+1]; \
const uint8_t idx1 = sigma[r][x]; \
const uint8_t idx2 = sigma[r][x+1]; \
v[a] += (m[idx1] ^ u256[idx2]) + v[b]; \
v[d] = SPH_ROTL32(v[d] ^ v[a], 16); \
v[d] = __byte_perm(v[d] ^ v[a],0, 0x1032); \
v[c] += v[d]; \
v[b] = SPH_ROTR32(v[b] ^ v[c], 12); \
\
v[a] += (m[idx2] ^ u256[idx1]) + v[b]; \
v[d] = SPH_ROTR32(v[d] ^ v[a], 8); \
v[d] = __byte_perm(v[d] ^ v[a],0, 0x0321); \
v[c] += v[d]; \
v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \
}
//#define ROTL32(x, n) ((x) << (n)) | ((x) >> (32 - (n)))
//#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
#define hostGS(a,b,c,d,x) { \
const uint32_t idx1 = c_sigma[r][x]; \
const uint32_t idx2 = c_sigma[r][x+1]; \
const uint8_t idx1 = c_sigma[r][x]; \
const uint8_t idx2 = c_sigma[r][x+1]; \
v[a] += (m[idx1] ^ c_u256[idx2]) + v[b]; \
v[d] = ROTR32(v[d] ^ v[a], 16); \
v[c] += v[d]; \
@@ -86,14 +95,47 @@ static const uint32_t c_u256[16] = {
v[d] = ROTR32(v[d] ^ v[a], 8); \
v[c] += v[d]; \
v[b] = ROTR32(v[b] ^ v[c], 7); \
}
}
/* Second part (bytes 64..80): the message never changes, store it */
__device__ __constant__ static const uint32_t c_Padding[16] = {
0, 0, 0, 0,
0x80000000, 0, 0, 0,
0, 0, 0, 0,
0, 1, 0, 640,
#define GSPREC(a,b,c,d,x,y) { \
v[a] += (m[x] ^ u256[y]) + v[b]; \
v[d] = __byte_perm(v[d] ^ v[a],0, 0x1032); \
v[c] += v[d]; \
v[b] = SPH_ROTR32(v[b] ^ v[c], 12); \
v[a] += (m[y] ^ u256[x]) + v[b]; \
v[d] = __byte_perm(v[d] ^ v[a],0, 0x0321); \
v[c] += v[d]; \
v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \
}
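GSPREC is the BLAKE-256 G function with the sigma[r] indices folded into literal (x, y) arguments and the 16- and 8-bit rotations done with __byte_perm. For comparison, a sketch of the generic G it specializes; ROTR32_REF is defined locally so the snippet stands alone:

// Reference sketch of the generic BLAKE-256 G that GSPREC specializes;
// sigma_r is one row of the sigma permutation table.
#define ROTR32_REF(x, n) (((x) >> (n)) | ((x) << (32 - (n))))

__device__ void G_ref(uint32_t v[16], const uint32_t m[16], const uint32_t u[16],
                      const uint8_t sigma_r[16], int a, int b, int c, int d, int x)
{
	const uint8_t i1 = sigma_r[x], i2 = sigma_r[x + 1];
	v[a] += (m[i1] ^ u[i2]) + v[b];
	v[d] = ROTR32_REF(v[d] ^ v[a], 16); // __byte_perm(.., 0, 0x1032) above
	v[c] += v[d];
	v[b] = ROTR32_REF(v[b] ^ v[c], 12);
	v[a] += (m[i2] ^ u[i1]) + v[b];
	v[d] = ROTR32_REF(v[d] ^ v[a], 8);  // __byte_perm(.., 0, 0x0321) above
	v[c] += v[d];
	v[b] = ROTR32_REF(v[b] ^ v[c], 7);
}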
__constant__ uint64_t keccak_round_constants[24] = {
0x0000000000000001ull, 0x0000000000008082ull,
0x800000000000808aull, 0x8000000080008000ull,
0x000000000000808bull, 0x0000000080000001ull,
0x8000000080008081ull, 0x8000000000008009ull,
0x000000000000008aull, 0x0000000000000088ull,
0x0000000080008009ull, 0x000000008000000aull,
0x000000008000808bull, 0x800000000000008bull,
0x8000000000008089ull, 0x8000000000008003ull,
0x8000000000008002ull, 0x8000000000000080ull,
0x000000000000800aull, 0x800000008000000aull,
0x8000000080008081ull, 0x8000000000008080ull,
0x0000000080000001ull, 0x8000000080008008ull
};
__constant__ uint2 keccak_round_constants35[24] = {
{ 0x00000001ul, 0x00000000 }, { 0x00008082ul, 0x00000000 },
{ 0x0000808aul, 0x80000000 }, { 0x80008000ul, 0x80000000 },
{ 0x0000808bul, 0x00000000 }, { 0x80000001ul, 0x00000000 },
{ 0x80008081ul, 0x80000000 }, { 0x00008009ul, 0x80000000 },
{ 0x0000008aul, 0x00000000 }, { 0x00000088ul, 0x00000000 },
{ 0x80008009ul, 0x00000000 }, { 0x8000000aul, 0x00000000 },
{ 0x8000808bul, 0x00000000 }, { 0x0000008bul, 0x80000000 },
{ 0x00008089ul, 0x80000000 }, { 0x00008003ul, 0x80000000 },
{ 0x00008002ul, 0x80000000 }, { 0x00000080ul, 0x80000000 },
{ 0x0000800aul, 0x00000000 }, { 0x8000000aul, 0x80000000 },
{ 0x80008081ul, 0x80000000 }, { 0x00008080ul, 0x80000000 },
{ 0x80000001ul, 0x00000000 }, { 0x80008008ul, 0x80000000 }
};
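keccak_round_constants35 is the same table with each 64-bit constant split into (low, high) 32-bit halves for the uint2 state words. A sketch of the packing convention this assumes, matching ccminer's cuda_helper.h devectorize/vectorize (suffixed _sketch here to avoid clashing with the real helpers):

// uint2 <-> uint64_t packing assumed by the split table: x = low 32 bits,
// y = high 32 bits (so 0x800000000000808a becomes { 0x0000808a, 0x80000000 }).
__device__ __forceinline__ uint64_t devectorize_sketch(const uint2 v)
{
	return ((uint64_t)v.y << 32) | v.x;
}

__device__ __forceinline__ uint2 vectorize_sketch(const uint64_t v)
{
	return make_uint2((uint32_t)v, (uint32_t)(v >> 32));
}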
__host__ __forceinline__
@@ -132,116 +174,545 @@ static void blake256_compress1st(uint32_t *h, const uint32_t *block, const uint3
hostGS(3, 4, 0x9, 0xE, 0xE);
}
for (int i = 0; i < 16; i++) {
int j = i & 7;
h[j] ^= v[i];
}
h[0] ^= v[0] ^ v[8];
h[1] ^= v[1] ^ v[9];
h[2] ^= v[2] ^ v[10];
h[3] ^= v[3] ^ v[11];
h[4] ^= v[4] ^ v[12];
h[5] ^= v[5] ^ v[13];
h[6] ^= v[6] ^ v[14];
h[7] ^= v[7] ^ v[15];
}
#define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a))))
__device__ __forceinline__
static void blake256_compress2nd(uint32_t *h, const uint32_t *block, const uint32_t T0)
static void __forceinline__ __device__ keccak_block(uint2 *s)
{
uint32_t m[16];
uint32_t v[16];
m[0] = block[0];
m[1] = block[1];
m[2] = block[2];
m[3] = block[3];
#pragma unroll
for (int i = 4; i < 16; i++) {
m[i] = c_Padding[i];
}
#pragma unroll 8
for (int i = 0; i < 8; i++)
v[i] = h[i];
v[8] = u256[0];
v[9] = u256[1];
v[10] = u256[2];
v[11] = u256[3];
uint2 bc[5], tmpxor[5], tmp1, tmp2;
// uint2 s[25];
v[12] = u256[4] ^ T0;
v[13] = u256[5] ^ T0;
v[14] = u256[6];
v[15] = u256[7];
#pragma unroll 14
for (int r = 0; r < 14; r++) {
/* column step */
GS2(0, 4, 0x8, 0xC, 0x0);
GS2(1, 5, 0x9, 0xD, 0x2);
GS2(2, 6, 0xA, 0xE, 0x4);
GS2(3, 7, 0xB, 0xF, 0x6);
/* diagonal step */
GS2(0, 5, 0xA, 0xF, 0x8);
GS2(1, 6, 0xB, 0xC, 0xA);
GS2(2, 7, 0x8, 0xD, 0xC);
GS2(3, 4, 0x9, 0xE, 0xE);
}
#pragma unroll 16
for (int i = 0; i < 16; i++) {
int j = i & 7;
h[j] ^= v[i];
#pragma unroll 1
for (int i = 0; i < 24; i++)
{
#pragma unroll
for (uint32_t x = 0; x < 5; x++)
tmpxor[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20];
bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1);
bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1);
bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1);
bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1);
bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1);
tmp1 = s[1] ^ bc[0];
s[0] ^= bc[4];
s[1] = ROL2(s[6] ^ bc[0], 44);
s[6] = ROL2(s[9] ^ bc[3], 20);
s[9] = ROL2(s[22] ^ bc[1], 61);
s[22] = ROL2(s[14] ^ bc[3], 39);
s[14] = ROL2(s[20] ^ bc[4], 18);
s[20] = ROL2(s[2] ^ bc[1], 62);
s[2] = ROL2(s[12] ^ bc[1], 43);
s[12] = ROL2(s[13] ^ bc[2], 25);
s[13] = ROL8(s[19] ^ bc[3]);
s[19] = ROR8(s[23] ^ bc[2]);
s[23] = ROL2(s[15] ^ bc[4], 41);
s[15] = ROL2(s[4] ^ bc[3], 27);
s[4] = ROL2(s[24] ^ bc[3], 14);
s[24] = ROL2(s[21] ^ bc[0], 2);
s[21] = ROL2(s[8] ^ bc[2], 55);
s[8] = ROL2(s[16] ^ bc[0], 45);
s[16] = ROL2(s[5] ^ bc[4], 36);
s[5] = ROL2(s[3] ^ bc[2], 28);
s[3] = ROL2(s[18] ^ bc[2], 21);
s[18] = ROL2(s[17] ^ bc[1], 15);
s[17] = ROL2(s[11] ^ bc[0], 10);
s[11] = ROL2(s[7] ^ bc[1], 6);
s[7] = ROL2(s[10] ^ bc[4], 3);
s[10] = ROL2(tmp1, 1);
tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1);
tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1);
tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1);
tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1);
tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1);
s[0] ^= keccak_round_constants35[i];
}
}
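The five bitselect lines of each round are Keccak's chi step: bitselect(a, b, c) takes bits of b where c is set and bits of a elsewhere, so bitselect(s0 ^ s2, s0, s1) reduces to s0 ^ (~s1 & s2), the textbook chi. A host-side identity check (hypothetical scaffolding, exhaustive over 3-bit values):

// Hypothetical host-side check: bitselect(s0 ^ s2, s0, s1) == s0 ^ (~s1 & s2).
#include <cstdint>
#include <cassert>
#define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a))))

int main()
{
	for (uint32_t s0 = 0; s0 < 8; s0++)
		for (uint32_t s1 = 0; s1 < 8; s1++)
			for (uint32_t s2 = 0; s2 < 8; s2++)
				assert(bitselect(s0 ^ s2, s0, s1) == (s0 ^ (~s1 & s2)));
	return 0;
}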
__global__ __launch_bounds__(256,3)
void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t * Hash)
//__launch_bounds__(256)
__global__
void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t * Hash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const uint32_t nonce = startNonce + thread;
uint32_t h[8];
uint32_t input[4];
// uint32_t input[4];
const uint32_t T0 = 640;
#pragma unroll 8
for (int i = 0; i<8; i++) { h[i] = cpu_h[i]; }
uint32_t v[16];
const uint32_t c_Padding[12] = {
0x80000000, 0, 0, 0,
0, 0, 0, 0,
0, 1, 0, 640
};
const uint32_t u256[16] =
{
0x243F6A88, 0x85A308D3,
0x13198A2E, 0x03707344,
0xA4093822, 0x299F31D0,
0x082EFA98, 0xEC4E6C89,
0x452821E6, 0x38D01377,
0xBE5466CF, 0x34E90C6C,
0xC0AC29B7, 0xC97C50DD,
0x3F84D5B5, 0xB5470917
};
uint32_t m[16] =
{
c_data[0], c_data[1], c_data[2], nonce,
c_Padding[0], c_Padding[1], c_Padding[2], c_Padding[3],
c_Padding[4], c_Padding[5], c_Padding[6], c_Padding[7],
c_Padding[8], c_Padding[9], c_Padding[10], c_Padding[11]
};
#pragma unroll 8
for (int i = 0; i < 8; i++)
v[i] = h[i];
v[8] = u256[0];
v[9] = u256[1];
v[10] = u256[2];
v[11] = u256[3];
v[12] = u256[4] ^ T0;
v[13] = u256[5] ^ T0;
v[14] = u256[6];
v[15] = u256[7];
// { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
GSPREC(0, 4, 0x8, 0xC, 0, 1);
GSPREC(1, 5, 0x9, 0xD, 2, 3);
GSPREC(2, 6, 0xA, 0xE, 4, 5);
GSPREC(3, 7, 0xB, 0xF, 6, 7);
GSPREC(0, 5, 0xA, 0xF, 8, 9);
GSPREC(1, 6, 0xB, 0xC, 10, 11);
GSPREC(2, 7, 0x8, 0xD, 12, 13);
GSPREC(3, 4, 0x9, 0xE, 14, 15);
// { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
GSPREC(0, 4, 0x8, 0xC, 14, 10);
GSPREC(1, 5, 0x9, 0xD, 4, 8);
GSPREC(2, 6, 0xA, 0xE, 9, 15);
GSPREC(3, 7, 0xB, 0xF, 13, 6);
GSPREC(0, 5, 0xA, 0xF, 1, 12);
GSPREC(1, 6, 0xB, 0xC, 0, 2);
GSPREC(2, 7, 0x8, 0xD, 11, 7);
GSPREC(3, 4, 0x9, 0xE, 5, 3);
// { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
GSPREC(0, 4, 0x8, 0xC, 11, 8);
GSPREC(1, 5, 0x9, 0xD, 12, 0);
GSPREC(2, 6, 0xA, 0xE, 5, 2);
GSPREC(3, 7, 0xB, 0xF, 15, 13);
GSPREC(0, 5, 0xA, 0xF, 10, 14);
GSPREC(1, 6, 0xB, 0xC, 3, 6);
GSPREC(2, 7, 0x8, 0xD, 7, 1);
GSPREC(3, 4, 0x9, 0xE, 9, 4);
// { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
GSPREC(0, 4, 0x8, 0xC, 7, 9);
GSPREC(1, 5, 0x9, 0xD, 3, 1);
GSPREC(2, 6, 0xA, 0xE, 13, 12);
GSPREC(3, 7, 0xB, 0xF, 11, 14);
GSPREC(0, 5, 0xA, 0xF, 2, 6);
GSPREC(1, 6, 0xB, 0xC, 5, 10);
GSPREC(2, 7, 0x8, 0xD, 4, 0);
GSPREC(3, 4, 0x9, 0xE, 15, 8);
// { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
GSPREC(0, 4, 0x8, 0xC, 9, 0);
GSPREC(1, 5, 0x9, 0xD, 5, 7);
GSPREC(2, 6, 0xA, 0xE, 2, 4);
GSPREC(3, 7, 0xB, 0xF, 10, 15);
GSPREC(0, 5, 0xA, 0xF, 14, 1);
GSPREC(1, 6, 0xB, 0xC, 11, 12);
GSPREC(2, 7, 0x8, 0xD, 6, 8);
GSPREC(3, 4, 0x9, 0xE, 3, 13);
// { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
GSPREC(0, 4, 0x8, 0xC, 2, 12);
GSPREC(1, 5, 0x9, 0xD, 6, 10);
GSPREC(2, 6, 0xA, 0xE, 0, 11);
GSPREC(3, 7, 0xB, 0xF, 8, 3);
GSPREC(0, 5, 0xA, 0xF, 4, 13);
GSPREC(1, 6, 0xB, 0xC, 7, 5);
GSPREC(2, 7, 0x8, 0xD, 15, 14);
GSPREC(3, 4, 0x9, 0xE, 1, 9);
// { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
GSPREC(0, 4, 0x8, 0xC, 12, 5);
GSPREC(1, 5, 0x9, 0xD, 1, 15);
GSPREC(2, 6, 0xA, 0xE, 14, 13);
GSPREC(3, 7, 0xB, 0xF, 4, 10);
GSPREC(0, 5, 0xA, 0xF, 0, 7);
GSPREC(1, 6, 0xB, 0xC, 6, 3);
GSPREC(2, 7, 0x8, 0xD, 9, 2);
GSPREC(3, 4, 0x9, 0xE, 8, 11);
// { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
GSPREC(0, 4, 0x8, 0xC, 13, 11);
GSPREC(1, 5, 0x9, 0xD, 7, 14);
GSPREC(2, 6, 0xA, 0xE, 12, 1);
GSPREC(3, 7, 0xB, 0xF, 3, 9);
GSPREC(0, 5, 0xA, 0xF, 5, 0);
GSPREC(1, 6, 0xB, 0xC, 15, 4);
GSPREC(2, 7, 0x8, 0xD, 8, 6);
GSPREC(3, 4, 0x9, 0xE, 2, 10);
// { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
GSPREC(0, 4, 0x8, 0xC, 6, 15);
GSPREC(1, 5, 0x9, 0xD, 14, 9);
GSPREC(2, 6, 0xA, 0xE, 11, 3);
GSPREC(3, 7, 0xB, 0xF, 0, 8);
GSPREC(0, 5, 0xA, 0xF, 12, 2);
GSPREC(1, 6, 0xB, 0xC, 13, 7);
GSPREC(2, 7, 0x8, 0xD, 1, 4);
GSPREC(3, 4, 0x9, 0xE, 10, 5);
// { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
GSPREC(0, 4, 0x8, 0xC, 10, 2);
GSPREC(1, 5, 0x9, 0xD, 8, 4);
GSPREC(2, 6, 0xA, 0xE, 7, 6);
GSPREC(3, 7, 0xB, 0xF, 1, 5);
GSPREC(0, 5, 0xA, 0xF, 15, 11);
GSPREC(1, 6, 0xB, 0xC, 9, 14);
GSPREC(2, 7, 0x8, 0xD, 3, 12);
GSPREC(3, 4, 0x9, 0xE, 13, 0);
// { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
GSPREC(0, 4, 0x8, 0xC, 0, 1);
GSPREC(1, 5, 0x9, 0xD, 2, 3);
GSPREC(2, 6, 0xA, 0xE, 4, 5);
GSPREC(3, 7, 0xB, 0xF, 6, 7);
GSPREC(0, 5, 0xA, 0xF, 8, 9);
GSPREC(1, 6, 0xB, 0xC, 10, 11);
GSPREC(2, 7, 0x8, 0xD, 12, 13);
GSPREC(3, 4, 0x9, 0xE, 14, 15);
// { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
GSPREC(0, 4, 0x8, 0xC, 14, 10);
GSPREC(1, 5, 0x9, 0xD, 4, 8);
GSPREC(2, 6, 0xA, 0xE, 9, 15);
GSPREC(3, 7, 0xB, 0xF, 13, 6);
GSPREC(0, 5, 0xA, 0xF, 1, 12);
GSPREC(1, 6, 0xB, 0xC, 0, 2);
GSPREC(2, 7, 0x8, 0xD, 11, 7);
GSPREC(3, 4, 0x9, 0xE, 5, 3);
// { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
GSPREC(0, 4, 0x8, 0xC, 11, 8);
GSPREC(1, 5, 0x9, 0xD, 12, 0);
GSPREC(2, 6, 0xA, 0xE, 5, 2);
GSPREC(3, 7, 0xB, 0xF, 15, 13);
GSPREC(0, 5, 0xA, 0xF, 10, 14);
GSPREC(1, 6, 0xB, 0xC, 3, 6);
GSPREC(2, 7, 0x8, 0xD, 7, 1);
GSPREC(3, 4, 0x9, 0xE, 9, 4);
// { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
GSPREC(0, 4, 0x8, 0xC, 7, 9);
GSPREC(1, 5, 0x9, 0xD, 3, 1);
GSPREC(2, 6, 0xA, 0xE, 13, 12);
GSPREC(3, 7, 0xB, 0xF, 11, 14);
GSPREC(0, 5, 0xA, 0xF, 2, 6);
GSPREC(1, 6, 0xB, 0xC, 5, 10);
GSPREC(2, 7, 0x8, 0xD, 4, 0);
GSPREC(3, 4, 0x9, 0xE, 15, 8);
h[0] = cuda_swab32(h[0] ^ v[0] ^ v[8]);
h[1] = cuda_swab32(h[1] ^ v[1] ^ v[9]);
h[2] = cuda_swab32(h[2] ^ v[2] ^ v[10]);
h[3] = cuda_swab32(h[3] ^ v[3] ^ v[11]);
h[4] = cuda_swab32(h[4] ^ v[4] ^ v[12]);
h[5] = cuda_swab32(h[5] ^ v[5] ^ v[13]);
h[6] = cuda_swab32(h[6] ^ v[6] ^ v[14]);
h[7] = cuda_swab32(h[7] ^ v[7] ^ v[15]);
uint2 keccak_gpu_state[25] = { 0 };
keccak_gpu_state[0].x = h[0];
keccak_gpu_state[0].y = h[1];
keccak_gpu_state[1].x = h[2];
keccak_gpu_state[1].y = h[3];
keccak_gpu_state[2].x = h[4];
keccak_gpu_state[2].y = h[5];
keccak_gpu_state[3].x = h[6];
keccak_gpu_state[3].y = h[7];
keccak_gpu_state[4] = UINT2(1, 0);
keccak_gpu_state[16] = UINT2(0, 0x80000000);
keccak_block(keccak_gpu_state);
uint64_t *outputHash = (uint64_t *)Hash;
#pragma unroll 4
for (int i = 0; i<4; i++)
outputHash[i*threads + thread] = devectorize(keccak_gpu_state[i]);
}
#pragma unroll
for (int i = 0; i < 8; i++) h[i] = cpu_h[i];
#pragma unroll
for (int i = 0; i < 3; ++i) input[i] = c_data[i];
input[3] = startNonce + thread;
blake256_compress2nd(h, input, 640);
}
#pragma unroll
for (int i = 0; i<4; i++) {
Hash[i*threads + thread] = cuda_swab32ll(MAKE_ULONGLONG(h[2 * i], h[2*i+1]));
}
__global__ __launch_bounds__(256, 4)
void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t * Hash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const uint32_t nonce = startNonce + thread;
uint32_t h[8];
// uint32_t input[4];
const uint32_t T0 = 640;
#pragma unroll 8
for (int i = 0; i<8; i++) { h[i] = cpu_h[i]; }
uint32_t v[16];
const uint32_t c_Padding[12] = {
0x80000000, 0, 0, 0,
0, 0, 0, 0,
0, 1, 0, 640
};
const uint32_t u256[16] =
{
0x243F6A88, 0x85A308D3,
0x13198A2E, 0x03707344,
0xA4093822, 0x299F31D0,
0x082EFA98, 0xEC4E6C89,
0x452821E6, 0x38D01377,
0xBE5466CF, 0x34E90C6C,
0xC0AC29B7, 0xC97C50DD,
0x3F84D5B5, 0xB5470917
};
uint32_t m[16] =
{
c_data[0], c_data[1], c_data[2], nonce,
c_Padding[0], c_Padding[1], c_Padding[2], c_Padding[3],
c_Padding[4], c_Padding[5], c_Padding[6], c_Padding[7],
c_Padding[8], c_Padding[9], c_Padding[10], c_Padding[11]
};
#pragma unroll 8
for (int i = 0; i < 8; i++)
v[i] = h[i];
v[8] = u256[0];
v[9] = u256[1];
v[10] = u256[2];
v[11] = u256[3];
v[12] = u256[4] ^ T0;
v[13] = u256[5] ^ T0;
v[14] = u256[6];
v[15] = u256[7];
// { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
GSPREC(0, 4, 0x8, 0xC, 0, 1);
GSPREC(1, 5, 0x9, 0xD, 2, 3);
GSPREC(2, 6, 0xA, 0xE, 4, 5);
GSPREC(3, 7, 0xB, 0xF, 6, 7);
GSPREC(0, 5, 0xA, 0xF, 8, 9);
GSPREC(1, 6, 0xB, 0xC, 10, 11);
GSPREC(2, 7, 0x8, 0xD, 12, 13);
GSPREC(3, 4, 0x9, 0xE, 14, 15);
// { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
GSPREC(0, 4, 0x8, 0xC, 14, 10);
GSPREC(1, 5, 0x9, 0xD, 4, 8);
GSPREC(2, 6, 0xA, 0xE, 9, 15);
GSPREC(3, 7, 0xB, 0xF, 13, 6);
GSPREC(0, 5, 0xA, 0xF, 1, 12);
GSPREC(1, 6, 0xB, 0xC, 0, 2);
GSPREC(2, 7, 0x8, 0xD, 11, 7);
GSPREC(3, 4, 0x9, 0xE, 5, 3);
// { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
GSPREC(0, 4, 0x8, 0xC, 11, 8);
GSPREC(1, 5, 0x9, 0xD, 12, 0);
GSPREC(2, 6, 0xA, 0xE, 5, 2);
GSPREC(3, 7, 0xB, 0xF, 15, 13);
GSPREC(0, 5, 0xA, 0xF, 10, 14);
GSPREC(1, 6, 0xB, 0xC, 3, 6);
GSPREC(2, 7, 0x8, 0xD, 7, 1);
GSPREC(3, 4, 0x9, 0xE, 9, 4);
// { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
GSPREC(0, 4, 0x8, 0xC, 7, 9);
GSPREC(1, 5, 0x9, 0xD, 3, 1);
GSPREC(2, 6, 0xA, 0xE, 13, 12);
GSPREC(3, 7, 0xB, 0xF, 11, 14);
GSPREC(0, 5, 0xA, 0xF, 2, 6);
GSPREC(1, 6, 0xB, 0xC, 5, 10);
GSPREC(2, 7, 0x8, 0xD, 4, 0);
GSPREC(3, 4, 0x9, 0xE, 15, 8);
// { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
GSPREC(0, 4, 0x8, 0xC, 9, 0);
GSPREC(1, 5, 0x9, 0xD, 5, 7);
GSPREC(2, 6, 0xA, 0xE, 2, 4);
GSPREC(3, 7, 0xB, 0xF, 10, 15);
GSPREC(0, 5, 0xA, 0xF, 14, 1);
GSPREC(1, 6, 0xB, 0xC, 11, 12);
GSPREC(2, 7, 0x8, 0xD, 6, 8);
GSPREC(3, 4, 0x9, 0xE, 3, 13);
// { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
GSPREC(0, 4, 0x8, 0xC, 2, 12);
GSPREC(1, 5, 0x9, 0xD, 6, 10);
GSPREC(2, 6, 0xA, 0xE, 0, 11);
GSPREC(3, 7, 0xB, 0xF, 8, 3);
GSPREC(0, 5, 0xA, 0xF, 4, 13);
GSPREC(1, 6, 0xB, 0xC, 7, 5);
GSPREC(2, 7, 0x8, 0xD, 15, 14);
GSPREC(3, 4, 0x9, 0xE, 1, 9);
// { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
GSPREC(0, 4, 0x8, 0xC, 12, 5);
GSPREC(1, 5, 0x9, 0xD, 1, 15);
GSPREC(2, 6, 0xA, 0xE, 14, 13);
GSPREC(3, 7, 0xB, 0xF, 4, 10);
GSPREC(0, 5, 0xA, 0xF, 0, 7);
GSPREC(1, 6, 0xB, 0xC, 6, 3);
GSPREC(2, 7, 0x8, 0xD, 9, 2);
GSPREC(3, 4, 0x9, 0xE, 8, 11);
// { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
GSPREC(0, 4, 0x8, 0xC, 13, 11);
GSPREC(1, 5, 0x9, 0xD, 7, 14);
GSPREC(2, 6, 0xA, 0xE, 12, 1);
GSPREC(3, 7, 0xB, 0xF, 3, 9);
GSPREC(0, 5, 0xA, 0xF, 5, 0);
GSPREC(1, 6, 0xB, 0xC, 15, 4);
GSPREC(2, 7, 0x8, 0xD, 8, 6);
GSPREC(3, 4, 0x9, 0xE, 2, 10);
// { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
GSPREC(0, 4, 0x8, 0xC, 6, 15);
GSPREC(1, 5, 0x9, 0xD, 14, 9);
GSPREC(2, 6, 0xA, 0xE, 11, 3);
GSPREC(3, 7, 0xB, 0xF, 0, 8);
GSPREC(0, 5, 0xA, 0xF, 12, 2);
GSPREC(1, 6, 0xB, 0xC, 13, 7);
GSPREC(2, 7, 0x8, 0xD, 1, 4);
GSPREC(3, 4, 0x9, 0xE, 10, 5);
// { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
GSPREC(0, 4, 0x8, 0xC, 10, 2);
GSPREC(1, 5, 0x9, 0xD, 8, 4);
GSPREC(2, 6, 0xA, 0xE, 7, 6);
GSPREC(3, 7, 0xB, 0xF, 1, 5);
GSPREC(0, 5, 0xA, 0xF, 15, 11);
GSPREC(1, 6, 0xB, 0xC, 9, 14);
GSPREC(2, 7, 0x8, 0xD, 3, 12);
GSPREC(3, 4, 0x9, 0xE, 13, 0);
// { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
GSPREC(0, 4, 0x8, 0xC, 0, 1);
GSPREC(1, 5, 0x9, 0xD, 2, 3);
GSPREC(2, 6, 0xA, 0xE, 4, 5);
GSPREC(3, 7, 0xB, 0xF, 6, 7);
GSPREC(0, 5, 0xA, 0xF, 8, 9);
GSPREC(1, 6, 0xB, 0xC, 10, 11);
GSPREC(2, 7, 0x8, 0xD, 12, 13);
GSPREC(3, 4, 0x9, 0xE, 14, 15);
// { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
GSPREC(0, 4, 0x8, 0xC, 14, 10);
GSPREC(1, 5, 0x9, 0xD, 4, 8);
GSPREC(2, 6, 0xA, 0xE, 9, 15);
GSPREC(3, 7, 0xB, 0xF, 13, 6);
GSPREC(0, 5, 0xA, 0xF, 1, 12);
GSPREC(1, 6, 0xB, 0xC, 0, 2);
GSPREC(2, 7, 0x8, 0xD, 11, 7);
GSPREC(3, 4, 0x9, 0xE, 5, 3);
// { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
GSPREC(0, 4, 0x8, 0xC, 11, 8);
GSPREC(1, 5, 0x9, 0xD, 12, 0);
GSPREC(2, 6, 0xA, 0xE, 5, 2);
GSPREC(3, 7, 0xB, 0xF, 15, 13);
GSPREC(0, 5, 0xA, 0xF, 10, 14);
GSPREC(1, 6, 0xB, 0xC, 3, 6);
GSPREC(2, 7, 0x8, 0xD, 7, 1);
GSPREC(3, 4, 0x9, 0xE, 9, 4);
// { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
GSPREC(0, 4, 0x8, 0xC, 7, 9);
GSPREC(1, 5, 0x9, 0xD, 3, 1);
GSPREC(2, 6, 0xA, 0xE, 13, 12);
GSPREC(3, 7, 0xB, 0xF, 11, 14);
GSPREC(0, 5, 0xA, 0xF, 2, 6);
GSPREC(1, 6, 0xB, 0xC, 5, 10);
GSPREC(2, 7, 0x8, 0xD, 4, 0);
GSPREC(3, 4, 0x9, 0xE, 15, 8);
h[0] = cuda_swab32(h[0] ^ v[0] ^ v[8]);
h[1] = cuda_swab32(h[1] ^ v[1] ^ v[9]);
h[2] = cuda_swab32(h[2] ^ v[2] ^ v[10]);
h[3] = cuda_swab32(h[3] ^ v[3] ^ v[11]);
h[4] = cuda_swab32(h[4] ^ v[4] ^ v[12]);
h[5] = cuda_swab32(h[5] ^ v[5] ^ v[13]);
h[6] = cuda_swab32(h[6] ^ v[6] ^ v[14]);
h[7] = cuda_swab32(h[7] ^ v[7] ^ v[15]);
Hash[((0 * threads) + thread) * 2] = (h[0]);
Hash[((0 * threads) + thread) * 2 + 1] = (h[1]);
Hash[((1 * threads) + thread) * 2] = (h[2]);
Hash[((1 * threads) + thread) * 2 + 1] = (h[3]);
Hash[((2 * threads) + thread) * 2] = (h[4]);
Hash[((2 * threads) + thread) * 2 + 1] = (h[5]);
Hash[((3 * threads) + thread) * 2] = (h[6]);
Hash[((3 * threads) + thread) * 2 + 1] = (h[7]);
}
}
__host__
void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order)
{
const uint32_t threadsperblock = 256;
const uint32_t threadsperblock = 64;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
blake256_gpu_hash_80 <<<grid, block>>> (threads, startNonce, Hash);
MyStreamSynchronize(NULL, order, thr_id);
blake256_gpu_hash_80 << <grid, block >> > (threads, startNonce, (uint32_t *)Hash);
}
__host__
void blake256_cpu_setBlock_80(uint32_t *pdata)
{
uint32_t h[8], data[20];
uint32_t h[8];
uint32_t data[20];
memcpy(data, pdata, 80);
memcpy(h, c_IV256, sizeof(c_IV256));
for (int i = 0; i<8; i++) {
h[i] = c_IV256[i];
}
blake256_compress1st(h, pdata, 512);
cudaMemcpyToSymbol(cpu_h, h, sizeof(h), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(c_data, &data[16], sizeof(c_data), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(c_data, &data[16], 3 * 4, 0, cudaMemcpyHostToDevice);
}
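blake256_cpu_setBlock_80 implements the midstate optimization flagged by the "Second part" comment earlier in this file: the first 64 bytes of the 80-byte header are compressed once on the host (bit counter 512), so each GPU thread finishes only the final 16-byte block, whose counter reaches 640 bits. A sketch of the data split (mirrors the function above; not an additional API):

// Sketch: words 0..15 of the header are compressed on the host, words 16..18
// become c_data, and word 19 (the nonce) is supplied per-thread by the kernel.
static void set_block_80_sketch(const uint32_t pdata[20])
{
	uint32_t h[8];
	for (int i = 0; i < 8; i++)
		h[i] = c_IV256[i];               // BLAKE-256 IV
	blake256_compress1st(h, pdata, 512); // counter = 512 bits after block 1
	cudaMemcpyToSymbol(cpu_h, h, sizeof(h));       // midstate for the GPU
	cudaMemcpyToSymbol(c_data, &pdata[16], 3 * 4); // constant tail words
	// the kernel completes the final padded block with T0 = 640 bits
}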
__host__
void blake256_cpu_init(int thr_id, uint32_t threads)
void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order)
{
cudaMemcpyToSymbol(u256, c_u256, sizeof(c_u256), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(sigma, c_sigma, sizeof(c_sigma), 0, cudaMemcpyHostToDevice);
const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
blakeKeccak256_gpu_hash_80 << <grid, block >> > (threads, startNonce, (uint32_t *)Hash);
}
__host__
void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order, cudaStream_t stream)
{
const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
blakeKeccak256_gpu_hash_80 << <grid, block, 0, stream >> > (threads, startNonce, (uint32_t *)Hash);
}
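The second overload lets callers queue the fused BLAKE+Keccak kernel on a caller-owned stream instead of the default one. A hypothetical call-site sketch; the function name, buffer name, and sizing are assumptions, with each thread writing a 256-bit (32-byte) hash as in the kernel above:

// Hypothetical call site for the stream overload; d_hash layout matches the
// kernel above (4 x uint64_t per thread, strided by `throughput`).
void lyra2v2_first_round_sketch(int thr_id, uint32_t throughput,
                                uint32_t first_nonce, int order)
{
	cudaStream_t stream;
	cudaStreamCreate(&stream);

	uint64_t *d_hash = NULL;
	cudaMalloc(&d_hash, (size_t)throughput * 32); // 32 bytes per hash

	blakeKeccak256_cpu_hash_80(thr_id, throughput, first_nonce, d_hash, order, stream);

	cudaStreamSynchronize(stream); // wait only on this stream's work
	cudaFree(d_hash);
	cudaStreamDestroy(stream);
}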

Algo256/cuda_bmw256.cu (304 lines changed)

@@ -14,87 +14,85 @@ __constant__ uint64_t pTarget[4];
#define shl(x, n) ((x) << (n))
#define shr(x, n) ((x) >> (n))
#define ss0(x) (shr((x), 1) ^ shl((x), 3) ^ SPH_ROTL32((x), 4) ^ SPH_ROTL32((x), 19))
#define ss1(x) (shr((x), 1) ^ shl((x), 2) ^ SPH_ROTL32((x), 8) ^ SPH_ROTL32((x), 23))
#define ss2(x) (shr((x), 2) ^ shl((x), 1) ^ SPH_ROTL32((x), 12) ^ SPH_ROTL32((x), 25))
#define ss3(x) (shr((x), 2) ^ shl((x), 2) ^ SPH_ROTL32((x), 15) ^ SPH_ROTL32((x), 29))
#define ss4(x) (shr((x), 1) ^ (x))
#define ss5(x) (shr((x), 2) ^ (x))
#define ss0(x) (shr((x), 1) ^ shl((x), 3) ^ SPH_ROTL32((x), 4) ^ SPH_ROTL32((x), 19))
#define ss1(x) (shr((x), 1) ^ shl((x), 2) ^ __byte_perm(x,0,0x2103) ^ SPH_ROTL32((x), 23))
#define ss2(x) (shr((x), 2) ^ shl((x), 1) ^ SPH_ROTL32((x), 12) ^ SPH_ROTL32((x), 25))
#define ss3(x) (shr((x), 2) ^ shl((x), 2) ^ SPH_ROTL32((x), 15) ^ SPH_ROTL32((x), 29))
#define ss4(x) (shr((x), 1) ^ (x))
#define ss5(x) (shr((x), 2) ^ (x))
#define rs1(x) SPH_ROTL32((x), 3)
#define rs2(x) SPH_ROTL32((x), 7)
#define rs3(x) SPH_ROTL32((x), 13)
#define rs4(x) SPH_ROTL32((x), 16)
#define rs4(x) __byte_perm(x,0,0x1032)
#define rs5(x) SPH_ROTL32((x), 19)
#define rs6(x) SPH_ROTL32((x), 23)
#define rs7(x) SPH_ROTL32((x), 27)
/* Message expansion function 1 */
__forceinline__ __device__
uint32_t expand32_1(int i, uint32_t *M32, const uint32_t *H, uint32_t *Q)
__forceinline__ __device__ uint32_t expand32_1(int i, uint32_t *M32, const uint32_t *H, uint32_t *Q)
{
return (ss1(Q[i - 16]) + ss2(Q[i - 15]) + ss3(Q[i - 14]) + ss0(Q[i - 13])
+ ss1(Q[i - 12]) + ss2(Q[i - 11]) + ss3(Q[i - 10]) + ss0(Q[i - 9])
+ ss1(Q[i - 8]) + ss2(Q[i - 7]) + ss3(Q[i - 6]) + ss0(Q[i - 5])
+ ss1(Q[i - 4]) + ss2(Q[i - 3]) + ss3(Q[i - 2]) + ss0(Q[i - 1])
+ ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1)
+ SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1)
- SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16]));
+ ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16]));
}
/* Message expansion function 2 */
__forceinline__ __device__
uint32_t expand32_2(int i, uint32_t *M32, const uint32_t *H, uint32_t *Q)
__forceinline__ __device__ uint32_t expand32_2(const int i, uint32_t *M32, const uint32_t *H, uint32_t *Q)
{
return (Q[i - 16] + rs1(Q[i - 15]) + Q[i - 14] + rs2(Q[i - 13])
+ Q[i - 12] + rs3(Q[i - 11]) + Q[i - 10] + rs4(Q[i - 9])
+ Q[i - 8] + rs5(Q[i - 7]) + Q[i - 6] + rs6(Q[i - 5])
+ Q[i - 4] + rs7(Q[i - 3]) + ss4(Q[i - 2]) + ss5(Q[i - 1])
+ ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1)
+ SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1)
- SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16]));
return (
rs2(Q[i - 13]) + rs3(Q[i - 11]) + rs4(Q[i - 9]) + rs1(Q[i - 15]) +
+rs5(Q[i - 7]) + rs6(Q[i - 5]) + rs7(Q[i - 3]) + ss4(Q[i - 2]) + ss5(Q[i - 1]));
}
__forceinline__ __device__
void Compression256(uint32_t * M32)
__forceinline__ __device__ void Compression256(uint32_t M32[16])
{
uint32_t Q[32], XL32, XH32;
const uint32_t H[16] = {
0x40414243, 0x44454647, 0x48494A4B, 0x4C4D4E4F,
0x50515253, 0x54555657, 0x58595A5B, 0x5C5D5E5F,
0x60616263, 0x64656667, 0x68696A6B, 0x6C6D6E6F,
0x70717273, 0x74757677, 0x78797A7B, 0x7C7D7E7F
(0x40414243), (0x44454647),
(0x48494A4B), (0x4C4D4E4F),
(0x50515253), (0x54555657),
(0x58595A5B), (0x5C5D5E5F),
(0x60616263), (0x64656667),
(0x68696A6B), (0x6C6D6E6F),
(0x70717273), (0x74757677),
(0x78797A7B), (0x7C7D7E7F)
};
Q[0] = (M32[5] ^ H[5]) - (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[13] ^ H[13]) + (M32[14] ^ H[14]);
Q[1] = (M32[6] ^ H[6]) - (M32[8] ^ H[8]) + (M32[11] ^ H[11]) + (M32[14] ^ H[14]) - (M32[15] ^ H[15]);
Q[2] = (M32[0] ^ H[0]) + (M32[7] ^ H[7]) + (M32[9] ^ H[9]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]);
Q[3] = (M32[0] ^ H[0]) - (M32[1] ^ H[1]) + (M32[8] ^ H[8]) - (M32[10] ^ H[10]) + (M32[13] ^ H[13]);
Q[4] = (M32[1] ^ H[1]) + (M32[2] ^ H[2]) + (M32[9] ^ H[9]) - (M32[11] ^ H[11]) - (M32[14] ^ H[14]);
Q[5] = (M32[3] ^ H[3]) - (M32[2] ^ H[2]) + (M32[10] ^ H[10]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]);
Q[6] = (M32[4] ^ H[4]) - (M32[0] ^ H[0]) - (M32[3] ^ H[3]) - (M32[11] ^ H[11]) + (M32[13] ^ H[13]);
Q[7] = (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[5] ^ H[5]) - (M32[12] ^ H[12]) - (M32[14] ^ H[14]);
Q[8] = (M32[2] ^ H[2]) - (M32[5] ^ H[5]) - (M32[6] ^ H[6]) + (M32[13] ^ H[13]) - (M32[15] ^ H[15]);
Q[9] = (M32[0] ^ H[0]) - (M32[3] ^ H[3]) + (M32[6] ^ H[6]) - (M32[7] ^ H[7]) + (M32[14] ^ H[14]);
Q[10] = (M32[8] ^ H[8]) - (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[7] ^ H[7]) + (M32[15] ^ H[15]);
Q[11] = (M32[8] ^ H[8]) - (M32[0] ^ H[0]) - (M32[2] ^ H[2]) - (M32[5] ^ H[5]) + (M32[9] ^ H[9]);
Q[12] = (M32[1] ^ H[1]) + (M32[3] ^ H[3]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[10] ^ H[10]);
Q[13] = (M32[2] ^ H[2]) + (M32[4] ^ H[4]) + (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[11] ^ H[11]);
Q[14] = (M32[3] ^ H[3]) - (M32[5] ^ H[5]) + (M32[8] ^ H[8]) - (M32[11] ^ H[11]) - (M32[12] ^ H[12]);
Q[15] = (M32[12] ^ H[12]) - (M32[4] ^ H[4]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[13] ^ H[13]);
/* Diffuse the differences in every word in a bijective manner with ssi, and then add the values of the previous double pipe. */
Q[0] = ss0(Q[0]) + H[1];
Q[1] = ss1(Q[1]) + H[2];
Q[2] = ss2(Q[2]) + H[3];
Q[3] = ss3(Q[3]) + H[4];
Q[4] = ss4(Q[4]) + H[5];
Q[5] = ss0(Q[5]) + H[6];
Q[6] = ss1(Q[6]) + H[7];
Q[7] = ss2(Q[7]) + H[8];
Q[8] = ss3(Q[8]) + H[9];
Q[9] = ss4(Q[9]) + H[10];
M32[8] = 0x80;
M32[14] = 0x100;
// int i;
uint32_t XL32, XH32, Q[32];
Q[0] = (M32[5] ^ H[5]) - (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[13] ^ H[13]) + (M32[14] ^ H[14]);
Q[1] = (M32[6] ^ H[6]) - (M32[8] ^ H[8]) + (M32[11] ^ H[11]) + (M32[14] ^ H[14]) - (M32[15] ^ H[15]);
Q[2] = (M32[0] ^ H[0]) + (M32[7] ^ H[7]) + (M32[9] ^ H[9]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]);
Q[3] = (M32[0] ^ H[0]) - (M32[1] ^ H[1]) + (M32[8] ^ H[8]) - (M32[10] ^ H[10]) + (M32[13] ^ H[13]);
Q[4] = (M32[1] ^ H[1]) + (M32[2] ^ H[2]) + (M32[9] ^ H[9]) - (M32[11] ^ H[11]) - (M32[14] ^ H[14]);
Q[5] = (M32[3] ^ H[3]) - (M32[2] ^ H[2]) + (M32[10] ^ H[10]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]);
Q[6] = (M32[4] ^ H[4]) - (M32[0] ^ H[0]) - (M32[3] ^ H[3]) - (M32[11] ^ H[11]) + (M32[13] ^ H[13]);
Q[7] = (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[5] ^ H[5]) - (M32[12] ^ H[12]) - (M32[14] ^ H[14]);
Q[8] = (M32[2] ^ H[2]) - (M32[5] ^ H[5]) - (M32[6] ^ H[6]) + (M32[13] ^ H[13]) - (M32[15] ^ H[15]);
Q[9] = (M32[0] ^ H[0]) - (M32[3] ^ H[3]) + (M32[6] ^ H[6]) - (M32[7] ^ H[7]) + (M32[14] ^ H[14]);
Q[10] = (M32[8] ^ H[8]) - (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[7] ^ H[7]) + (M32[15] ^ H[15]);
Q[11] = (M32[8] ^ H[8]) - (M32[0] ^ H[0]) - (M32[2] ^ H[2]) - (M32[5] ^ H[5]) + (M32[9] ^ H[9]);
Q[12] = (M32[1] ^ H[1]) + (M32[3] ^ H[3]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[10] ^ H[10]);
Q[13] = (M32[2] ^ H[2]) + (M32[4] ^ H[4]) + (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[11] ^ H[11]);
Q[14] = (M32[3] ^ H[3]) - (M32[5] ^ H[5]) + (M32[8] ^ H[8]) - (M32[11] ^ H[11]) - (M32[12] ^ H[12]);
Q[15] = (M32[12] ^ H[12]) - (M32[4] ^ H[4]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[13] ^ H[13]);
/* Diffuse the differences in every word in a bijective manner with ssi, and then add the values of the previous double pipe.*/
Q[0] = ss0(Q[0]) + H[1];
Q[1] = ss1(Q[1]) + H[2];
Q[2] = ss2(Q[2]) + H[3];
Q[3] = ss3(Q[3]) + H[4];
Q[4] = ss4(Q[4]) + H[5];
Q[5] = ss0(Q[5]) + H[6];
Q[6] = ss1(Q[6]) + H[7];
Q[7] = ss2(Q[7]) + H[8];
Q[8] = ss3(Q[8]) + H[9];
Q[9] = ss4(Q[9]) + H[10];
Q[10] = ss0(Q[10]) + H[11];
Q[11] = ss1(Q[11]) + H[12];
Q[12] = ss2(Q[12]) + H[13];
@@ -109,13 +107,91 @@ void Compression256(uint32_t * M32)
/* The following relation for these parameters should be satisfied: */
/* EXPAND_1_ROUNDS + EXPAND_2_ROUNDS = 16 */
#pragma unroll
for (int i=16; i<18; i++)
Q[i] = expand32_1(i, M32, H, Q);
#pragma nounroll
for (int i=18; i<32; i++)
Q[i] = expand32_2(i, M32, H, Q);
// #pragma unroll
// for (i = 0; i<2; i++)
// Q[i + 16] = expand32_1(i + 16, M32, H, Q);
Q[16] = ss1(Q[16 - 16]) + ss2(Q[16 - 15]) + ss3(Q[16 - 14]) + ss0(Q[16 - 13])
+ ss1(Q[16 - 12]) + ss2(Q[16 - 11]) + ss3(Q[16 - 10]) + ss0(Q[16 - 9])
+ ss1(Q[16 - 8]) + ss2(Q[16 - 7]) + ss3(Q[16 - 6]) + ss0(Q[16 - 5])
+ ss1(Q[16 - 4]) + ss2(Q[16 - 3]) + ss3(Q[16 - 2]) + ss0(Q[16 - 1])
+ ((16 * (0x05555555ul) + SPH_ROTL32(M32[0], ((16 - 16) % 16) + 1) + SPH_ROTL32(M32[3], ((16 - 13) % 16) + 1)) ^ H[(16 - 16 + 7) % 16]);
Q[17] = ss1(Q[17 - 16]) + ss2(Q[17 - 15]) + ss3(Q[17 - 14]) + ss0(Q[17 - 13])
+ ss1(Q[17 - 12]) + ss2(Q[17 - 11]) + ss3(Q[17 - 10]) + ss0(Q[17 - 9])
+ ss1(Q[17 - 8]) + ss2(Q[17 - 7]) + ss3(Q[17 - 6]) + ss0(Q[17 - 5])
+ ss1(Q[17 - 4]) + ss2(Q[17 - 3]) + ss3(Q[17 - 2]) + ss0(Q[17 - 1])
+ ((17 * (0x05555555ul) + SPH_ROTL32(M32[(17 - 16) % 16], ((17 - 16) % 16) + 1) + SPH_ROTL32(M32[(17 - 13) % 16], ((17 - 13) % 16) + 1)) ^ H[(17 - 16 + 7) % 16]);
uint32_t precalc = Q[18 - 16] + Q[18 - 14] + Q[18 - 12] + Q[18 - 10] + Q[18 - 8] + Q[18 - 6]; //+ Q[18 - 4]
uint32_t precalc2 = Q[19 - 16] + Q[19 - 14] + Q[19 - 12] + Q[19 - 10] + Q[19 - 8] + Q[19 - 6];//+ Q[19 - 4]
// #pragma unroll
// for (i = 2 + 16; i < 16 + 16; i+=2)
// {
precalc = precalc + Q[18 - 4];
precalc2 = precalc2 + Q[18 + 1 - 4];
uint32_t p1 = ((18 * (0x05555555ul) + SPH_ROTL32(M32[2], ((18 - 16) % 16) + 1) + SPH_ROTL32(M32[5], ((18 - 13) % 16) + 1)) ^ H[(18 - 16 + 7) % 16]);
uint32_t p2 = (((18 + 1)*(0x05555555ul) + SPH_ROTL32(M32[3], (((18 + 1) - 16) % 16) + 1) + SPH_ROTL32(M32[6], (((18 + 1) - 13) % 16) + 1)) ^ H[((18 + 1) - 16 + 7) % 16]);
Q[18] = precalc + expand32_2(18, M32, H, Q) + p1;
Q[18 + 1] = precalc2 + expand32_2(18 + 1, M32, H, Q) + p2;
precalc = precalc - Q[18 - 16];
precalc2 = precalc2 - Q[18 + 1 - 16];
precalc = precalc + Q[20 - 4];
precalc2 = precalc2 + Q[20 + 1 - 4];
p1 = ((20 * (0x05555555ul) + SPH_ROTL32(M32[4], ((20 - 16) % 16) + 1) + SPH_ROTL32(M32[7], ((20 - 13) % 16) + 1) - (0x100 << 15)) ^ H[(20 - 16 + 7) % 16]);
p2 = (((20 + 1)*(0x05555555ul) + SPH_ROTL32(M32[5], (((20 + 1) - 16) % 16) + 1) + (0x80 << 9)) ^ H[((20 + 1) - 16 + 7) % 16]);
Q[20] = precalc + expand32_2(20, M32, H, Q) + p1;
Q[20 + 1] = precalc2 + expand32_2(20 + 1, M32, H, Q) + p2;
precalc = precalc - Q[20 - 16];
precalc2 = precalc2 - Q[20 + 1 - 16];
precalc = precalc + Q[22 - 4];
precalc2 = precalc2 + Q[22 + 1 - 4];
p1 = ((22 * (0x05555555ul) + SPH_ROTL32(M32[6], ((22 - 16) % 16) + 1) - SPH_ROTL32(M32[0], ((22 - 6) % 16) + 1)) ^ H[(22 - 16 + 7) % 16]);
p2 = (((22 + 1)*(0x05555555ul) + SPH_ROTL32(M32[7], (((22 + 1) - 16) % 16) + 1) - SPH_ROTL32(M32[1], (((22 + 1) - 6) % 16) + 1)) ^ H[((22 + 1) - 16 + 7) % 16]);
Q[22] = precalc + expand32_2(22, M32, H, Q) + p1;
Q[22 + 1] = precalc2 + expand32_2(22 + 1, M32, H, Q) + p2;
precalc = precalc - Q[22 - 16];
precalc2 = precalc2 - Q[22 + 1 - 16];
precalc = precalc + Q[24 - 4];
precalc2 = precalc2 + Q[24 + 1 - 4];
p1 = ((24 * (0x05555555ul) + (0x80 << 9) - SPH_ROTL32(M32[2], ((24 - 6) % 16) + 1)) ^ H[(24 - 16 + 7) % 16]);
p2 = (((24 + 1)*(0x05555555ul) - SPH_ROTL32(M32[3], (((24 + 1) - 6) % 16) + 1)) ^ H[((24 + 1) - 16 + 7) % 16]);
Q[24] = precalc + expand32_2(24, M32, H, Q) + p1;
Q[24 + 1] = precalc2 + expand32_2(24 + 1, M32, H, Q) + p2;
precalc = precalc - Q[24 - 16];
precalc2 = precalc2 - Q[24 + 1 - 16];
precalc = precalc + Q[26 - 4];
precalc2 = precalc2 + Q[26 + 1 - 4];
p1 = ((26 * (0x05555555ul) - SPH_ROTL32(M32[4], ((26 - 6) % 16) + 1)) ^ H[(26 - 16 + 7) % 16]);
p2 = (((26 + 1)*(0x05555555ul) + (0x100 << 15) - SPH_ROTL32(M32[5], (((26 + 1) - 6) % 16) + 1)) ^ H[((26 + 1) - 16 + 7) % 16]);
Q[26] = precalc + expand32_2(26, M32, H, Q) + p1;
Q[26 + 1] = precalc2 + expand32_2(26 + 1, M32, H, Q) + p2;
precalc = precalc - Q[26 - 16];
precalc2 = precalc2 - Q[26 + 1 - 16];
precalc = precalc + Q[28 - 4];
precalc2 = precalc2 + Q[28 + 1 - 4];
p1 = ((28 * (0x05555555ul) - SPH_ROTL32(M32[6], ((28 - 6) % 16) + 1)) ^ H[(28 - 16 + 7) % 16]);
p2 = (((28 + 1)*(0x05555555ul) + SPH_ROTL32(M32[0], (((28 + 1) - 13) % 16) + 1) - SPH_ROTL32(M32[7], (((28 + 1) - 6) % 16) + 1)) ^ H[((28 + 1) - 16 + 7) % 16]);
Q[28] = precalc + expand32_2(28, M32, H, Q) + p1;
Q[28 + 1] = precalc2 + expand32_2(28 + 1, M32, H, Q) + p2;
precalc = precalc - Q[28 - 16];
precalc2 = precalc2 - Q[28 + 1 - 16];
precalc = precalc + Q[30 - 4];
precalc2 = precalc2 + Q[30 + 1 - 4];
p1 = ((30 * (0x05555555ul) + (0x100 << 15) + SPH_ROTL32(M32[1], ((30 - 13) % 16) + 1) - (0x80 << 9)) ^ H[(30 - 16 + 7) % 16]);
p2 = (((30 + 1)*(0x05555555ul) + SPH_ROTL32(M32[2], (((30 + 1) - 13) % 16) + 1)) ^ H[((30 + 1) - 16 + 7) % 16]);
Q[30] = precalc + expand32_2(30, M32, H, Q) + p1;
Q[30 + 1] = precalc2 + expand32_2(30 + 1, M32, H, Q) + p2;
precalc = precalc - Q[30 - 16];
precalc2 = precalc2 - Q[30 + 1 - 16];
/* Blue Midnight Wish has two temporary cumulative variables that accumulate via XORing */
/* the 16 new variables produced in the Message Expansion part. */
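The unrolled Q[18]..Q[31] expansion above exploits the fact that consecutive expand32_2 windows overlap: stepping i by two adds Q[i - 4] to the running sum and drops Q[i - 16], so the code carries two accumulators (precalc for even i, precalc2 for odd i) instead of re-summing sixteen terms each time. The even chain, reduced to its skeleton:

// Skeleton of the even-index rolling sum (illustrative; the real code also
// adds the rs/ss-rotated terms and the message-dependent p1 each step).
__device__ void rolling_sum_sketch(uint32_t Q[32])
{
	uint32_t sum = Q[2] + Q[4] + Q[6] + Q[8] + Q[10] + Q[12]; // window for i = 18
	for (int i = 18; i < 32; i += 2) {
		sum += Q[i - 4];  // term entering the window
		Q[i] = sum;       // real code: sum + expand32_2(i, ...) + p1
		sum -= Q[i - 16]; // term leaving the window
	}
}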
@@ -145,17 +221,18 @@ void Compression256(uint32_t * M32)
M32[15] = SPH_ROTL32(M32[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]);
}
__forceinline__ __device__
void Compression256_2(uint32_t * M32)
__forceinline__ __device__ void Compression256_2(uint32_t M32[16])
{
uint32_t XL32, XH32, Q[32];
const uint32_t H[16] = {
0xaaaaaaa0, 0xaaaaaaa1, 0xaaaaaaa2, 0xaaaaaaa3,
0xaaaaaaa4, 0xaaaaaaa5, 0xaaaaaaa6, 0xaaaaaaa7,
0xaaaaaaa8, 0xaaaaaaa9, 0xaaaaaaaa, 0xaaaaaaab,
0xaaaaaaac, 0xaaaaaaad, 0xaaaaaaae, 0xaaaaaaaf
(0xaaaaaaa0), (0xaaaaaaa1), (0xaaaaaaa2),
(0xaaaaaaa3), (0xaaaaaaa4), (0xaaaaaaa5),
(0xaaaaaaa6), (0xaaaaaaa7), (0xaaaaaaa8),
(0xaaaaaaa9), (0xaaaaaaaa), (0xaaaaaaab),
(0xaaaaaaac), (0xaaaaaaad), (0xaaaaaaae),
(0xaaaaaaaf)
};
int i;
uint32_t XL32, XH32, Q[32];
Q[0] = (M32[5] ^ H[5]) - (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[13] ^ H[13]) + (M32[14] ^ H[14]);
Q[1] = (M32[6] ^ H[6]) - (M32[8] ^ H[8]) + (M32[11] ^ H[11]) + (M32[14] ^ H[14]) - (M32[15] ^ H[15]);
@@ -199,45 +276,69 @@ void Compression256_2(uint32_t * M32)
/* The following relation for these parameters should be satisfied: */
/* EXPAND_1_ROUNDS + EXPAND_2_ROUNDS = 16 */
#pragma unroll
for (int i = 16; i<18; i++)
Q[i] = expand32_1(i, M32, H, Q);
#pragma unroll
for (i = 0; i<2; i++)
Q[i + 16] = expand32_1(i + 16, M32, H, Q);
/* #pragma unroll
for (i = 2; i<16; i++)
Q[i + 16] = expand32_2(i + 16, M32, H, Q);
*/
uint32_t precalc = Q[18 - 16] + Q[18 - 14] + Q[18 - 12] + Q[18 - 10] + Q[18 - 8] + Q[18 - 6]; //+ Q[18 - 4]
uint32_t precalc2 = Q[19 - 16] + Q[19 - 14] + Q[19 - 12] + Q[19 - 10] + Q[19 - 8] + Q[19 - 6];//+ Q[19 - 4]
#pragma unroll
for (i = 2 + 16; i < 16 + 16; i += 2)
{
precalc = precalc + Q[i - 4];
precalc2 = precalc2 + Q[i + 1 - 4];
uint32_t p1 = ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16]);
uint32_t p2 = (((i + 1)*(0x05555555ul) + SPH_ROTL32(M32[((i + 1) - 16) % 16], (((i + 1) - 16) % 16) + 1) + SPH_ROTL32(M32[((i + 1) - 13) % 16], (((i + 1) - 13) % 16) + 1) - SPH_ROTL32(M32[((i + 1) - 6) % 16], (((i + 1) - 6) % 16) + 1)) ^ H[((i + 1) - 16 + 7) % 16]);
Q[i] = precalc + expand32_2(i, M32, H, Q) + p1;
Q[i + 1] = precalc2 + expand32_2(i + 1, M32, H, Q) + p2;
precalc = precalc - Q[i - 16];
precalc2 = precalc2 - Q[i + 1 - 16];
}
#pragma nounroll
for (int i = 18; i<32; i++)
Q[i] = expand32_2(i, M32, H, Q);
/* Blue Midnight Wish has two temporary cumulative variables that accumulate via XORing */
/* the 16 new variables produced in the Message Expansion part. */
XL32 = Q[16] ^ Q[17] ^ Q[18] ^ Q[19] ^ Q[20] ^ Q[21] ^ Q[22] ^ Q[23];
XH32 = XL32 ^ Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31];
XH32 = XL32^Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31];
M32[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ M32[2]) + (XL32 ^ Q[26] ^ Q[2]);
M32[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ M32[3]) + (XL32 ^ Q[27] ^ Q[3]);
M32[14] = SPH_ROTL32(M32[2], 15) + (XH32 ^ Q[30] ^ M32[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]);
M32[15] = SPH_ROTL32(M32[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]);
M32[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ M32[2]) + (XL32 ^ Q[26] ^ Q[2]);
M32[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ M32[3]) + (XL32 ^ Q[27] ^ Q[3]);
M32[14] = SPH_ROTL32(M32[2], 15) + (XH32 ^ Q[30] ^ M32[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]);
M32[15] = SPH_ROTL32(M32[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]);
}
#define TPB 512
__global__ __launch_bounds__(TPB, 2)
void bmw256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *const __restrict__ nonceVector)
void bmw256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash, uint32_t *const __restrict__ nonceVector, uint32_t Target)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t message[16] = { 0 };
LOHI(message[0], message[1], __ldg(&g_hash[thread]));
LOHI(message[2], message[3], __ldg(&g_hash[thread + 1 * threads]));
LOHI(message[4], message[5], __ldg(&g_hash[thread + 2 * threads]));
LOHI(message[6], message[7], __ldg(&g_hash[thread + 3 * threads]));
message[8]=0x80;
message[14]=0x100;
Compression256(message);
Compression256_2(message);
if (((uint64_t*)message)[7] <= pTarget[3])
uint2 message[8] = { 0 };
message[0] = __ldg(&g_hash[thread + 0 * threads]);
message[1] = __ldg(&g_hash[thread + 1 * threads]);
message[2] = __ldg(&g_hash[thread + 2 * threads]);
message[3] = __ldg(&g_hash[thread + 3 * threads]);
//LOHI(message[2], message[3], __ldg(&g_hash[thread + 1 * threads]));
//LOHI(message[4], message[5], __ldg(&g_hash[thread + 2 * threads]));
//LOHI(message[6], message[7], __ldg(&g_hash[thread + 3 * threads]));
message[4].x = 0x80;
message[7].x = 0x100;
Compression256((uint32_t*)message);
Compression256_2((uint32_t*)message);
if (message[7].y <= Target)
{
uint32_t tmp = atomicExch(&nonceVector[0], startNounce + thread);
if (tmp != 0)
@@ -247,7 +348,7 @@ void bmw256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *g_hash
}
__host__
void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces)
void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces, uint32_t Target)
{
const uint32_t threadsperblock = TPB;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
@@ -255,13 +356,12 @@ void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint
cudaMemset(d_GNonce[thr_id], 0, 2 * sizeof(uint32_t));
bmw256_gpu_hash_32 << <grid, block >> >(threads, startNounce, g_hash, d_GNonce[thr_id]);
bmw256_gpu_hash_32 << <grid, block >> >(threads, startNounce, (uint2*)g_hash, d_GNonce[thr_id], Target);
cudaMemcpy(d_gnounce[thr_id], d_GNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
resultnonces[0] = *(d_gnounce[thr_id]);
resultnonces[1] = *(d_gnounce[thr_id] + 1);
}
__host__
void bmw256_cpu_init(int thr_id, uint32_t threads)
{
@@ -276,8 +376,10 @@ void bmw256_cpu_free(int thr_id)
cudaFreeHost(d_gnounce[thr_id]);
}
/*
__host__
void bmw256_setTarget(const void *pTargetIn)
{
cudaMemcpyToSymbol(pTarget, pTargetIn, 32, 0, cudaMemcpyHostToDevice);
}
*/
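Passing the target's top word as a kernel argument replaces the old pTarget constant upload (now commented out above). A hypothetical call-site sketch; the variable names are assumptions, and the premise that ptarget is a uint32_t[8] with its most significant word at index 7 (the value compared against message[7].y) follows ccminer convention rather than anything shown in this commit:

// Hypothetical call site: ptarget[7] is the most significant 32-bit word of
// the 256-bit target, matching the `message[7].y <= Target` test above.
uint32_t found[2] = { 0, 0 };
bmw256_cpu_hash_32(thr_id, throughput, first_nonce, d_hash, found, ptarget[7]);
if (found[0] != 0) {
	// found[0] (and possibly found[1]) hold candidate nonces to recheck on CPU
}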

Algo256/cuda_cubehash256.cu (482 lines changed)

@@ -3,179 +3,247 @@
#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */
#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */
#ifdef __INTELLISENSE__
/* just for vstudio code colors */
#define __CUDA_ARCH__ 520
#endif
#if __CUDA_ARCH__ < 350
#define LROT(x,bits) ((x << bits) | (x >> (32 - bits)))
#else
#define LROT(x, bits) __funnelshift_l(x, x, bits)
#endif
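On sm_35+, LROT uses the funnel-shift unit: __funnelshift_l(lo, hi, n) returns the high 32 bits of the 64-bit value (hi:lo) shifted left by n, so with both operands equal it is a 32-bit rotate left, matching the generic shift/or fallback. A hypothetical device check of the identity:

// Hypothetical check that the two LROT definitions agree (n in 1..31).
__global__ void lrot_check(uint32_t x, unsigned int n, int *ok)
{
	uint32_t funnel = __funnelshift_l(x, x, n);   // sm_35+ path
	uint32_t plain  = (x << n) | (x >> (32 - n)); // generic path
	*ok = (funnel == plain);
}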
#if __CUDA_ARCH__ < 500
#define TPB 576
#else
#define TPB 1024
#endif
#define TPB35 576
#define TPB50 1024
#define ROTATEUPWARDS7(a) LROT(a,7)
#define ROTATEUPWARDS11(a) LROT(a,11)
//#define SWAP(a,b) { uint32_t u = a; a = b; b = u; }
#define SWAP(a,b) { a ^= b; b ^= a; a ^= b; }
__device__ __forceinline__ void rrounds(uint32_t x[2][2][2][2][2])
{
int r;
int j;
int k;
int l;
int m;
#pragma unroll 2
for (r = 0; r < CUBEHASH_ROUNDS; ++r) {
/* "add x_0jklm into x_1jklmn modulo 2^32" */
#pragma unroll 2
for (j = 0; j < 2; ++j)
#pragma unroll 2
for (k = 0; k < 2; ++k)
#pragma unroll 2
for (l = 0; l < 2; ++l)
#pragma unroll 2
for (m = 0; m < 2; ++m)
x[1][j][k][l][m] += x[0][j][k][l][m];
uint32_t x0[2][2][2][2];
uint32_t x1[2][2][2][2];
for (r = 0; r < CUBEHASH_ROUNDS; r += 2) {
/* "rotate x_0jklm upwards by 7 bits" */
#pragma unroll 2
for (j = 0; j < 2; ++j)
#pragma unroll 2
for (k = 0; k < 2; ++k)
#pragma unroll 2
for (l = 0; l < 2; ++l)
#pragma unroll 2
for (m = 0; m < 2; ++m)
x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]);
/* "swap x_00klm with x_01klm" */
#pragma unroll 2
for (k = 0; k < 2; ++k)
#pragma unroll 2
for (l = 0; l < 2; ++l)
#pragma unroll 2
for (m = 0; m < 2; ++m)
SWAP(x[0][0][k][l][m], x[0][1][k][l][m])
/* "xor x_1jklm into x_0jklm" */
#pragma unroll 2
for (j = 0; j < 2; ++j)
#pragma unroll 2
for (k = 0; k < 2; ++k)
#pragma unroll 2
for (l = 0; l < 2; ++l)
#pragma unroll 2
for (m = 0; m < 2; ++m)
x[0][j][k][l][m] ^= x[1][j][k][l][m];
/* "swap x_1jk0m with x_1jk1m" */
#pragma unroll 2
for (j = 0; j < 2; ++j)
#pragma unroll 2
for (k = 0; k < 2; ++k)
#pragma unroll 2
for (m = 0; m < 2; ++m)
SWAP(x[1][j][k][0][m], x[1][j][k][1][m])
/* "add x_0jklm into x_1jklm modulo 2^32" */
#pragma unroll 2
for (j = 0; j < 2; ++j)
#pragma unroll 2
for (k = 0; k < 2; ++k)
#pragma unroll 2
for (l = 0; l < 2; ++l)
#pragma unroll 2
for (m = 0; m < 2; ++m)
x[1][j][k][l][m] += x[0][j][k][l][m];
x0[0][0][0][0] = ROTATEUPWARDS7(x[0][0][0][0][0]);
x0[0][0][0][1] = ROTATEUPWARDS7(x[0][0][0][0][1]);
x0[0][0][1][0] = ROTATEUPWARDS7(x[0][0][0][1][0]);
x0[0][0][1][1] = ROTATEUPWARDS7(x[0][0][0][1][1]);
x0[0][1][0][0] = ROTATEUPWARDS7(x[0][0][1][0][0]);
x0[0][1][0][1] = ROTATEUPWARDS7(x[0][0][1][0][1]);
x0[0][1][1][0] = ROTATEUPWARDS7(x[0][0][1][1][0]);
x0[0][1][1][1] = ROTATEUPWARDS7(x[0][0][1][1][1]);
x0[1][0][0][0] = ROTATEUPWARDS7(x[0][1][0][0][0]);
x0[1][0][0][1] = ROTATEUPWARDS7(x[0][1][0][0][1]);
x0[1][0][1][0] = ROTATEUPWARDS7(x[0][1][0][1][0]);
x0[1][0][1][1] = ROTATEUPWARDS7(x[0][1][0][1][1]);
x0[1][1][0][0] = ROTATEUPWARDS7(x[0][1][1][0][0]);
x0[1][1][0][1] = ROTATEUPWARDS7(x[0][1][1][0][1]);
x0[1][1][1][0] = ROTATEUPWARDS7(x[0][1][1][1][0]);
x0[1][1][1][1] = ROTATEUPWARDS7(x[0][1][1][1][1]);
/* "add x_0jklm into x_1jklm modulo 2^32" */
x1[0][0][0][0] = x[1][0][0][0][0] + x[0][0][0][0][0];
x1[0][0][0][1] = x[1][0][0][0][1] + x[0][0][0][0][1];
x1[0][0][1][0] = x[1][0][0][1][0] + x[0][0][0][1][0];
x1[0][0][1][1] = x[1][0][0][1][1] + x[0][0][0][1][1];
x1[0][1][0][0] = x[1][0][1][0][0] + x[0][0][1][0][0];
x1[0][1][0][1] = x[1][0][1][0][1] + x[0][0][1][0][1];
x1[0][1][1][0] = x[1][0][1][1][0] + x[0][0][1][1][0];
x1[0][1][1][1] = x[1][0][1][1][1] + x[0][0][1][1][1];
x1[1][0][0][0] = x[1][1][0][0][0] + x[0][1][0][0][0];
x1[1][0][0][1] = x[1][1][0][0][1] + x[0][1][0][0][1];
x1[1][0][1][0] = x[1][1][0][1][0] + x[0][1][0][1][0];
x1[1][0][1][1] = x[1][1][0][1][1] + x[0][1][0][1][1];
x1[1][1][0][0] = x[1][1][1][0][0] + x[0][1][1][0][0];
x1[1][1][0][1] = x[1][1][1][0][1] + x[0][1][1][0][1];
x1[1][1][1][0] = x[1][1][1][1][0] + x[0][1][1][1][0];
x1[1][1][1][1] = x[1][1][1][1][1] + x[0][1][1][1][1];
/* "xor x_1~jklm into x_0jklm" */
x[0][0][0][0][0] = x0[0][0][0][0] ^ x1[1][0][0][0];
x[0][0][0][0][1] = x0[0][0][0][1] ^ x1[1][0][0][1];
x[0][0][0][1][0] = x0[0][0][1][0] ^ x1[1][0][1][0];
x[0][0][0][1][1] = x0[0][0][1][1] ^ x1[1][0][1][1];
x[0][0][1][0][0] = x0[0][1][0][0] ^ x1[1][1][0][0];
x[0][0][1][0][1] = x0[0][1][0][1] ^ x1[1][1][0][1];
x[0][0][1][1][0] = x0[0][1][1][0] ^ x1[1][1][1][0];
x[0][0][1][1][1] = x0[0][1][1][1] ^ x1[1][1][1][1];
x[0][1][0][0][0] = x0[1][0][0][0] ^ x1[0][0][0][0];
x[0][1][0][0][1] = x0[1][0][0][1] ^ x1[0][0][0][1];
x[0][1][0][1][0] = x0[1][0][1][0] ^ x1[0][0][1][0];
x[0][1][0][1][1] = x0[1][0][1][1] ^ x1[0][0][1][1];
x[0][1][1][0][0] = x0[1][1][0][0] ^ x1[0][1][0][0];
x[0][1][1][0][1] = x0[1][1][0][1] ^ x1[0][1][0][1];
x[0][1][1][1][0] = x0[1][1][1][0] ^ x1[0][1][1][0];
x[0][1][1][1][1] = x0[1][1][1][1] ^ x1[0][1][1][1];
/* "rotate x_0jklm upwards by 11 bits" */
#pragma unroll 2
for (j = 0; j < 2; ++j)
#pragma unroll 2
for (k = 0; k < 2; ++k)
#pragma unroll 2
for (l = 0; l < 2; ++l)
#pragma unroll 2
for (m = 0; m < 2; ++m)
x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]);
/* "swap x_0j0lm with x_0j1lm" */
#pragma unroll 2
for (j = 0; j < 2; ++j)
#pragma unroll 2
for (l = 0; l < 2; ++l)
#pragma unroll 2
for (m = 0; m < 2; ++m)
SWAP(x[0][j][0][l][m], x[0][j][1][l][m])
/* "xor x_1jklm into x_0jklm" */
#pragma unroll 2
for (j = 0; j < 2; ++j)
#pragma unroll 2
for (k = 0; k < 2; ++k)
#pragma unroll 2
for (l = 0; l < 2; ++l)
#pragma unroll 2
for (m = 0; m < 2; ++m)
x[0][j][k][l][m] ^= x[1][j][k][l][m];
/* "swap x_1jkl0 with x_1jkl1" */
#pragma unroll 2
for (j = 0; j < 2; ++j)
#pragma unroll 2
for (k = 0; k < 2; ++k)
#pragma unroll 2
for (l = 0; l < 2; ++l)
SWAP(x[1][j][k][l][0], x[1][j][k][l][1])
x0[0][0][0][0] = ROTATEUPWARDS11(x[0][0][0][0][0]);
x0[0][0][0][1] = ROTATEUPWARDS11(x[0][0][0][0][1]);
x0[0][0][1][0] = ROTATEUPWARDS11(x[0][0][0][1][0]);
x0[0][0][1][1] = ROTATEUPWARDS11(x[0][0][0][1][1]);
x0[0][1][0][0] = ROTATEUPWARDS11(x[0][0][1][0][0]);
x0[0][1][0][1] = ROTATEUPWARDS11(x[0][0][1][0][1]);
x0[0][1][1][0] = ROTATEUPWARDS11(x[0][0][1][1][0]);
x0[0][1][1][1] = ROTATEUPWARDS11(x[0][0][1][1][1]);
x0[1][0][0][0] = ROTATEUPWARDS11(x[0][1][0][0][0]);
x0[1][0][0][1] = ROTATEUPWARDS11(x[0][1][0][0][1]);
x0[1][0][1][0] = ROTATEUPWARDS11(x[0][1][0][1][0]);
x0[1][0][1][1] = ROTATEUPWARDS11(x[0][1][0][1][1]);
x0[1][1][0][0] = ROTATEUPWARDS11(x[0][1][1][0][0]);
x0[1][1][0][1] = ROTATEUPWARDS11(x[0][1][1][0][1]);
x0[1][1][1][0] = ROTATEUPWARDS11(x[0][1][1][1][0]);
x0[1][1][1][1] = ROTATEUPWARDS11(x[0][1][1][1][1]);
/* "add x_0jklm into x_1~jk~lm modulo 2^32" */
x[1][1][0][1][0] = x1[1][0][1][0] + x[0][0][0][0][0];
x[1][1][0][1][1] = x1[1][0][1][1] + x[0][0][0][0][1];
x[1][1][0][0][0] = x1[1][0][0][0] + x[0][0][0][1][0];
x[1][1][0][0][1] = x1[1][0][0][1] + x[0][0][0][1][1];
x[1][1][1][1][0] = x1[1][1][1][0] + x[0][0][1][0][0];
x[1][1][1][1][1] = x1[1][1][1][1] + x[0][0][1][0][1];
x[1][1][1][0][0] = x1[1][1][0][0] + x[0][0][1][1][0];
x[1][1][1][0][1] = x1[1][1][0][1] + x[0][0][1][1][1];
x[1][0][0][1][0] = x1[0][0][1][0] + x[0][1][0][0][0];
x[1][0][0][1][1] = x1[0][0][1][1] + x[0][1][0][0][1];
x[1][0][0][0][0] = x1[0][0][0][0] + x[0][1][0][1][0];
x[1][0][0][0][1] = x1[0][0][0][1] + x[0][1][0][1][1];
x[1][0][1][1][0] = x1[0][1][1][0] + x[0][1][1][0][0];
x[1][0][1][1][1] = x1[0][1][1][1] + x[0][1][1][0][1];
x[1][0][1][0][0] = x1[0][1][0][0] + x[0][1][1][1][0];
x[1][0][1][0][1] = x1[0][1][0][1] + x[0][1][1][1][1];
/* "xor x_1~j~k~lm into x_0jklm" */
x[0][0][0][0][0] = x0[0][0][0][0] ^ x[1][1][1][1][0];
x[0][0][0][0][1] = x0[0][0][0][1] ^ x[1][1][1][1][1];
x[0][0][0][1][0] = x0[0][0][1][0] ^ x[1][1][1][0][0];
x[0][0][0][1][1] = x0[0][0][1][1] ^ x[1][1][1][0][1];
x[0][0][1][0][0] = x0[0][1][0][0] ^ x[1][1][0][1][0];
x[0][0][1][0][1] = x0[0][1][0][1] ^ x[1][1][0][1][1];
x[0][0][1][1][0] = x0[0][1][1][0] ^ x[1][1][0][0][0];
x[0][0][1][1][1] = x0[0][1][1][1] ^ x[1][1][0][0][1];
x[0][1][0][0][0] = x0[1][0][0][0] ^ x[1][0][1][1][0];
x[0][1][0][0][1] = x0[1][0][0][1] ^ x[1][0][1][1][1];
x[0][1][0][1][0] = x0[1][0][1][0] ^ x[1][0][1][0][0];
x[0][1][0][1][1] = x0[1][0][1][1] ^ x[1][0][1][0][1];
x[0][1][1][0][0] = x0[1][1][0][0] ^ x[1][0][0][1][0];
x[0][1][1][0][1] = x0[1][1][0][1] ^ x[1][0][0][1][1];
x[0][1][1][1][0] = x0[1][1][1][0] ^ x[1][0][0][0][0];
x[0][1][1][1][1] = x0[1][1][1][1] ^ x[1][0][0][0][1];
}
}
__device__ __forceinline__ void block_tox(const uint32_t *in, uint32_t x[2][2][2][2][2])
{
x[0][0][0][0][0] ^= in[0];
x[0][0][0][0][1] ^= in[1];
x[0][0][0][1][0] ^= in[2];
x[0][0][0][1][1] ^= in[3];
x[0][0][1][0][0] ^= in[4];
x[0][0][1][0][1] ^= in[5];
x[0][0][1][1][0] ^= in[6];
x[0][0][1][1][1] ^= in[7];
}
__device__ __forceinline__ void hash_fromx(uint32_t *out, uint32_t x[2][2][2][2][2])
{
out[0] = x[0][0][0][0][0];
out[1] = x[0][0][0][0][1];
out[2] = x[0][0][0][1][0];
out[3] = x[0][0][0][1][1];
out[4] = x[0][0][1][0][0];
out[5] = x[0][0][1][0][1];
out[6] = x[0][0][1][1][0];
out[7] = x[0][0][1][1][1];
}
__device__ __forceinline__
void Update32(uint32_t x[2][2][2][2][2], const uint32_t *data)
{
/* "xor the block into the first b bytes of the state" */
/* "and then transform the state invertibly through r identical rounds" */
block_tox(data, x);
rrounds(x);
}
/* "rotate x_0jklm upwards by 7 bits" */
x0[0][0][0][0] = ROTATEUPWARDS7(x[0][0][0][0][0]);
x0[0][0][0][1] = ROTATEUPWARDS7(x[0][0][0][0][1]);
x0[0][0][1][0] = ROTATEUPWARDS7(x[0][0][0][1][0]);
x0[0][0][1][1] = ROTATEUPWARDS7(x[0][0][0][1][1]);
x0[0][1][0][0] = ROTATEUPWARDS7(x[0][0][1][0][0]);
x0[0][1][0][1] = ROTATEUPWARDS7(x[0][0][1][0][1]);
x0[0][1][1][0] = ROTATEUPWARDS7(x[0][0][1][1][0]);
x0[0][1][1][1] = ROTATEUPWARDS7(x[0][0][1][1][1]);
x0[1][0][0][0] = ROTATEUPWARDS7(x[0][1][0][0][0]);
x0[1][0][0][1] = ROTATEUPWARDS7(x[0][1][0][0][1]);
x0[1][0][1][0] = ROTATEUPWARDS7(x[0][1][0][1][0]);
x0[1][0][1][1] = ROTATEUPWARDS7(x[0][1][0][1][1]);
x0[1][1][0][0] = ROTATEUPWARDS7(x[0][1][1][0][0]);
x0[1][1][0][1] = ROTATEUPWARDS7(x[0][1][1][0][1]);
x0[1][1][1][0] = ROTATEUPWARDS7(x[0][1][1][1][0]);
x0[1][1][1][1] = ROTATEUPWARDS7(x[0][1][1][1][1]);
/* "add x_0jklm into x_1~j~k~l~m modulo 2^32" */
x1[1][1][1][1] = x[1][1][1][1][1] + x[0][0][0][0][0];
x1[1][1][1][0] = x[1][1][1][1][0] + x[0][0][0][0][1];
x1[1][1][0][1] = x[1][1][1][0][1] + x[0][0][0][1][0];
x1[1][1][0][0] = x[1][1][1][0][0] + x[0][0][0][1][1];
x1[1][0][1][1] = x[1][1][0][1][1] + x[0][0][1][0][0];
x1[1][0][1][0] = x[1][1][0][1][0] + x[0][0][1][0][1];
x1[1][0][0][1] = x[1][1][0][0][1] + x[0][0][1][1][0];
x1[1][0][0][0] = x[1][1][0][0][0] + x[0][0][1][1][1];
x1[0][1][1][1] = x[1][0][1][1][1] + x[0][1][0][0][0];
x1[0][1][1][0] = x[1][0][1][1][0] + x[0][1][0][0][1];
x1[0][1][0][1] = x[1][0][1][0][1] + x[0][1][0][1][0];
x1[0][1][0][0] = x[1][0][1][0][0] + x[0][1][0][1][1];
x1[0][0][1][1] = x[1][0][0][1][1] + x[0][1][1][0][0];
x1[0][0][1][0] = x[1][0][0][1][0] + x[0][1][1][0][1];
x1[0][0][0][1] = x[1][0][0][0][1] + x[0][1][1][1][0];
x1[0][0][0][0] = x[1][0][0][0][0] + x[0][1][1][1][1];
/* "xor x_1j~k~l~m into x_0jklm" */
x[0][0][0][0][0] = x0[0][0][0][0] ^ x1[0][1][1][1];
x[0][0][0][0][1] = x0[0][0][0][1] ^ x1[0][1][1][0];
x[0][0][0][1][0] = x0[0][0][1][0] ^ x1[0][1][0][1];
x[0][0][0][1][1] = x0[0][0][1][1] ^ x1[0][1][0][0];
x[0][0][1][0][0] = x0[0][1][0][0] ^ x1[0][0][1][1];
x[0][0][1][0][1] = x0[0][1][0][1] ^ x1[0][0][1][0];
x[0][0][1][1][0] = x0[0][1][1][0] ^ x1[0][0][0][1];
x[0][0][1][1][1] = x0[0][1][1][1] ^ x1[0][0][0][0];
x[0][1][0][0][0] = x0[1][0][0][0] ^ x1[1][1][1][1];
x[0][1][0][0][1] = x0[1][0][0][1] ^ x1[1][1][1][0];
x[0][1][0][1][0] = x0[1][0][1][0] ^ x1[1][1][0][1];
x[0][1][0][1][1] = x0[1][0][1][1] ^ x1[1][1][0][0];
x[0][1][1][0][0] = x0[1][1][0][0] ^ x1[1][0][1][1];
x[0][1][1][0][1] = x0[1][1][0][1] ^ x1[1][0][1][0];
x[0][1][1][1][0] = x0[1][1][1][0] ^ x1[1][0][0][1];
x[0][1][1][1][1] = x0[1][1][1][1] ^ x1[1][0][0][0];
__device__ __forceinline__
void Update32_const(uint32_t x[2][2][2][2][2])
{
x[0][0][0][0][0] ^= 0x80;
rrounds(x);
/* "rotate x_0jklm upwards by 11 bits" */
x0[0][0][0][0] = ROTATEUPWARDS11(x[0][0][0][0][0]);
x0[0][0][0][1] = ROTATEUPWARDS11(x[0][0][0][0][1]);
x0[0][0][1][0] = ROTATEUPWARDS11(x[0][0][0][1][0]);
x0[0][0][1][1] = ROTATEUPWARDS11(x[0][0][0][1][1]);
x0[0][1][0][0] = ROTATEUPWARDS11(x[0][0][1][0][0]);
x0[0][1][0][1] = ROTATEUPWARDS11(x[0][0][1][0][1]);
x0[0][1][1][0] = ROTATEUPWARDS11(x[0][0][1][1][0]);
x0[0][1][1][1] = ROTATEUPWARDS11(x[0][0][1][1][1]);
x0[1][0][0][0] = ROTATEUPWARDS11(x[0][1][0][0][0]);
x0[1][0][0][1] = ROTATEUPWARDS11(x[0][1][0][0][1]);
x0[1][0][1][0] = ROTATEUPWARDS11(x[0][1][0][1][0]);
x0[1][0][1][1] = ROTATEUPWARDS11(x[0][1][0][1][1]);
x0[1][1][0][0] = ROTATEUPWARDS11(x[0][1][1][0][0]);
x0[1][1][0][1] = ROTATEUPWARDS11(x[0][1][1][0][1]);
x0[1][1][1][0] = ROTATEUPWARDS11(x[0][1][1][1][0]);
x0[1][1][1][1] = ROTATEUPWARDS11(x[0][1][1][1][1]);
/* "add x_0jklm into x_1j~kl~m modulo 2^32" */
x[1][0][1][0][1] = x1[0][1][0][1] + x[0][0][0][0][0];
x[1][0][1][0][0] = x1[0][1][0][0] + x[0][0][0][0][1];
x[1][0][1][1][1] = x1[0][1][1][1] + x[0][0][0][1][0];
x[1][0][1][1][0] = x1[0][1][1][0] + x[0][0][0][1][1];
x[1][0][0][0][1] = x1[0][0][0][1] + x[0][0][1][0][0];
x[1][0][0][0][0] = x1[0][0][0][0] + x[0][0][1][0][1];
x[1][0][0][1][1] = x1[0][0][1][1] + x[0][0][1][1][0];
x[1][0][0][1][0] = x1[0][0][1][0] + x[0][0][1][1][1];
x[1][1][1][0][1] = x1[1][1][0][1] + x[0][1][0][0][0];
x[1][1][1][0][0] = x1[1][1][0][0] + x[0][1][0][0][1];
x[1][1][1][1][1] = x1[1][1][1][1] + x[0][1][0][1][0];
x[1][1][1][1][0] = x1[1][1][1][0] + x[0][1][0][1][1];
x[1][1][0][0][1] = x1[1][0][0][1] + x[0][1][1][0][0];
x[1][1][0][0][0] = x1[1][0][0][0] + x[0][1][1][0][1];
x[1][1][0][1][1] = x1[1][0][1][1] + x[0][1][1][1][0];
x[1][1][0][1][0] = x1[1][0][1][0] + x[0][1][1][1][1];
/* "xor x_1jkl~m into x_0jklm" */
x[0][0][0][0][0] = x0[0][0][0][0] ^ x[1][0][0][0][1];
x[0][0][0][0][1] = x0[0][0][0][1] ^ x[1][0][0][0][0];
x[0][0][0][1][0] = x0[0][0][1][0] ^ x[1][0][0][1][1];
x[0][0][0][1][1] = x0[0][0][1][1] ^ x[1][0][0][1][0];
x[0][0][1][0][0] = x0[0][1][0][0] ^ x[1][0][1][0][1];
x[0][0][1][0][1] = x0[0][1][0][1] ^ x[1][0][1][0][0];
x[0][0][1][1][0] = x0[0][1][1][0] ^ x[1][0][1][1][1];
x[0][0][1][1][1] = x0[0][1][1][1] ^ x[1][0][1][1][0];
x[0][1][0][0][0] = x0[1][0][0][0] ^ x[1][1][0][0][1];
x[0][1][0][0][1] = x0[1][0][0][1] ^ x[1][1][0][0][0];
x[0][1][0][1][0] = x0[1][0][1][0] ^ x[1][1][0][1][1];
x[0][1][0][1][1] = x0[1][0][1][1] ^ x[1][1][0][1][0];
x[0][1][1][0][0] = x0[1][1][0][0] ^ x[1][1][1][0][1];
x[0][1][1][0][1] = x0[1][1][0][1] ^ x[1][1][1][0][0];
x[0][1][1][1][0] = x0[1][1][1][0] ^ x[1][1][1][1][1];
x[0][1][1][1][1] = x0[1][1][1][1] ^ x[1][1][1][1][0];
}
}
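/* Hedged aside (not part of the commit): the ROTATEUPWARDS7/11 macros used by
 * the spec-quoted steps above are assumed to be plain 32-bit left rotations,
 * as in the CubeHash reference code. A minimal definition would be:
#define ROTATEUPWARDS7(a)  (((a) << 7)  | ((a) >> 25))
#define ROTATEUPWARDS11(a) (((a) << 11) | ((a) >> 21))
 */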
__device__ __forceinline__
@@ -185,27 +253,44 @@ void Final(uint32_t x[2][2][2][2][2], uint32_t *hashval)
x[1][1][1][1][1] ^= 1U;
/* "the state is then transformed invertibly through 10r identical rounds" */
#pragma unroll 2
for (int i = 0; i < 10; ++i) rrounds(x);
/* "output the first h/8 bytes of the state" */
hash_fromx(hashval, x);
hashval[0] = x[0][0][0][0][0];
hashval[1] = x[0][0][0][0][1];
hashval[2] = x[0][0][0][1][0];
hashval[3] = x[0][0][0][1][1];
hashval[4] = x[0][0][1][0][0];
hashval[5] = x[0][0][1][0][1];
hashval[6] = x[0][0][1][1][0];
hashval[7] = x[0][0][1][1][1];
}
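/* Hedged sketch (not part of the commit) of the per-hash flow the kernels
 * below implement, written as plain host C over the flat 32-word state;
 * cubehash_rrounds() stands in for the device rrounds() and is assumed to
 * apply the r identical CubeHash rounds:
static void cubehash256_outline(uint32_t x[32], const uint32_t in[8], uint32_t out[8])
{
	for (int i = 0; i < 8; i++) x[i] ^= in[i]; // xor the 32-byte block into the state
	cubehash_rrounds(x);                       // transform invertibly
	x[0] ^= 0x80;                              // padding block
	cubehash_rrounds(x);
	x[31] ^= 1;                                // finalization flag
	for (int i = 0; i < 10; i++)
		cubehash_rrounds(x);                   // "10r identical rounds"
	memcpy(out, x, 32);                        // first h/8 = 32 bytes
}
 */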
#if __CUDA_ARCH__ >= 500
__global__ __launch_bounds__(TPB, 1)
__global__ __launch_bounds__(TPB50, 1)
#else
__global__ __launch_bounds__(TPB35, 1)
#endif
void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
#if __CUDA_ARCH__ >= 500
uint2 Hash[4];
Hash[0] = __ldg(&g_hash[thread]);
Hash[1] = __ldg(&g_hash[thread + 1 * threads]);
Hash[2] = __ldg(&g_hash[thread + 2 * threads]);
Hash[3] = __ldg(&g_hash[thread + 3 * threads]);
#else
uint32_t Hash[8];
LOHI(Hash[0], Hash[1], __ldg(&((uint64_t*)g_hash)[thread]));
LOHI(Hash[2], Hash[3], __ldg(&((uint64_t*)g_hash)[thread + 1 * threads]));
LOHI(Hash[4], Hash[5], __ldg(&((uint64_t*)g_hash)[thread + 2 * threads]));
LOHI(Hash[6], Hash[7], __ldg(&((uint64_t*)g_hash)[thread + 3 * threads]));
#endif
uint32_t x[2][2][2][2][2] =
{
@@ -219,6 +304,7 @@ void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_ha
0x15815AEB, 0x4AB6AAD6, 0x9CDAF8AF, 0xD6032C0A
};
#if __CUDA_ARCH__ >= 500
x[0][0][0][0][0] ^= Hash[0].x;
x[0][0][0][0][1] ^= Hash[0].y;
x[0][0][0][1][0] ^= Hash[1].x;
@@ -227,48 +313,7 @@ void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_ha
x[0][0][1][0][1] ^= Hash[2].y;
x[0][0][1][1][0] ^= Hash[3].x;
x[0][0][1][1][1] ^= Hash[3].y;
rrounds(x);
x[0][0][0][0][0] ^= 0x80U;
rrounds(x);
Final(x, (uint32_t*) Hash);
g_hash[thread] = Hash[0];
g_hash[1 * threads + thread] = Hash[1];
g_hash[2 * threads + thread] = Hash[2];
g_hash[3 * threads + thread] = Hash[3];
}
}
#else
__global__ __launch_bounds__(TPB, 1)
void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *d_hash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t Hash[8];
uint64_t* g_hash = (uint64_t*) d_hash;
LOHI(Hash[0], Hash[1], __ldg(&g_hash[thread]));
LOHI(Hash[2], Hash[3], __ldg(&g_hash[thread + 1 * threads]));
LOHI(Hash[4], Hash[5], __ldg(&g_hash[thread + 2 * threads]));
LOHI(Hash[6], Hash[7], __ldg(&g_hash[thread + 3 * threads]));
uint32_t x[2][2][2][2][2] =
{
0xEA2BD4B4, 0xCCD6F29F, 0x63117E71, 0x35481EAE,
0x22512D5B, 0xE5D94E63, 0x7E624131, 0xF4CC12BE,
0xC2D0B696, 0x42AF2070, 0xD0720C35, 0x3361DA8C,
0x28CCECA4, 0x8EF8AD83, 0x4680AC00, 0x40E5FBAB,
0xD89041C3, 0x6107FBD5, 0x6C859D41, 0xF0B26679,
0x09392549, 0x5FA25603, 0x65C892FD, 0x93CB6285,
0x2AF2B5AE, 0x9E4B4E60, 0x774ABFDD, 0x85254725,
0x15815AEB, 0x4AB6AAD6, 0x9CDAF8AF, 0xD6032C0A
};
x[0][0][0][0][0] ^= Hash[0];
x[0][0][0][0][1] ^= Hash[1];
x[0][0][0][1][0] ^= Hash[2];
@@ -277,29 +322,48 @@ void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *d_ha
x[0][0][1][0][1] ^= Hash[5];
x[0][0][1][1][0] ^= Hash[6];
x[0][0][1][1][1] ^= Hash[7];
#endif
rrounds(x);
x[0][0][0][0][0] ^= 0x80U;
rrounds(x);
#if __CUDA_ARCH__ >= 500
Final(x, (uint32_t*)Hash);
g_hash[thread] = Hash[0];
g_hash[1 * threads + thread] = Hash[1];
g_hash[2 * threads + thread] = Hash[2];
g_hash[3 * threads + thread] = Hash[3];
#else
Final(x, Hash);
g_hash[thread] = ((uint64_t*)Hash)[0];
g_hash[1 * threads + thread] = ((uint64_t*)Hash)[1];
g_hash[2 * threads + thread] = ((uint64_t*)Hash)[2];
g_hash[3 * threads + thread] = ((uint64_t*)Hash)[3];
((uint64_t*)g_hash)[thread] = ((uint64_t*)Hash)[0];
((uint64_t*)g_hash)[1 * threads + thread] = ((uint64_t*)Hash)[1];
((uint64_t*)g_hash)[2 * threads + thread] = ((uint64_t*)Hash)[2];
((uint64_t*)g_hash)[3 * threads + thread] = ((uint64_t*)Hash)[3];
#endif
}
}
#endif
__host__
void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order)
{
uint32_t tpb = TPB;
uint32_t tpb = TPB35;
if (cuda_arch[thr_id] >= 500) tpb = TPB50;
dim3 grid((threads + tpb - 1) / tpb);
dim3 block(tpb);
cubehash256_gpu_hash_32 << <grid, block >> > (threads, startNounce, (uint2*)d_hash);
}
__host__
void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order, cudaStream_t stream)
{
uint32_t tpb = TPB35;
if (cuda_arch[thr_id] >= 500) tpb = TPB50;
dim3 grid((threads + tpb-1)/tpb);
dim3 grid((threads + tpb - 1) / tpb);
dim3 block(tpb);
cubehash256_gpu_hash_32 <<<grid, block>>> (threads, startNounce, (uint2*) d_hash);
cubehash256_gpu_hash_32 << <grid, block, 0, stream >> > (threads, startNounce, (uint2*)d_hash);
}
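/* Hedged usage sketch for the new stream-aware overload (caller names are
 * hypothetical; thr_id/throughput/pdata follow the conventions used elsewhere
 * in this tree):
cudaStream_t stream;
cudaStreamCreate(&stream);
cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order, stream);
cudaStreamSynchronize(stream); // or overlap with the next kernel in the chain
 */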

451
Algo256/cuda_skein256.cu

@@ -13,40 +13,296 @@ void Round512v35(uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p
}
__forceinline__ __device__
void Round_8_512v35(const uint2 *const __restrict__ ks, const uint2 *const __restrict__ ts,
uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, int R)
void Round_8_512v35_1(const uint2 ks[9], const uint2 ts[3],
uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7)
{
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39);
	Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56);
p0 += ks[(R+0) % 9];
p1 += ks[(R+1) % 9];
p2 += ks[(R+2) % 9];
p3 += ks[(R+3) % 9];
p4 += ks[(R+4) % 9];
p5 += ks[(R+5) % 9] + ts[(R+0) % 3];
p6 += ks[(R+6) % 9] + ts[(R+1) % 3];
p7 += ks[(R+7) % 9] + make_uint2(R, 0);
p0 += ks[1];
p1 += ks[2];
p2 += ks[3];
p3 += ks[4];
p4 += ks[5];
p5 += ks[6] + ts[1];
p6 += ks[7] + ts[2];
p7 += ks[8] + make_uint2(1, 0);
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43);
	Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22);
p0 += ks[2];
p1 += ks[3];
p2 += ks[4];
p3 += ks[5];
p4 += ks[6];
p5 += ks[7] + ts[2];
p6 += ks[8] + ts[0];
p7 += ks[0] + make_uint2(2, 0);
}
__forceinline__ __device__
void Round_8_512v35_3(const uint2 ks[9], const uint2 ts[3],
uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7)
{
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56);
p0 += ks[(R+1) % 9];
p1 += ks[(R+2) % 9];
p2 += ks[(R+3) % 9];
p3 += ks[(R+4) % 9];
p4 += ks[(R+5) % 9];
p5 += ks[(R+6) % 9] + ts[(R+1) % 3];
p6 += ks[(R+7) % 9] + ts[(R+2) % 3];
p7 += ks[(R+8) % 9] + make_uint2(R+1, 0);
p0 += ks[3];
p1 += ks[4];
p2 += ks[5];
p3 += ks[6];
p4 += ks[7];
p5 += ks[8] + ts[0];
p6 += ks[0] + ts[1];
p7 += ks[1] + make_uint2(3, 0);
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22);
p0 += ks[4];
p1 += ks[5];
p2 += ks[6];
p3 += ks[7];
p4 += ks[8];
p5 += ks[0] + ts[1];
p6 += ks[1] + ts[2];
p7 += ks[2] + make_uint2(4, 0);
}
__forceinline__ __device__
void Round_8_512v35_5(const uint2 ks[9], const uint2 ts[3],
uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7)
{
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56);
p0 += ks[5];
p1 += ks[6];
p2 += ks[7];
p3 += ks[8];
p4 += ks[0];
p5 += ks[1] + ts[2];
p6 += ks[2] + ts[0];
p7 += ks[3] + make_uint2(5, 0);
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22);
p0 += ks[6];
p1 += ks[7];
p2 += ks[8];
p3 += ks[0];
p4 += ks[1];
p5 += ks[2] + ts[0];
p6 += ks[3] + ts[1];
p7 += ks[4] + make_uint2(6, 0);
}
__forceinline__ __device__
void Round_8_512v35_7(const uint2 ks[9], const uint2 ts[3],
uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7)
{
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56);
p0 += ks[7];
p1 += ks[8];
p2 += ks[0];
p3 += ks[1];
p4 += ks[2];
p5 += ks[3] + ts[1];
p6 += ks[4] + ts[2];
p7 += ks[5] + make_uint2(7, 0);
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22);
p0 += ks[8];
p1 += ks[0];
p2 += ks[1];
p3 += ks[2];
p4 += ks[3];
p5 += ks[4] + ts[2];
p6 += ks[5] + ts[0];
p7 += ks[6] + make_uint2(8, 0);
}
__forceinline__ __device__
void Round_8_512v35_9(const uint2 ks[9], const uint2 ts[3],
uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7)
{
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56);
p0 += ks[0];
p1 += ks[1];
p2 += ks[2];
p3 += ks[3];
p4 += ks[4];
p5 += ks[5] + ts[0];
p6 += ks[6] + ts[1];
p7 += ks[7] + make_uint2(9, 0);
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22);
p0 += ks[1];
p1 += ks[2];
p2 += ks[3];
p3 += ks[4];
p4 += ks[5];
p5 += ks[6] + ts[1];
p6 += ks[7] + ts[2];
p7 += ks[8] + make_uint2(10, 0);
}
__forceinline__ __device__
void Round_8_512v35_final(const uint2 *const __restrict__ ks, const uint2 *const __restrict__ ts,
void Round_8_512v35_11(const uint2 ks[9], const uint2 ts[3],
uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7)
{
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56);
p0 += ks[2];
p1 += ks[3];
p2 += ks[4];
p3 += ks[5];
p4 += ks[6];
p5 += ks[7] + ts[2];
p6 += ks[8] + ts[0];
p7 += ks[0] + make_uint2(11, 0);
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22);
p0 += ks[3];
p1 += ks[4];
p2 += ks[5];
p3 += ks[6];
p4 += ks[7];
p5 += ks[8] + ts[0];
p6 += ks[0] + ts[1];
p7 += ks[1] + make_uint2(12, 0);
}
__forceinline__ __device__
void Round_8_512v35_13(const uint2 ks[9], const uint2 ts[3],
uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7)
{
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56);
p0 += ks[4];
p1 += ks[5];
p2 += ks[6];
p3 += ks[7];
p4 += ks[8];
p5 += ks[0] + ts[1];
p6 += ks[1] + ts[2];
p7 += ks[2] + make_uint2(13, 0);
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22);
p0 += ks[5];
p1 += ks[6];
p2 += ks[7];
p3 += ks[8];
p4 += ks[0];
p5 += ks[1] + ts[2];
p6 += ks[2] + ts[0];
p7 += ks[3] + make_uint2(14, 0);
}
__forceinline__ __device__
void Round_8_512v35_15(const uint2 ks[9], const uint2 ts[3],
uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7)
{
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56);
p0 += ks[6];
p1 += ks[7];
p2 += ks[8];
p3 += ks[0];
p4 += ks[1];
p5 += ks[2] + ts[0];
p6 += ks[3] + ts[1];
p7 += ks[4] + make_uint2(15, 0);
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22);
p0 += ks[7];
p1 += ks[8];
p2 += ks[0];
p3 += ks[1];
p4 += ks[2];
p5 += ks[3] + ts[1];
p6 += ks[4] + ts[2];
p7 += ks[5] + make_uint2(16, 0);
}
__forceinline__ __device__
void Round_8_512v35_17(const uint2 ks[9], const uint2 ts[3],
uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7)
{
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56);
p0 += ks[8];
p1 += ks[0];
p2 += ks[1];
p3 += ks[2];
p4 += ks[3];
p5 += ks[4] + ts[2];
p6 += ks[5] + ts[0];
p7 += ks[6] + make_uint2(17, 0);
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24);
Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17);
Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43);
Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22);
p0 += ks[0];
p1 += ks[1];
p2 += ks[2];
p3 += ks[3];
p4 += ks[4];
p5 += ks[5] + ts[0];
p6 += ks[6] + ts[1];
p7 += ks[7] + make_uint2(18, 0);
}
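/* Hedged note (not part of the commit): the _1 .. _17 helpers above simply
 * unroll, with constant indices, the generic Threefish-512 subkey injection
 * that the old Round_8_512v35(..., R) computed at run time:
p0 += ks[(R+0) % 9];
p1 += ks[(R+1) % 9];
p2 += ks[(R+2) % 9];
p3 += ks[(R+3) % 9];
p4 += ks[(R+4) % 9];
p5 += ks[(R+5) % 9] + ts[(R+0) % 3];
p6 += ks[(R+6) % 9] + ts[(R+1) % 3];
p7 += ks[(R+7) % 9] + make_uint2(R, 0); // subkey counter
 */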
__forceinline__ __device__
void Round_8_512v35_final(const uint2 ks[9], const uint2 ts[3],
uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7)
{
Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37);
@@ -74,96 +330,88 @@ void Round_8_512v35_final(const uint2 *const __restrict__ ks, const uint2 *const
p3 += ks[3];
}
__global__ __launch_bounds__(256,3)
void skein256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash)
__global__ __launch_bounds__(256, 4)
void skein256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
const uint2 skein_ks_parity = { 0xA9FC1A22, 0x1BD11BDA };
const uint2 h2[9] = {
{ 0x2FDB3E13, 0xCCD044A1 },
{ 0x1A79A9EB, 0xE8359030 },
{ 0x4F816E6F, 0x55AEA061 },
{ 0xAE9B94DB, 0x2A2767A4 },
{ 0x74DD7683, 0xEC06025E },
{ 0xC4746251, 0xE7A436CD },
{ 0x393AD185, 0xC36FBAF9 },
{ 0x33EDFC13, 0x3EEDBA18 },
{ 0xC73A4E2A, 0xB69D3CFC }
};
const uint2 t12[2][3] = {
{ { 0x20, 0 },
{ 0, 0xf0000000 },
{ 0x20, 0xf0000000 } },
{ { 0x08, 0 },
{ 0, 0xff000000 },
{ 0x08, 0xff000000 } }
};
if (thread < threads)
{
const uint2 skein_ks_parity = { 0xA9FC1A22, 0x1BD11BDA };
const uint2 t12[6] = {
{ 0x20, 0 },
{ 0, 0xf0000000 },
{ 0x20, 0xf0000000 },
{ 0x08, 0 },
{ 0, 0xff000000 },
{ 0x08, 0xff000000 }
};
uint2 h[9] = {
{ 0x2FDB3E13, 0xCCD044A1 },
{ 0x1A79A9EB, 0xE8359030 },
{ 0x4F816E6F, 0x55AEA061 },
{ 0xAE9B94DB, 0x2A2767A4 },
{ 0x74DD7683, 0xEC06025E },
{ 0xC4746251, 0xE7A436CD },
{ 0x393AD185, 0xC36FBAF9 },
{ 0x33EDFC13, 0x3EEDBA18 },
{ 0xC73A4E2A, 0xB69D3CFC }
};
uint2 dt0,dt1,dt2,dt3;
uint2 p0, p1, p2, p3, p4, p5, p6, p7;
LOHI(dt0.x,dt0.y,outputHash[thread]);
LOHI(dt1.x,dt1.y,outputHash[threads+thread]);
LOHI(dt2.x,dt2.y,outputHash[2*threads+thread]);
LOHI(dt3.x,dt3.y,outputHash[3*threads+thread]);
dt0 = __ldg(&outputHash[0 * threads + thread]);
dt1 = __ldg(&outputHash[1 * threads + thread]);
dt2 = __ldg(&outputHash[2 * threads + thread]);
dt3 = __ldg(&outputHash[3 * threads + thread]);
p0 = h[0] + dt0;
p1 = h[1] + dt1;
p2 = h[2] + dt2;
p3 = h[3] + dt3;
p4 = h[4];
p5 = h[5] + t12[0];
p6 = h[6] + t12[1];
p7 = h[7];
p0 = h2[0] + dt0;
p1 = h2[1] + dt1;
p2 = h2[2] + dt2;
p3 = h2[3] + dt3;
p4 = h2[4];
p5 = h2[5] + t12[0][0];
p6 = h2[6] + t12[0][1];
p7 = h2[7];
// forced unroll required
Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 1);
Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 3);
Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 5);
Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 7);
Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 9);
Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 11);
Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 13);
Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 15);
Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 17);
Round_8_512v35_1(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_3(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_5(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_7(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_9(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_11(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_13(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_15(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_17(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7);
p0 ^= dt0;
p1 ^= dt1;
p2 ^= dt2;
p3 ^= dt3;
h[0] = p0;
h[1] = p1;
h[2] = p2;
h[3] = p3;
h[4] = p4;
h[5] = p5;
h[6] = p6;
h[7] = p7;
h[8] = skein_ks_parity ^ h[0] ^ h[1] ^ h[2] ^ h[3] ^ h[4] ^ h[5] ^ h[6] ^ h[7];
		const uint2 h[9] = { p0, p1, p2, p3, p4, p5, p6, p7, skein_ks_parity ^ p0 ^ p1 ^ p2 ^ p3 ^ p4 ^ p5 ^ p6 ^ p7 }; // parity word from the feed-forward result
const uint2 *t = t12+3;
		p5 += t12[3]; // p5 already equals h[5]
		p6 += t12[4];
		p5 += t12[1][0]; // p5 already equals h[5]
		p6 += t12[1][1];
// forced unroll
Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 1);
Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 3);
Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 5);
Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 7);
Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 9);
Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 11);
Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 13);
Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 15);
Round_8_512v35_final(h, t, p0, p1, p2, p3, p4, p5, p6, p7);
outputHash[thread] = devectorize(p0);
outputHash[threads+thread] = devectorize(p1);
outputHash[2*threads+thread] = devectorize(p2);
outputHash[3*threads+thread] = devectorize(p3);
Round_8_512v35_1(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_3(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_5(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_7(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_9(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_11(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_13(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_15(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7);
Round_8_512v35_final(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7);
outputHash[0 * threads + thread] = p0;
outputHash[1 * threads + thread] = p1;
outputHash[2 * threads + thread] = p2;
outputHash[3 * threads + thread] = p3;
}
}
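/* Hedged outline (not part of the commit) of the two Threefish-512 blocks
 * above: the first compresses the 32-byte message (tweak t12[0]: position 32,
 * first+final message block), the p0..p3 ^= dt0..dt3 step is Skein's
 * feed-forward, and the second block is the output transform (tweak t12[1]:
 * position 8) over an implicit all-zero message. */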
@@ -304,10 +552,27 @@ void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, ui
// only 1kH/s perf change between kernels on a 960...
if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300)
skein256_gpu_hash_32<<<grid, block>>>(threads, startNounce, d_outputHash);
skein256_gpu_hash_32 << <grid, block >> >(threads, startNounce, (uint2*)d_outputHash);
else
skein256_gpu_hash_32_v30<<<grid, block>>>(threads, startNounce, d_outputHash);
skein256_gpu_hash_32_v30 << <grid, block >> >(threads, startNounce, d_outputHash);
MyStreamSynchronize(NULL, order, thr_id);
//MyStreamSynchronize(NULL, order, thr_id);
}
__host__
void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order, cudaStream_t stream)
{
const uint32_t threadsperblock = 256;
int dev_id = device_map[thr_id];
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
// only 1kH/s perf change between kernels on a 960...
if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300)
skein256_gpu_hash_32 << <grid, block, 0, stream >> >(threads, startNounce, (uint2*)d_outputHash);
else
skein256_gpu_hash_32_v30 << <grid, block,0, stream >> >(threads, startNounce, d_outputHash);
//MyStreamSynchronize(NULL, order, thr_id);
}

582
ccminer.cpp

@@ -83,6 +83,7 @@ bool opt_debug_threads = false;
bool opt_protocol = false;
bool opt_benchmark = false;
bool opt_showdiff = false;
bool opt_eco_mode = false;
// todo: limit use of these flags,
// prefer the pools[] attributes
@@ -91,6 +92,7 @@ bool have_longpoll = false;
bool want_stratum = true;
bool have_stratum = false;
bool allow_gbt = true;
bool allow_getwork = true;
bool allow_mininginfo = true;
bool check_dups = false;
bool check_stratum_jobs = false;
@@ -165,6 +167,8 @@ char *short_url = NULL;
struct stratum_ctx stratum = { 0 };
pthread_mutex_t stratum_sock_lock;
pthread_mutex_t stratum_work_lock;
static unsigned char pk_script[25] = { 0 };
static size_t pk_script_size = 0;
char *opt_cert;
char *opt_proxy;
@@ -185,6 +189,7 @@ pthread_mutex_t stats_lock;
double thr_hashrates[MAX_GPUS] = { 0 };
uint64_t global_hashrate = 0;
double stratum_diff = 0.0;
static char *lp_id;
double net_diff = 0;
uint64_t net_hashrate = 0;
uint64_t net_blocks = 0;
@@ -226,8 +231,8 @@ Options:\n\
jackpot Jackpot\n\
keccak Keccak-256 (Maxcoin)\n\
luffa Joincoin\n\
lyra2 LyraBar\n\
lyra2v2 VertCoin\n\
lyra2 Lyra2RE(Crypto)\n\
lyra2v2 Lyra2REv2(VertCoin)\n\
mjollnir Mjollnircoin\n\
myr-gr Myriad-Groestl\n\
neoscrypt FeatherCoin, Phoenix, UFO...\n\
@@ -256,6 +261,8 @@ Options:\n\
(matching 2nd gt640 in the PC)\n\
-i --intensity=N[,N] GPU intensity 8.0-25.0 (default: auto) \n\
Decimals are allowed for fine tuning \n\
--eco Use Eco mode\n\
Auto tuning for low energy (Lyra2REv2 only)\n\
--cuda-schedule Set device threads scheduling mode (default: auto)\n\
-f, --diff-factor Divide difficulty by this factor (default 1.0) \n\
-m, --diff-multiplier Multiply difficulty by this value (default 1.0) \n\
@@ -278,6 +285,8 @@ Options:\n\
long polling is unavailable, in seconds (default: 10)\n\
-n, --ndevs list cuda devices\n\
-N, --statsavg number of samples used to compute hashrate (default: 30)\n\
--coinbase-addr=ADDR payout address for solo mining\n\
--no-getwork disable getwork support\n\
--no-gbt disable getblocktemplate support (height check in solo)\n\
--no-longpoll disable X-Long-Polling support\n\
--no-stratum disable X-Stratum support\n\
@@ -329,6 +338,7 @@ struct option options[] = {
{ "background", 0, NULL, 'B' },
{ "benchmark", 0, NULL, 1005 },
{ "cert", 1, NULL, 1001 },
{ "coinbase-addr", 1, NULL, 1016 },
{ "config", 1, NULL, 'c' },
{ "cputest", 0, NULL, 1006 },
{ "cpu-affinity", 1, NULL, 1020 },
@@ -341,6 +351,7 @@ struct option options[] = {
{ "no-color", 0, NULL, 1002 },
{ "no-extranonce", 0, NULL, 1012 },
{ "no-gbt", 0, NULL, 1011 },
{ "no-getwork", 0, NULL, 1010 },
{ "no-longpoll", 0, NULL, 1003 },
{ "no-stratum", 0, NULL, 1007 },
{ "no-autotune", 0, NULL, 1004 }, // scrypt
@@ -394,6 +405,7 @@ struct option options[] = {
{ "diff-multiplier", 1, NULL, 'm' },
{ "diff-factor", 1, NULL, 'f' },
{ "diff", 1, NULL, 'f' }, // compat
{ "eco", 0, NULL, 1080 },
{ 0, 0, 0, 0 }
};
@@ -892,7 +904,65 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
if (check_dups)
hashlog_remember_submit(work, nonce);
} else {
}
else if (work->txs2)
{
char data_str[2 * sizeof(work->data) + 1];
char *req;
for (int i = 0; i < ARRAY_SIZE(work->data); i++)
be32enc(work->data + i, work->data[i]);
cbin2hex(data_str, (char *)work->data, 80);
if (work->workid) {
char *params;
val = json_object();
json_object_set_new(val, "workid", json_string(work->workid));
params = json_dumps(val, 0);
json_decref(val);
req = (char*)malloc(128 + 2 * 80 + strlen(work->txs2) + strlen(params));
sprintf(req,
"{\"method\": \"submitblock\", \"params\": [\"%s%s\", %s], \"id\":4}\r\n",
data_str, work->txs2, params);
free(params);
}
else {
req = (char*)malloc(128 + 2 * 80 + strlen(work->txs2));
sprintf(req,
"{\"method\": \"submitblock\", \"params\": [\"%s%s\"], \"id\":4}\r\n",
data_str, work->txs2);
}
val = json_rpc_call_pool(curl, pool, req, false, false, NULL);
free(req);
if (unlikely(!val)) {
applog(LOG_ERR, "submit_upstream_work json_rpc_call failed");
return false;
}
res = json_object_get(val, "result");
if (json_is_object(res)) {
char *res_str;
bool sumres = false;
void *iter = json_object_iter(res);
while (iter) {
if (json_is_null(json_object_iter_value(iter))) {
sumres = true;
break;
}
iter = json_object_iter_next(res, iter);
}
res_str = json_dumps(res, 0);
share_result(sumres, work->pooln, work->sharediff, res_str);
free(res_str);
}
else
share_result(json_is_null(res), work->pooln, work->sharediff, json_string_value(res));
json_decref(val);
}
else {
int data_size = 128;
int adata_sz = data_size / sizeof(uint32_t);
@@ -924,6 +994,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
/* issue JSON-RPC request */
val = json_rpc_call_pool(curl, pool, s, false, false, NULL);
free(str);
if (unlikely(!val)) {
applog(LOG_ERR, "submit_upstream_work json_rpc_call failed");
return false;
@@ -940,12 +1011,15 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
json_decref(val);
free(str);
}
return true;
}
#ifndef ORG
#define BLOCK_VERSION_CURRENT 7
#endif
/* simplified method to only get some extra infos in solo mode */
static bool gbt_work_decode(const json_t *val, struct work *work)
{
@@ -985,8 +1059,311 @@ static bool gbt_work_decode(const json_t *val, struct work *work)
return true;
}
#ifndef ORG
int varint_encode(unsigned char *p, uint64_t n)
{
int i;
if (n < 0xfd) {
p[0] = (uchar)n;
return 1;
}
if (n <= 0xffff) {
p[0] = 0xfd;
p[1] = n & 0xff;
p[2] = (uchar)(n >> 8);
return 3;
}
if (n <= 0xffffffff) {
p[0] = 0xfe;
for (i = 1; i < 5; i++) {
p[i] = n & 0xff;
n >>= 8;
}
return 5;
}
p[0] = 0xff;
for (i = 1; i < 9; i++) {
p[i] = n & 0xff;
n >>= 8;
}
return 9;
}
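/* Hedged example of the CompactSize encoding implemented above (used below to
 * prefix the transaction list): values below 0xfd occupy a single byte.
uchar vi[9];
int n = varint_encode(vi, 1 + tx_count); // coinbase + tx_count transactions
// tx_count == 3          -> n == 1, vi[0] == 0x04
// 1 + tx_count == 0x1234 -> n == 3, vi = { 0xfd, 0x34, 0x12 }
 */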
static bool gbt_work_decode_full(const json_t *val, struct work *work)
{
int i, n;
uint32_t version, curtime, bits;
uint32_t prevhash[8];
uint32_t target[8];
int cbtx_size;
uchar *cbtx = NULL;
int tx_count, tx_size;
uchar txc_vi[9];
uchar(*merkle_tree)[32] = NULL;
bool coinbase_append = false;
bool submit_coinbase = false;
bool version_force = false;
bool version_reduce = false;
json_t *tmp, *txa;
bool rc = false;
tmp = json_object_get(val, "mutable");
if (tmp && json_is_array(tmp)) {
n = (int)json_array_size(tmp);
for (i = 0; i < n; i++) {
const char *s = json_string_value(json_array_get(tmp, i));
if (!s)
continue;
if (!strcmp(s, "coinbase/append"))
coinbase_append = true;
else if (!strcmp(s, "submit/coinbase"))
submit_coinbase = true;
else if (!strcmp(s, "version/force"))
version_force = true;
else if (!strcmp(s, "version/reduce"))
version_reduce = true;
}
}
tmp = json_object_get(val, "height");
if (!tmp || !json_is_integer(tmp)) {
applog(LOG_ERR, "JSON invalid height");
goto out;
}
work->height = (int)json_integer_value(tmp);
applog(LOG_BLUE, "Current block is %d", work->height);
tmp = json_object_get(val, "version");
if (!tmp || !json_is_integer(tmp)) {
applog(LOG_ERR, "JSON invalid version");
goto out;
}
version = (uint32_t)json_integer_value(tmp);
if ((version & 0xffU) > BLOCK_VERSION_CURRENT) {
if (version_reduce) {
version = (version & ~0xffU) | BLOCK_VERSION_CURRENT;
}
else if (allow_gbt && allow_getwork && !version_force) {
applog(LOG_DEBUG, "Switching to getwork, gbt version %d", version);
allow_gbt = false;
goto out;
}
else if (!version_force) {
applog(LOG_ERR, "Unrecognized block version: %u", version);
goto out;
}
}
if (unlikely(!jobj_binary(val, "previousblockhash", prevhash, sizeof(prevhash)))) {
applog(LOG_ERR, "JSON invalid previousblockhash");
goto out;
}
tmp = json_object_get(val, "curtime");
if (!tmp || !json_is_integer(tmp)) {
applog(LOG_ERR, "JSON invalid curtime");
goto out;
}
curtime = (uint32_t)json_integer_value(tmp);
if (unlikely(!jobj_binary(val, "bits", &bits, sizeof(bits)))) {
applog(LOG_ERR, "JSON invalid bits");
goto out;
}
/* find count and size of transactions */
txa = json_object_get(val, "transactions");
if (!txa || !json_is_array(txa)) {
applog(LOG_ERR, "JSON invalid transactions");
goto out;
}
tx_count = (int)json_array_size(txa);
tx_size = 0;
for (i = 0; i < tx_count; i++) {
const json_t *tx = json_array_get(txa, i);
const char *tx_hex = json_string_value(json_object_get(tx, "data"));
if (!tx_hex) {
applog(LOG_ERR, "JSON invalid transactions");
goto out;
}
tx_size += (int)(strlen(tx_hex) / 2);
}
/* build coinbase transaction */
tmp = json_object_get(val, "coinbasetxn");
if (tmp) {
const char *cbtx_hex = json_string_value(json_object_get(tmp, "data"));
cbtx_size = cbtx_hex ? (int)strlen(cbtx_hex) / 2 : 0;
cbtx = (uchar*)malloc(cbtx_size + 100);
if (cbtx_size < 60 || !hex2bin(cbtx, cbtx_hex, cbtx_size)) {
applog(LOG_ERR, "JSON invalid coinbasetxn");
goto out;
}
}
else {
int64_t cbvalue;
if (!pk_script_size) {
if (allow_getwork) {
applog(LOG_INFO, "No payout address provided, switching to getwork");
allow_gbt = false;
}
else
applog(LOG_ERR, "No payout address provided");
goto out;
}
tmp = json_object_get(val, "coinbasevalue");
if (!tmp || !json_is_number(tmp)) {
applog(LOG_ERR, "JSON invalid coinbasevalue");
goto out;
}
cbvalue = (int64_t)(json_is_integer(tmp) ? json_integer_value(tmp) : json_number_value(tmp));
cbtx = (uchar*)malloc(256);
le32enc((uint32_t *)cbtx, 1); /* version */
cbtx[4] = 1; /* in-counter */
memset(cbtx + 5, 0x00, 32); /* prev txout hash */
le32enc((uint32_t *)(cbtx + 37), 0xffffffff); /* prev txout index */
cbtx_size = 43;
/* BIP 34: height in coinbase */
for (n = work->height; n; n >>= 8)
cbtx[cbtx_size++] = n & 0xff;
cbtx[42] = cbtx_size - 43;
cbtx[41] = cbtx_size - 42; /* scriptsig length */
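		/* e.g. height 367742 (0x05C97E) yields scriptsig bytes 03 7e c9 05:
		   one push-length byte, then the height little-endian (BIP 34) */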
le32enc((uint32_t *)(cbtx + cbtx_size), 0xffffffff); /* sequence */
cbtx_size += 4;
cbtx[cbtx_size++] = 1; /* out-counter */
le32enc((uint32_t *)(cbtx + cbtx_size), (uint32_t)cbvalue); /* value */
le32enc((uint32_t *)(cbtx + cbtx_size + 4), cbvalue >> 32);
cbtx_size += 8;
cbtx[cbtx_size++] = (uint8_t)pk_script_size; /* txout-script length */
memcpy(cbtx + cbtx_size, pk_script, pk_script_size);
cbtx_size += (int)pk_script_size;
le32enc((uint32_t *)(cbtx + cbtx_size), 0); /* lock time */
cbtx_size += 4;
coinbase_append = true;
}
if (coinbase_append) {
unsigned char xsig[100];
int xsig_len = 0;
tmp = json_object_get(val, "coinbaseaux");
if (tmp && json_is_object(tmp)) {
void *iter = json_object_iter(tmp);
while (iter) {
unsigned char buf[100];
const char *s = json_string_value(json_object_iter_value(iter));
n = s ? (int)(strlen(s) / 2) : 0;
if (!s || n > 100 || !hex2bin(buf, s, n)) {
applog(LOG_ERR, "JSON invalid coinbaseaux");
break;
}
if (cbtx[41] + xsig_len + n <= 100) {
memcpy(xsig + xsig_len, buf, n);
xsig_len += n;
}
iter = json_object_iter_next(tmp, iter);
}
}
if (xsig_len) {
unsigned char *ssig_end = cbtx + 42 + cbtx[41];
int push_len = cbtx[41] + xsig_len < 76 ? 1 :
cbtx[41] + 2 + xsig_len > 100 ? 0 : 2;
n = xsig_len + push_len;
memmove(ssig_end + n, ssig_end, cbtx_size - 42 - cbtx[41]);
cbtx[41] += n;
if (push_len == 2)
*(ssig_end++) = 0x4c; /* OP_PUSHDATA1 */
if (push_len)
*(ssig_end++) = xsig_len;
memcpy(ssig_end, xsig, xsig_len);
cbtx_size += n;
}
}
n = varint_encode(txc_vi, 1 + tx_count);
work->txs2 = (char*)malloc(2 * (n + cbtx_size + tx_size) + 1);
cbin2hex(work->txs2, (char *)txc_vi, n);
cbin2hex(work->txs2 + 2 * n, (char *)cbtx, cbtx_size);
/* generate merkle root */
merkle_tree = (uchar(*)[32]) calloc(((1 + tx_count + 1) & ~1), 32);
sha256d(merkle_tree[0], cbtx, cbtx_size);
for (i = 0; i < tx_count; i++) {
tmp = json_array_get(txa, i);
const char *tx_hex = json_string_value(json_object_get(tmp, "data"));
const int tx_size = tx_hex ? (int)(strlen(tx_hex) / 2) : 0;
unsigned char *tx = (uchar*)malloc(tx_size);
if (!tx_hex || !hex2bin(tx, tx_hex, tx_size)) {
applog(LOG_ERR, "JSON invalid transactions");
free(tx);
goto out;
}
sha256d(merkle_tree[1 + i], tx, tx_size);
if (!submit_coinbase)
strcat(work->txs2, tx_hex);
}
n = 1 + tx_count;
while (n > 1) {
if (n % 2) {
memcpy(merkle_tree[n], merkle_tree[n - 1], 32);
++n;
}
n /= 2;
for (i = 0; i < n; i++)
sha256d(merkle_tree[i], merkle_tree[2 * i], 64);
}
/* assemble block header */
work->data[0] = swab32(version);
for (i = 0; i < 8; i++)
work->data[8 - i] = le32dec(prevhash + i);
for (i = 0; i < 8; i++)
work->data[9 + i] = be32dec((uint32_t *)merkle_tree[0] + i);
work->data[17] = swab32(curtime);
work->data[18] = le32dec(&bits);
memset(work->data + 19, 0x00, 52);
work->data[20] = 0x80000000;
work->data[31] = 0x00000280;
if (unlikely(!jobj_binary(val, "target", target, sizeof(target)))) {
applog(LOG_ERR, "JSON invalid target");
goto out;
}
for (i = 0; i < ARRAY_SIZE(work->target); i++)
work->target[7 - i] = be32dec(target + i);
tmp = json_object_get(val, "workid");
if (tmp) {
if (!json_is_string(tmp)) {
applog(LOG_ERR, "JSON invalid workid");
goto out;
}
work->workid = strdup(json_string_value(tmp));
}
rc = true;
out:
/* Long polling */
tmp = json_object_get(val, "longpollid");
if (want_longpoll && json_is_string(tmp)) {
free(lp_id);
lp_id = strdup(json_string_value(tmp));
if (!have_longpoll) {
char *lp_uri;
tmp = json_object_get(val, "longpolluri");
lp_uri = json_is_string(tmp) ? strdup(json_string_value(tmp)) : rpc_url;
have_longpoll = true;
tq_push(thr_info[longpoll_thr_id].q, lp_uri);
}
}
free(merkle_tree);
free(cbtx);
return rc;
}
#endif
#define GBT_CAPABILITIES "[\"coinbasetxn\", \"coinbasevalue\", \"longpoll\", \"workid\"]"
static const char *gbt_req =
static const char *gbt_req_ =
"{\"method\": \"getblocktemplate\", \"params\": [{"
// "\"capabilities\": " GBT_CAPABILITIES ""
"}], \"id\":9}\r\n";
@@ -998,7 +1375,7 @@ static bool get_blocktemplate(CURL *curl, struct work *work)
return false;
int curl_err = 0;
json_t *val = json_rpc_call_pool(curl, pool, gbt_req, false, false, &curl_err);
json_t *val = json_rpc_call_pool(curl, pool, gbt_req_, false, false, &curl_err);
if (!val && curl_err == -1) {
// when getblocktemplate is not supported, disable it
@@ -1068,8 +1445,19 @@ static bool get_mininginfo(CURL *curl, struct work *work)
return true;
}
#ifdef ORG
static const char *rpc_req =
"{\"method\": \"getwork\", \"params\": [], \"id\":0}\r\n";
#else
static const char *getwork_req =
"{\"method\": \"getwork\", \"params\": [], \"id\":0}\r\n";
static const char *gbt_req =
"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
GBT_CAPABILITIES "}], \"id\":0}\r\n";
#endif
static const char *gbt_lp_req =
"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": "
GBT_CAPABILITIES ", \"longpollid\": \"%s\"}], \"id\":0}\r\n";
static bool get_upstream_work(CURL *curl, struct work *work)
{
@@ -1082,9 +1470,18 @@ static bool get_upstream_work(CURL *curl, struct work *work)
applog(LOG_DEBUG, "%s: want_longpoll=%d have_longpoll=%d",
__func__, want_longpoll, have_longpoll);
#ifndef ORG
int err;
start:
#endif
gettimeofday(&tv_start, NULL);
/* want_longpoll/have_longpoll required here to init/unlock the lp thread */
#ifdef ORG
val = json_rpc_call_pool(curl, pool, rpc_req, want_longpoll, have_longpoll, NULL);
#else
val = json_rpc_call_pool(curl, pool, allow_gbt ? gbt_req : getwork_req, want_longpoll, have_longpoll, &err);
#endif
gettimeofday(&tv_end, NULL);
if (have_stratum || unlikely(work->pooln != cur_pooln)) {
@@ -1093,10 +1490,39 @@ static bool get_upstream_work(CURL *curl, struct work *work)
return false;
}
#ifndef ORG
if (!allow_gbt && !allow_getwork) {
applog(LOG_ERR, "No usable protocol");
if (val)
json_decref(val);
return false;
}
if (allow_gbt && allow_getwork && !val && err == CURLE_OK) {
applog(LOG_NOTICE, "getblocktemplate failed, falling back to getwork");
allow_gbt = false;
goto start;
}
#endif
if (!val)
return false;
rc = work_decode(json_object_get(val, "result"), work);
#ifndef ORG
if (allow_gbt) {
rc = gbt_work_decode_full(json_object_get(val, "result"), work);
if (!allow_gbt) {
json_decref(val);
goto start;
}
}
else {
#endif
rc = work_decode(json_object_get(val, "result"), work);
#ifndef ORG
}
#endif
if (opt_protocol && rc) {
timeval_subtract(&diff, &tv_end, &tv_start);
@@ -1393,7 +1819,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
else
sha256d(merkle_root, merkle_root, 64);
}
/* Increment extranonce2 */
for (i = 0; i < (int)sctx->xnonce2_size && !++sctx->job.xnonce2[i]; i++);
@@ -1720,8 +2146,8 @@ static void *miner_thread(void *userdata)
#endif
memcpy(&work, &g_work, sizeof(struct work));
nonceptr[0] = (UINT32_MAX / opt_n_threads) * thr_id; // 0 if single thr
} else
nonceptr[0]++; //??
}
//else nonceptr[0]++; //??
if (opt_algo == ALGO_DECRED) {
// suprnova job_id check without data/target/height change...
@@ -2136,10 +2562,15 @@ static void *miner_thread(void *userdata)
}
}
if (rc > 0)
/* if (rc > 0)
work.scanned_to = work.nonces[0];
if (rc > 1)
work.scanned_to = max(work.nonces[0], work.nonces[1]);
*/
if (rc > 0)
work.scanned_to = start_nonce + hashes_done;
else {
work.scanned_to = max_nonce;
if (opt_debug && opt_benchmark) {
@@ -2209,6 +2640,7 @@ static void *miner_thread(void *userdata)
break;
}
}
nonceptr[0] = start_nonce + hashes_done;
}
out:
@@ -2278,6 +2710,7 @@ longpoll_retry:
while (!abort_flag) {
json_t *val = NULL, *soval;
char *req = NULL;
int err = 0;
if (opt_debug_threads)
@@ -2288,7 +2721,12 @@ longpoll_retry:
if (switchn != pool_switch_count)
goto need_reinit;
val = json_rpc_longpoll(curl, lp_url, pool, rpc_req, &err);
if (allow_gbt) {
req = (char*)malloc(strlen(gbt_lp_req) + strlen(lp_id) + 1);
sprintf(req, gbt_lp_req, lp_id);
}
val = json_rpc_longpoll(curl, lp_url, pool, req ? req : getwork_req, &err);
if (allow_gbt) free(req);
if (have_stratum || switchn != pool_switch_count) {
if (val)
json_decref(val);
@@ -2486,7 +2924,7 @@ wait_stratum_url:
}
pthread_mutex_unlock(&g_work_lock);
}
// check we are on the right pool
if (switchn != pool_switch_count) goto pool_switched;
@@ -2552,6 +2990,109 @@ static void show_usage_and_exit(int status)
}
proper_exit(status);
}
static const char b58digits[] = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz";
static bool b58dec(unsigned char *bin, size_t binsz, const char *b58)
{
size_t i, j;
uint64_t t;
uint32_t c;
uint32_t *outi;
size_t outisz = (binsz + 3) / 4;
int rem = binsz % 4;
uint32_t remmask = 0xffffffff << (8 * rem);
size_t b58sz = strlen(b58);
bool rc = false;
outi = (uint32_t *)calloc(outisz, sizeof(*outi));
for (i = 0; i < b58sz; ++i) {
for (c = 0; b58digits[c] != b58[i]; c++)
if (!b58digits[c])
goto out;
for (j = outisz; j--;) {
t = (uint64_t)outi[j] * 58 + c;
c = t >> 32;
outi[j] = t & 0xffffffff;
}
if (c || outi[0] & remmask)
goto out;
}
j = 0;
switch (rem) {
case 3:
*(bin++) = (outi[0] >> 16) & 0xff;
case 2:
*(bin++) = (outi[0] >> 8) & 0xff;
case 1:
*(bin++) = outi[0] & 0xff;
++j;
default:
break;
}
for (; j < outisz; ++j) {
be32enc((uint32_t *)bin, outi[j]);
bin += sizeof(uint32_t);
}
rc = true;
out:
free(outi);
return rc;
}
static int b58check(unsigned char *bin, size_t binsz, const char *b58)
{
unsigned char buf[32];
int i;
sha256d(buf, bin, (int)(binsz - 4));
if (memcmp(&bin[binsz - 4], buf, 4))
return -1;
/* Check number of zeros is correct AFTER verifying checksum
* (to avoid possibility of accessing the string beyond the end) */
for (i = 0; bin[i] == '\0' && b58[i] == '1'; ++i);
if (bin[i] == '\0' || b58[i] == '1')
return -3;
return bin[0];
}
size_t address_to_script(unsigned char *out, size_t outsz, const char *addr)
{
unsigned char addrbin[25];
int addrver;
size_t rv;
if (!b58dec(addrbin, sizeof(addrbin), addr))
return 0;
addrver = b58check(addrbin, sizeof(addrbin), addr);
if (addrver < 0)
return 0;
switch (addrver) {
case 5: /* Bitcoin script hash */
case 196: /* Testnet script hash */
if (outsz < (rv = 23))
return rv;
out[0] = 0xa9; /* OP_HASH160 */
out[1] = 0x14; /* push 20 bytes */
memcpy(&out[2], &addrbin[1], 20);
out[22] = 0x87; /* OP_EQUAL */
return rv;
default:
if (outsz < (rv = 25))
return rv;
out[0] = 0x76; /* OP_DUP */
out[1] = 0xa9; /* OP_HASH160 */
out[2] = 0x14; /* push 20 bytes */
memcpy(&out[3], &addrbin[1], 20);
out[23] = 0x88; /* OP_EQUALVERIFY */
out[24] = 0xac; /* OP_CHECKSIG */
return rv;
}
}
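/* Hedged usage sketch: parse_arg() below feeds --coinbase-addr through this
 * helper; for a mainnet P2PKH address the result is the classic 25-byte
 * OP_DUP OP_HASH160 <20 bytes> OP_EQUALVERIFY OP_CHECKSIG script:
unsigned char script[25];
size_t sz = address_to_script(script, sizeof(script), "1BitcoinEaterAddressDontSendf59kuE");
if (!sz) applog(LOG_ERR, "invalid payout address");
 */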
void parse_arg(int key, char *arg)
{
@@ -2611,6 +3152,9 @@ void parse_arg(int key, char *arg)
case 1030: /* --api-remote */
opt_api_remote = 1;
break;
case 1080:
opt_eco_mode = true;
break;
case 'B':
opt_background = true;
break;
@@ -2946,9 +3490,19 @@ void parse_arg(int key, char *arg)
case 1009:
opt_shares_limit = atoi(arg);
break;
case 1010:
allow_getwork = false;
break;
case 1011:
allow_gbt = false;
break;
case 1016: /* --coinbase-addr */
pk_script_size = address_to_script(pk_script, sizeof(pk_script), arg);
if (!pk_script_size) {
fprintf(stderr, "invalid address -- '%s'\n", arg);
show_usage_and_exit(1);
}
break;
case 1012:
opt_extranonce = false;
break;
@@ -3186,7 +3740,7 @@ static void parse_cmdline(int argc, char *argv[])
show_usage_and_exit(1);
}
if (opt_algo == ALGO_DECRED && opt_vote == 9999) {
if (opt_vote == 9999) {
opt_vote = 0; // default, don't vote
}
}

35
ccminer.vcxproj

@@ -41,10 +41,7 @@
<LinkIncremental>false</LinkIncremental>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings" Condition="'$(Platform)'=='Win32'">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 6.5.props" />
</ImportGroup>
<ImportGroup Label="ExtensionSettings" Condition="'$(Platform)'=='x64'">
<ImportGroup Label="ExtensionSettings">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.5.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
@@ -83,10 +80,10 @@
<CudaCompile>
<CInterleavedPTX>false</CInterleavedPTX>
<GenerateLineInfo>true</GenerateLineInfo>
<MaxRegCount>80</MaxRegCount>
<MaxRegCount>255</MaxRegCount>
<PtxAsOptionV>true</PtxAsOptionV>
<Keep>true</Keep>
<CodeGeneration>compute_50,sm_50</CodeGeneration>
<CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_35,sm_35;compute_30,sm_30;compute_20,sm_20</CodeGeneration>
<Include>$(NVTOOLSEXT_PATH)\include;..\..\..\Common\C99</Include>
</CudaCompile>
</ItemDefinitionGroup>
@@ -115,15 +112,16 @@
<CudaCompile>
<CInterleavedPTX>false</CInterleavedPTX>
<GenerateLineInfo>true</GenerateLineInfo>
<MaxRegCount>80</MaxRegCount>
<MaxRegCount>255</MaxRegCount>
<PtxAsOptionV>true</PtxAsOptionV>
<Keep>true</Keep>
<CodeGeneration>compute_50,sm_50</CodeGeneration>
<CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_35,sm_35;compute_30,sm_30;compute_20,sm_20</CodeGeneration>
<Include>$(NVTOOLSEXT_PATH)\include;..\..\..\Common\C99</Include>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
<CudaLink>
<PerformDeviceLink>false</PerformDeviceLink>
<Optimization>O3</Optimization>
</CudaLink>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
@@ -158,16 +156,16 @@
</Link>
<CudaCompile>
<CInterleavedPTX>false</CInterleavedPTX>
<MaxRegCount>80</MaxRegCount>
<MaxRegCount>255</MaxRegCount>
<PtxAsOptionV>true</PtxAsOptionV>
<Keep>true</Keep>
<CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_30,sm_30;compute_20,sm_21</CodeGeneration>
<CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_35,sm_35;compute_30,sm_30;compute_20,sm_20</CodeGeneration>
<AdditionalOptions>--ptxas-options="-O2" %(AdditionalOptions)</AdditionalOptions>
<Optimization>O2</Optimization>
<Optimization>O3</Optimization>
</CudaCompile>
<CudaLink>
<GPUDebugInfo>false</GPUDebugInfo>
<Optimization>O3</Optimization>
<Optimization>O2</Optimization>
</CudaLink>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
@@ -201,10 +199,10 @@
</Link>
<CudaCompile>
<CInterleavedPTX>false</CInterleavedPTX>
<MaxRegCount>80</MaxRegCount>
<MaxRegCount>255</MaxRegCount>
<PtxAsOptionV>true</PtxAsOptionV>
<Keep>true</Keep>
<CodeGeneration>compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30;compute_20,sm_21</CodeGeneration>
<CodeGeneration>compute_50,sm_50;compute_52,sm_52;compute_35,sm_35;compute_30,sm_30;compute_20,sm_20</CodeGeneration>
<Include>$(NVTOOLSEXT_PATH)\include;..\..\..\Common\C99</Include>
<Optimization>O3</Optimization>
<TargetMachinePlatform>64</TargetMachinePlatform>
@@ -250,6 +248,7 @@
<ClCompile Include="lyra2\Lyra2.c" />
<ClCompile Include="lyra2\Sponge.c" />
<ClInclude Include="lyra2\cuda_lyra2_sm2.cuh" />
<ClInclude Include="lyra2\cuda_lyra2_sm5.cuh" />
<ClInclude Include="neoscrypt\neoscrypt.h" />
<ClCompile Include="neoscrypt\neoscrypt.cpp" />
<ClCompile Include="neoscrypt\neoscrypt-cpu.c" />
@@ -347,7 +346,6 @@
<ClInclude Include="uint256.h" />
<ClInclude Include="lyra2\Lyra2.h" />
<ClInclude Include="lyra2\Sponge.h" />
<ClInclude Include="lyra2\cuda_lyra2v2_sm3.cuh" />
<ClInclude Include="quark\groestl_transf_quad.h" />
<ClInclude Include="quark\groestl_functions_quad.h" />
<ClInclude Include="quark\cuda_quark.h" />
@@ -527,10 +525,7 @@
<Text Include="README.txt" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" Condition="'$(Platform)'=='Win32'">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 6.5.targets" />
</ImportGroup>
<ImportGroup Label="ExtensionTargets" Condition="'$(Platform)'=='x64'">
<ImportGroup Label="ExtensionTargets">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.5.targets" />
</ImportGroup>
<!-- Copy the required dlls -->
@@ -540,4 +535,4 @@
<Target Name="AfterClean">
<Delete Files="@(FilesToCopy->'$(OutDir)%(Filename)%(Extension)')" TreatErrorsAsWarnings="true" />
</Target>
</Project>

8
ccminer.vcxproj.filters

@@ -437,9 +437,6 @@
<ClInclude Include="bignum.hpp">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="lyra2\cuda_lyra2v2_sm3.cuh">
<Filter>Source Files\CUDA\lyra2</Filter>
</ClInclude>
<ClInclude Include="lyra2\cuda_lyra2_sm2.cuh">
<Filter>Source Files\CUDA\lyra2</Filter>
</ClInclude>
@@ -455,6 +452,9 @@
<ClInclude Include="x11\cuda_x11_simd512_sm2.cuh">
<Filter>Source Files\CUDA\x11</Filter>
</ClInclude>
<ClInclude Include="lyra2\cuda_lyra2_sm5.cuh">
<Filter>Source Files\CUDA\lyra2</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<CudaCompile Include="cuda.cpp">
@@ -728,4 +728,4 @@
<Filter>Ressources</Filter>
</Text>
</ItemGroup>
</Project>

2
configure.ac

@@ -1,4 +1,4 @@
AC_INIT([ccminer], [1.7.6], [], [ccminer], [http://github.com/tpruvot/ccminer])
AC_INIT([ccminer], [1.7.6-r10], [], [ccminer], [http://github.com/tpruvot/ccminer])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

30
cuda_helper.h

@@ -96,7 +96,6 @@ __device__ __forceinline__ uint64_t REPLACE_LODWORD(const uint64_t &x, const uin
return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y);
}
// endian swap for 32-bit types
#ifdef __CUDA_ARCH__
__device__ __forceinline__ uint32_t cuda_swab32(uint32_t x)
{
@@ -471,6 +470,15 @@ static __host__ __device__ __forceinline__ uint64_t devectorize(uint2 v) {
#endif
}
static __device__ __forceinline__ uint2 eorswap32(uint2 u, uint2 v)
{
uint2 result;
result.y = u.x ^ v.x;
result.x = u.y ^ v.y;
return result;
}
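/* Hedged note (not part of the commit): eorswap32(u, v) == ROR2(u ^ v, 32);
 * swapping the two 32-bit halves performs the 32-bit rotate of BLAKE2b's G
 * for free, which the reworked Gfunc() in cuda_lyra2.cu relies on. */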
/**
* uint2 direct ops by c++ operator definitions
*/
@@ -561,11 +569,9 @@ uint2 ROR2(const uint2 a, const int offset)
return result;
}
__device__ __forceinline__
uint2 ROL2(const uint2 a, const int offset)
{
#if __CUDA_ARCH__ >= 350
__inline__ __device__ uint2 ROL2(const uint2 a, const int offset) {
uint2 result;
#if __CUDA_ARCH__ > 300
if (offset >= 32) {
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
@@ -574,14 +580,20 @@ uint2 ROL2(const uint2 a, const int offset)
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
}
return result;
}
#else
if (!offset)
result = a;
__inline__ __device__ uint2 ROL2(const uint2 v, const int n)
{
uint2 result;
if (!n)
result = v;
else
result = ROR2(a, 64 - offset);
#endif
result = ROR2(v, 64 - n);
return result;
}
#endif
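/* Hedged sanity note: both paths satisfy ROL2(v, n) == ROR2(v, 64 - n) for
 * 0 < n < 64, treating the uint2 as a 64-bit value with x = low word and
 * y = high word; only the non-funnel-shift fallback computes it that way
 * literally. */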
__device__ __forceinline__
uint2 SWAPUINT2(uint2 value)

627
lyra2/cuda_lyra2.cu

@@ -1,41 +1,211 @@
/**
 * Lyra2 (v1) cuda implementation based on djm34 work - SM 5/5.2
 * tpruvot@github 2015
 */
#include <stdio.h>
#include <memory.h>
#define TPB50 16
#define TPB52 8
#define TPB52 32
#include "cuda_lyra2_sm2.cuh"
#include "cuda_lyra2_sm5.cuh"
#ifdef __INTELLISENSE__
/* just for vstudio code colors */
#define __CUDA_ARCH__ 500
#define __CUDA_ARCH__ 520
#endif
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 500
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ > 500
#include "cuda_vector_uint2x4.h"
#include "cuda_lyra2_vectors.h"
#define memshift 3
#ifdef __INTELLISENSE__
/* just for vstudio code colors */
__device__ uint32_t __shfl(uint32_t a, uint32_t b, uint32_t c);
#endif
#define Nrow 8
#define Ncol 8
#define NcolMask 0x7
#define memshift 3
#define BUF_COUNT 0
__device__ uint2 *DMatrix;
__device__ __forceinline__ void LD4S(uint2 res[3], const int row, const int col, const int thread, const int threads)
{
#if BUF_COUNT != 8
extern __shared__ uint2 shared_mem[];
const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift;
#endif
#if BUF_COUNT != 0
const int d0 = (memshift *(Ncol * row + col) * threads + thread)*blockDim.x + threadIdx.x;
#endif
#if BUF_COUNT == 8
#pragma unroll
for (int j = 0; j < 3; j++)
res[j] = *(DMatrix + d0 + j * threads * blockDim.x);
#elif BUF_COUNT == 0
#pragma unroll
for (int j = 0; j < 3; j++)
res[j] = shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x];
#else
if (row < BUF_COUNT)
{
#pragma unroll
for (int j = 0; j < 3; j++)
res[j] = *(DMatrix + d0 + j * threads * blockDim.x);
}
else
{
#pragma unroll
for (int j = 0; j < 3; j++)
res[j] = shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x];
}
#endif
}
__device__ uint2x4* DMatrix;
__device__ __forceinline__ void ST4S(const int row, const int col, const uint2 data[3], const int thread, const int threads)
{
#if BUF_COUNT != 8
extern __shared__ uint2 shared_mem[];
const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift;
#endif
#if BUF_COUNT != 0
const int d0 = (memshift *(Ncol * row + col) * threads + thread)*blockDim.x + threadIdx.x;
#endif
#if BUF_COUNT == 8
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + d0 + j * threads * blockDim.x) = data[j];
#elif BUF_COUNT == 0
#pragma unroll
for (int j = 0; j < 3; j++)
shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data[j];
#else
if (row < BUF_COUNT)
{
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + d0 + j * threads * blockDim.x) = data[j];
}
else
{
#pragma unroll
for (int j = 0; j < 3; j++)
shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data[j];
}
#endif
}
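/* Hedged note (not part of the commit): BUF_COUNT picks how many of the Nrow
 * rows live in global DMatrix versus dynamic shared memory; with BUF_COUNT 0,
 * as set above, every row is shared and the launch is assumed to reserve
 *   Nrow * Ncol * memshift * blockDim.x * blockDim.y * sizeof(uint2)
 * bytes as the third <<<grid, block, sharedMem>>> parameter. */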
#if __CUDA_ARCH__ >= 300
__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
{
return __shfl(a, b, c);
}
__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
{
return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c));
}
__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c)
{
a1 = WarpShuffle(a1, b1, c);
a2 = WarpShuffle(a2, b2, c);
a3 = WarpShuffle(a3, b3, c);
}
#else
__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
{
extern __shared__ uint2 shared_mem[];
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
uint32_t *_ptr = (uint32_t*)shared_mem;
__threadfence_block();
uint32_t buf = _ptr[thread];
_ptr[thread] = a;
__threadfence_block();
uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))];
__threadfence_block();
_ptr[thread] = buf;
__threadfence_block();
return result;
}
__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
{
extern __shared__ uint2 shared_mem[];
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
__threadfence_block();
uint2 buf = shared_mem[thread];
shared_mem[thread] = a;
__threadfence_block();
uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))];
__threadfence_block();
shared_mem[thread] = buf;
__threadfence_block();
return result;
}
__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c)
{
extern __shared__ uint2 shared_mem[];
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
__threadfence_block();
uint2 buf = shared_mem[thread];
shared_mem[thread] = a1;
__threadfence_block();
a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))];
__threadfence_block();
shared_mem[thread] = a2;
__threadfence_block();
a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))];
__threadfence_block();
shared_mem[thread] = a3;
__threadfence_block();
a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))];
__threadfence_block();
shared_mem[thread] = buf;
__threadfence_block();
}
#endif
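/* Hedged note: on pre-SM30 parts __shfl does not exist, so the fallbacks above stage
   the exchange through one scratch uint2 slot per thread in the dynamic shared-memory
   area. The __threadfence_block() calls order each store/load pair within the group,
   and saving/restoring buf keeps the caller's shared-memory contents intact, since the
   same extern array also backs LD4S/ST4S. */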
static __device__ __forceinline__
void Gfunc(uint2 &a, uint2 &b, uint2 &c, uint2 &d)
{
a += b; d = eorswap32(a, d);
c += d; b ^= c; b = ROR24(b);
a += b; d ^= a; d = ROR16(d);
c += d; b ^= c; b = ROR2(b, 63);
}
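/* Hedged note: eorswap32(a, d) folds the d ^= a XOR and the 32-bit halfword swap of
   the old SWAPUINT2 path into one step, and ROR24/ROR16 are byte-granular rotations
   that can each be issued as a single __byte_perm (PRMT), which is presumably why this
   revision prefers them over the generic ROR2 shifts. */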
__device__ __forceinline__ void round_lyra(uint2 s[4])
{
Gfunc(s[0], s[1], s[2], s[3]);
WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3, 4);
Gfunc(s[0], s[1], s[2], s[3]);
WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1, 4);
}
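/* Hedged note: here one sponge state is spread across 4 consecutive lanes, each lane
   holding one column of four uint2 words. The first Gfunc works on the columns in
   place; the WarpShuffle3 rotations then realign s[1]..s[3] across lanes so the second
   Gfunc operates on the diagonals, and the reverse rotation restores the column layout. */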
static __device__ __forceinline__
void round_lyra(uint2x4* s)
{
@@ -50,21 +220,24 @@ void round_lyra(uint2x4* s)
}
static __device__ __forceinline__
void reduceDuplex(uint2 state[4], uint32_t thread, const uint32_t threads)
{
uint2 state1[3];
#if __CUDA_ARCH__ > 500
#pragma unroll
#endif
for (int i = 0; i < Nrow; i++)
{
ST4S(0, Ncol - i - 1, state, thread, threads);
round_lyra(state);
}
#pragma unroll 4
for (int i = 0; i < Nrow; i++)
{
LD4S(state1, 0, i, thread, threads);
for (int j = 0; j < 3; j++)
state[j] ^= state1[j];
@@ -72,208 +245,342 @@ void reduceDuplex(uint2x4 state[4], uint32_t thread)
round_lyra(state);
for (int j = 0; j < 3; j++)
state1[j] ^= state[j];
ST4S(1, Ncol - i - 1, state1, thread, threads);
}
}
static __device__ __forceinline__
void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], uint32_t thread, const uint32_t threads)
{
uint2 state1[3], state2[3];
#pragma unroll 1
for (int i = 0; i < Nrow; i++)
{
LD4S(state1, rowIn, i, thread, threads);
LD4S(state2, rowInOut, i, thread, threads);
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra(state);
for (int j = 0; j < 3; j++)
state1[j] ^= state[j];
ST4S(rowOut, Ncol - i - 1, state1, thread, threads);
// fetch data from the previous thread (while sending data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
}
else
{
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
ST4S(rowInOut, i, state2, thread, threads);
}
}
static __device__ __forceinline__
void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads)
{
for (int i = 0; i < Nrow; i++)
{
uint2 state1[3], state2[3];
LD4S(state1, rowIn, i, thread, threads);
LD4S(state2, rowInOut, i, thread, threads);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra(state);
// fetch data from the previous thread (while sending data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
}
else
{
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
ST4S(rowInOut, i, state2, thread, threads);
LD4S(state1, rowOut, i, thread, threads);
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] ^= state[j];
ST4S(rowOut, i, state1, thread, threads);
}
}
static __device__ __forceinline__
void reduceDuplexRowt_8(const int rowInOut, uint2* state, const uint32_t thread, const uint32_t threads)
{
uint2 state1[3], state2[3], last[3];
LD4S(state1, 2, 0, thread, threads);
LD4S(last, rowInOut, 0, thread, threads);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + last[j];
round_lyra(state);
// fetch data from the previous thread (while sending data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
last[0] ^= Data2;
last[1] ^= Data0;
last[2] ^= Data1;
}
else
{
last[0] ^= Data0;
last[1] ^= Data1;
last[2] ^= Data2;
}
if (rowInOut == 5)
{
#pragma unroll
for (int j = 0; j < 3; j++)
last[j] ^= state[j];
}
for (int i = 1; i < Nrow; i++)
{
LD4S(state1, 2, i, thread, threads);
LD4S(state2, rowInOut, i, thread, threads);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra(state);
}
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= last[j];
}
__constant__ uint2x4 blake2b_IV[2] = {
0xf3bcc908lu, 0x6a09e667lu,
0x84caa73blu, 0xbb67ae85lu,
0xfe94f82blu, 0x3c6ef372lu,
0x5f1d36f1lu, 0xa54ff53alu,
0xade682d1lu, 0x510e527flu,
0x2b3e6c1flu, 0x9b05688clu,
0xfb41bd6blu, 0x1f83d9ablu,
0x137e2179lu, 0x5be0cd19lu
};
__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint2x4 state[4];
state[0].x = state[1].x = __ldg(&g_hash[thread + threads * 0]);
state[0].y = state[1].y = __ldg(&g_hash[thread + threads * 1]);
state[0].z = state[1].z = __ldg(&g_hash[thread + threads * 2]);
state[0].w = state[1].w = __ldg(&g_hash[thread + threads * 3]);
state[2] = blake2b_IV[0];
state[3] = blake2b_IV[1];
for (int i = 0; i<24; i++)
round_lyra(state); //because 12 is not enough
((uint2x4*)DMatrix)[threads * 0 + thread] = state[0];
((uint2x4*)DMatrix)[threads * 1 + thread] = state[1];
((uint2x4*)DMatrix)[threads * 2 + thread] = state[2];
((uint2x4*)DMatrix)[threads * 3 + thread] = state[3];
}
}
#if __CUDA_ARCH__ < 300
__global__ __launch_bounds__(TPB20, 1)
#elif __CUDA_ARCH__ < 500
__global__ __launch_bounds__(TPB30, 1)
#elif __CUDA_ARCH__ == 500
__global__ __launch_bounds__(TPB50, 1)
#else
__global__ __launch_bounds__(TPB52, 1)
#endif
void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
{
const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y;
if (thread < threads)
{
uint2 state[4];
state[0] = __ldg(&DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x]);
state[1] = __ldg(&DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x]);
state[2] = __ldg(&DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x]);
state[3] = __ldg(&DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x]);
reduceDuplex(state, thread, threads);
reduceDuplexRowSetup(1, 0, 2, state, thread, threads);
reduceDuplexRowSetup(2, 1, 3, state, thread, threads);
reduceDuplexRowSetup(3, 0, 4, state, thread, threads);
reduceDuplexRowSetup(4, 3, 5, state, thread, threads);
reduceDuplexRowSetup(5, 2, 6, state, thread, threads);
reduceDuplexRowSetup(6, 1, 7, state, thread, threads);
uint32_t rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowt(7, rowa, 0, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowt(0, rowa, 3, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowt(3, rowa, 6, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowt(6, rowa, 1, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowt(1, rowa, 4, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowt(4, rowa, 7, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowt(7, rowa, 2, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowt_8(rowa, state, thread, threads);
DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x] = state[0];
DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x] = state[1];
DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x] = state[2];
DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x] = state[3];
}
}
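/* Hedged note: WarpShuffle(state[0].x, 0, 4) & 7 broadcasts lane 0's low word of
   state[0] to all four lanes of the group, so every lane of one sponge agrees on the
   same data-dependent row index rowa in 0..7 before each reduceDuplexRowt pass. */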
__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
{
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
uint28 state[4];
if (thread < threads)
{
state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]);
state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]);
state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]);
state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]);
for (int i = 0; i < 12; i++)
round_lyra(state);
g_hash[thread + threads * 0] = state[0].x;
g_hash[thread + threads * 1] = state[0].y;
g_hash[thread + threads * 2] = state[0].z;
g_hash[thread + threads * 3] = state[0].w;
} //thread
}
#else
/* for unsupported SM arch */
__device__ void* DMatrix;
__global__ void lyra2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {}
__global__ void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
#endif
__host__
void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix)
{
cuda_get_arch(thr_id);
int dev_id = device_map[thr_id % MAX_GPUS];
// just assign the device pointer allocated in main loop
cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice);
}
__host__
void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, bool gtx750ti)
{
int dev_id = device_map[thr_id % MAX_GPUS];
uint32_t tpb = TPB52;
if (cuda_arch[dev_id] >= 520) tpb = TPB52;
else if (cuda_arch[dev_id] >= 500) tpb = TPB50;
else if (cuda_arch[dev_id] >= 200) tpb = TPB20;
dim3 grid1((threads * 4 + tpb - 1) / tpb);
dim3 block1(4, tpb >> 2);
dim3 grid2((threads + 64 - 1) / 64);
dim3 block2(64);
dim3 grid3((threads + tpb - 1) / tpb);
dim3 block3(tpb);
size_t shared_mem = 0;
//if (cuda_arch[dev_id] < 500) cudaFuncSetCacheConfig(lyra2_gpu_hash_32_2, cudaFuncCachePreferShared);
if (cuda_arch[dev_id] >= 520)
{
lyra2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash);
lyra2_gpu_hash_32_2 <<< grid1, block1, 24 * (8 - 0) * sizeof(uint2) * tpb >>> (threads, startNounce, d_hash);
lyra2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash);
}
else if (cuda_arch[dev_id] >= 500)
{
if (gtx750ti)
// reserve 8192 bytes of shared memory to tune occupancy to 8 warps
shared_mem = 8192;
else
// reserve 6144 bytes of shared memory to tune occupancy to 10 warps
shared_mem = 6144;
lyra2_gpu_hash_32_1_sm5 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash);
lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, startNounce, (uint2*)d_hash);
lyra2_gpu_hash_32_3_sm5 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash);
}
else
lyra2_gpu_hash_32_sm2 <<< grid3, block3 >>> (threads, startNounce, d_hash);
}
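/* Hedged summary of the dispatch above: arch >= 520 runs the in-file kernels with
   1536 * tpb bytes of dynamic shared memory (all 8 rows shared), arch 500 runs the
   _sm5 variants with a fixed 8192- or 6144-byte reservation chosen purely to cap
   occupancy at 8 or 10 warps, and anything older falls back to the monolithic
   lyra2_gpu_hash_32_sm2 kernel. */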

7
lyra2/cuda_lyra2_sm2.cuh

@@ -3,15 +3,16 @@
#ifdef __INTELLISENSE__
/* just for vstudio code colors */
#undef __CUDA_ARCH__
#define __CUDA_ARCH__ 500
#endif
#include "cuda_helper.h"
#define TPB30 160
#define TPB20 160
#if (__CUDA_ARCH__ >= 200 && __CUDA_ARCH__ <= 350) || !defined(__CUDA_ARCH__)
__constant__ static uint2 blake2b_IV_sm2[8] = {
{ 0xf3bcc908, 0x6a09e667 },
{ 0x84caa73b, 0xbb67ae85 },
{ 0xfe94f82b, 0x3c6ef372 },
@@ -149,7 +150,7 @@ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_h
#pragma unroll
for (int i = 0; i<8; i++) {
state[i + 8] = blake2b_IV_sm2[i];
}
// blake2blyra x2

701
lyra2/cuda_lyra2_sm5.cuh

@@ -0,0 +1,701 @@
#include <memory.h>
#ifdef __INTELLISENSE__
/* just for vstudio code colors */
#undef __CUDA_ARCH__
#define __CUDA_ARCH__ 500
#endif
#include "cuda_helper.h"
#define TPB50 32
#if __CUDA_ARCH__ == 500
#include "cuda_lyra2_vectors.h"
#define Nrow 8
#define Ncol 8
#define memshift 3
__device__ uint2 *DMatrix;
__device__ __forceinline__ uint2 LD4S(const int index)
{
extern __shared__ uint2 shared_mem[];
return shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x];
}
__device__ __forceinline__ void ST4S(const int index, const uint2 data)
{
extern __shared__ uint2 shared_mem[];
shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data;
}
#if __CUDA_ARCH__ >= 300
__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
{
return __shfl(a, b, c);
}
__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
{
return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c));
}
__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c)
{
a1 = WarpShuffle(a1, b1, c);
a2 = WarpShuffle(a2, b2, c);
a3 = WarpShuffle(a3, b3, c);
}
#else
__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
{
extern __shared__ uint2 shared_mem[];
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
uint32_t *_ptr = (uint32_t*)shared_mem;
__threadfence_block();
uint32_t buf = _ptr[thread];
_ptr[thread] = a;
__threadfence_block();
uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))];
__threadfence_block();
_ptr[thread] = buf;
__threadfence_block();
return result;
}
__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
{
extern __shared__ uint2 shared_mem[];
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
__threadfence_block();
uint2 buf = shared_mem[thread];
shared_mem[thread] = a;
__threadfence_block();
uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))];
__threadfence_block();
shared_mem[thread] = buf;
__threadfence_block();
return result;
}
__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c)
{
extern __shared__ uint2 shared_mem[];
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
__threadfence_block();
uint2 buf = shared_mem[thread];
shared_mem[thread] = a1;
__threadfence_block();
a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))];
__threadfence_block();
shared_mem[thread] = a2;
__threadfence_block();
a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))];
__threadfence_block();
shared_mem[thread] = a3;
__threadfence_block();
a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))];
__threadfence_block();
shared_mem[thread] = buf;
__threadfence_block();
}
#endif
static __device__ __forceinline__
void Gfunc(uint2 &a, uint2 &b, uint2 &c, uint2 &d)
{
a += b; d ^= a; d = SWAPUINT2(d);
c += d; b ^= c; b = ROR2(b, 24);
a += b; d ^= a; d = ROR2(d, 16);
c += d; b ^= c; b = ROR2(b, 63);
}
__device__ __forceinline__ void round_lyra(uint2 s[4])
{
Gfunc(s[0], s[1], s[2], s[3]);
WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3, 4);
Gfunc(s[0], s[1], s[2], s[3]);
WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1, 4);
}
static __device__ __forceinline__
void round_lyra(uint2x4* s)
{
Gfunc(s[0].x, s[1].x, s[2].x, s[3].x);
Gfunc(s[0].y, s[1].y, s[2].y, s[3].y);
Gfunc(s[0].z, s[1].z, s[2].z, s[3].z);
Gfunc(s[0].w, s[1].w, s[2].w, s[3].w);
Gfunc(s[0].x, s[1].y, s[2].z, s[3].w);
Gfunc(s[0].y, s[1].z, s[2].w, s[3].x);
Gfunc(s[0].z, s[1].w, s[2].x, s[3].y);
Gfunc(s[0].w, s[1].x, s[2].y, s[3].z);
}
static __device__ __forceinline__
void reduceDuplexV5(uint2 state[4], const uint32_t thread, const uint32_t threads)
{
uint2 state1[3], state2[3];
const uint32_t ps0 = (memshift * Ncol * 0 * threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps1 = (memshift * Ncol * 1 * threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps2 = (memshift * Ncol * 2 * threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps3 = (memshift * Ncol * 3 * threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps4 = (memshift * Ncol * 4 * threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps5 = (memshift * Ncol * 5 * threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps6 = (memshift * Ncol * 6 * threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps7 = (memshift * Ncol * 7 * threads + thread)*blockDim.x + threadIdx.x;
for (int i = 0; i < 8; i++)
{
const uint32_t s0 = memshift * Ncol * 0 + (Ncol - 1 - i) * memshift;
#pragma unroll
for (int j = 0; j < 3; j++)
ST4S(s0 + j, state[j]);
round_lyra(state);
}
for (int i = 0; i < 8; i++)
{
const uint32_t s0 = memshift * Ncol * 0 + i * memshift;
const uint32_t s1 = ps1 + (7 - i)*memshift* threads*blockDim.x;
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = LD4S(s0 + j);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j];
round_lyra(state);
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s1 + j*threads*blockDim.x) = state1[j] ^ state[j];
}
// 1, 0, 2
for (int i = 0; i < 8; i++)
{
const uint32_t s0 = memshift * Ncol * 0 + i * memshift;
const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x;
const uint32_t s2 = ps2 + (7 - i)*memshift* threads*blockDim.x;
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = *(DMatrix + s1 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state2[j] = LD4S(s0 + j);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra(state);
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s2 + j*threads*blockDim.x) = state1[j] ^ state[j];
// fetch data from the previous thread (while sending data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
}
else
{
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
#pragma unroll
for (int j = 0; j < 3; j++)
ST4S(s0 + j, state2[j]);
}
// 2, 1, 3
for (int i = 0; i < 8; i++)
{
const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x;
const uint32_t s2 = ps2 + i * memshift* threads*blockDim.x;
const uint32_t s3 = ps3 + (7 - i)*memshift* threads*blockDim.x;
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = *(DMatrix + s2 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state2[j] = *(DMatrix + s1 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra(state);
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s3 + j*threads*blockDim.x) = state1[j] ^ state[j];
// fetch data from the previous thread (while sending data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
}
else
{
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s1 + j*threads*blockDim.x) = state2[j];
}
// 3, 0, 4
for (int i = 0; i < 8; i++)
{
const uint32_t ls0 = memshift * Ncol * 0 + i * memshift;
const uint32_t s0 = ps0 + i * memshift* threads*blockDim.x;
const uint32_t s3 = ps3 + i * memshift* threads*blockDim.x;
const uint32_t s4 = ps4 + (7 - i)*memshift* threads*blockDim.x;
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = *(DMatrix + s3 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state2[j] = LD4S(ls0 + j);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra(state);
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s4 + j*threads*blockDim.x) = state1[j] ^ state[j];
// fetch data from the previous thread (while sending data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
}
else
{
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s0 + j*threads*blockDim.x) = state2[j];
}
// 4, 3, 5
for (int i = 0; i < 8; i++)
{
const uint32_t s3 = ps3 + i * memshift* threads*blockDim.x;
const uint32_t s4 = ps4 + i * memshift* threads*blockDim.x;
const uint32_t s5 = ps5 + (7 - i)*memshift* threads*blockDim.x;
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = *(DMatrix + s4 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state2[j] = *(DMatrix + s3 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra(state);
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s5 + j*threads*blockDim.x) = state1[j] ^ state[j];
// fetch data from the previous thread (while sending data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
}
else
{
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s3 + j*threads*blockDim.x) = state2[j];
}
// 5, 2, 6
for (int i = 0; i < 8; i++)
{
const uint32_t s2 = ps2 + i * memshift* threads*blockDim.x;
const uint32_t s5 = ps5 + i * memshift* threads*blockDim.x;
const uint32_t s6 = ps6 + (7 - i)*memshift* threads*blockDim.x;
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = *(DMatrix + s5 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state2[j] = *(DMatrix + s2 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra(state);
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s6 + j*threads*blockDim.x) = state1[j] ^ state[j];
// fetch data from the previous thread (while sending data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
}
else
{
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s2 + j*threads*blockDim.x) = state2[j];
}
// 6, 1, 7
for (int i = 0; i < 8; i++)
{
const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x;
const uint32_t s6 = ps6 + i * memshift* threads*blockDim.x;
const uint32_t s7 = ps7 + (7 - i)*memshift* threads*blockDim.x;
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = *(DMatrix + s6 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state2[j] = *(DMatrix + s1 + j*threads*blockDim.x);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra(state);
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s7 + j*threads*blockDim.x) = state1[j] ^ state[j];
// fetch data from the previous thread (while sending data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
}
else
{
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
#pragma unroll
for (int j = 0; j < 3; j++)
*(DMatrix + s1 + j*threads*blockDim.x) = state2[j];
}
}
static __device__ __forceinline__
void reduceDuplexRowV50(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads)
{
const uint32_t ps1 = (memshift * Ncol * rowIn*threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps2 = (memshift * Ncol * rowInOut *threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps3 = (memshift * Ncol * rowOut*threads + thread)*blockDim.x + threadIdx.x;
#pragma unroll 1
for (int i = 0; i < 8; i++)
{
uint2 state1[3], state2[3];
const uint32_t s1 = ps1 + i*memshift*threads *blockDim.x;
const uint32_t s2 = ps2 + i*memshift*threads *blockDim.x;
const uint32_t s3 = ps3 + i*memshift*threads *blockDim.x;
#pragma unroll
for (int j = 0; j < 3; j++) {
state1[j] = *(DMatrix + s1 + j*threads*blockDim.x);
state2[j] = *(DMatrix + s2 + j*threads*blockDim.x);
}
#pragma unroll
for (int j = 0; j < 3; j++) {
state1[j] += state2[j];
state[j] ^= state1[j];
}
round_lyra(state);
// fetch data from the previous thread (while sending data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
}
else
{
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
#pragma unroll
for (int j = 0; j < 3; j++)
{
*(DMatrix + s2 + j*threads*blockDim.x) = state2[j];
*(DMatrix + s3 + j*threads*blockDim.x) ^= state[j];
}
}
}
static __device__ __forceinline__
void reduceDuplexRowV50_8(const int rowInOut, uint2 state[4], const uint32_t thread, const uint32_t threads)
{
const uint32_t ps1 = (memshift * Ncol * 2*threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps2 = (memshift * Ncol * rowInOut *threads + thread)*blockDim.x + threadIdx.x;
const uint32_t ps3 = (memshift * Ncol * 5*threads + thread)*blockDim.x + threadIdx.x;
uint2 state1[3], last[3];
#pragma unroll
for (int j = 0; j < 3; j++) {
state1[j] = *(DMatrix + ps1 + j*threads*blockDim.x);
last[j] = *(DMatrix + ps2 + j*threads*blockDim.x);
}
#pragma unroll
for (int j = 0; j < 3; j++) {
state1[j] += last[j];
state[j] ^= state1[j];
}
round_lyra(state);
// fetch data from the previous thread (while sending data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
last[0] ^= Data2;
last[1] ^= Data0;
last[2] ^= Data1;
}
else
{
last[0] ^= Data0;
last[1] ^= Data1;
last[2] ^= Data2;
}
if (rowInOut == 5)
{
#pragma unroll
for (int j = 0; j < 3; j++)
last[j] ^= state[j];
}
for (int i = 1; i < 8; i++)
{
const uint32_t s1 = ps1 + i*memshift*threads *blockDim.x;
const uint32_t s2 = ps2 + i*memshift*threads *blockDim.x;
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= *(DMatrix + s1 + j*threads*blockDim.x) + *(DMatrix + s2 + j*threads*blockDim.x);
round_lyra(state);
}
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= last[j];
}
__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
const uint2x4 blake2b_IV[2] = {
{ { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } },
{ { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } }
};
if (thread < threads)
{
uint2x4 state[4];
((uint2*)state)[0] = __ldg(&g_hash[thread]);
((uint2*)state)[1] = __ldg(&g_hash[thread + threads]);
((uint2*)state)[2] = __ldg(&g_hash[thread + threads * 2]);
((uint2*)state)[3] = __ldg(&g_hash[thread + threads * 3]);
state[1] = state[0];
state[2] = blake2b_IV[0];
state[3] = blake2b_IV[1];
for (int i = 0; i < 24; i++)
round_lyra(state); //because 12 is not enough
((uint2x4*)DMatrix)[0 * threads + thread] = state[0];
((uint2x4*)DMatrix)[1 * threads + thread] = state[1];
((uint2x4*)DMatrix)[2 * threads + thread] = state[2];
((uint2x4*)DMatrix)[3 * threads + thread] = state[3];
}
}
__global__ __launch_bounds__(TPB50, 1)
void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
{
const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y);
if (thread < threads)
{
uint2 state[4];
state[0] = __ldg(&DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x]);
state[1] = __ldg(&DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x]);
state[2] = __ldg(&DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x]);
state[3] = __ldg(&DMatrix[(3 * threads + thread)*blockDim.x + threadIdx.x]);
reduceDuplexV5(state, thread, threads);
uint32_t rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowV50(7, rowa, 0, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowV50(0, rowa, 3, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowV50(3, rowa, 6, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowV50(6, rowa, 1, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowV50(1, rowa, 4, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowV50(4, rowa, 7, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowV50(7, rowa, 2, state, thread, threads);
rowa = WarpShuffle(state[0].x, 0, 4) & 7;
reduceDuplexRowV50_8(rowa, state, thread, threads);
DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x] = state[0];
DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x] = state[1];
DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x] = state[2];
DMatrix[(3 * threads + thread)*blockDim.x + threadIdx.x] = state[3];
}
}
__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint2x4 state[4];
state[0] = __ldg4(&((uint2x4*)DMatrix)[0 * threads + thread]);
state[1] = __ldg4(&((uint2x4*)DMatrix)[1 * threads + thread]);
state[2] = __ldg4(&((uint2x4*)DMatrix)[2 * threads + thread]);
state[3] = __ldg4(&((uint2x4*)DMatrix)[3 * threads + thread]);
for (int i = 0; i < 12; i++)
round_lyra(state);
g_hash[thread] = ((uint2*)state)[0];
g_hash[thread + threads] = ((uint2*)state)[1];
g_hash[thread + threads * 2] = ((uint2*)state)[2];
g_hash[thread + threads * 3] = ((uint2*)state)[3];
}
}
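/* Hedged note: the SM5 path mirrors the three-kernel split used above: _1 absorbs the
   32-byte input and seeds DMatrix, _2 performs the whole row-mixing phase with one
   sponge spread over 4 lanes, and _3 squeezes the final 32 bytes back to g_hash, so
   each kernel can use its own block shape and shared-memory budget. */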
#else
/* if __CUDA_ARCH__ != 500 .. host */
__global__ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
#endif

656
lyra2/cuda_lyra2v2.cu

@@ -2,35 +2,152 @@
#include <stdint.h>
#include <memory.h>
#define TPB52 32
#define TPB50 32
#define TPB30 32
#define TPB20 32
#ifdef __INTELLISENSE__
/* just for vstudio code colors */
#define __CUDA_ARCH__ 200
#endif
#if __CUDA_ARCH__ >= 500
#include "cuda_lyra2_vectors.h"
#ifdef __INTELLISENSE__
/* just for vstudio code colors */
#if __CUDA_ARCH__ >= 300
__device__ uint32_t __shfl(uint32_t a, uint32_t b, uint32_t c);
#endif
#endif
#define Nrow 4
#define Ncol 4
#define memshift 3
__device__ uint2x4 *DState;
__device__ __forceinline__ uint2 LD4S(const int index)
{
extern __shared__ uint2 shared_mem[];
return shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x];
}
__device__ __forceinline__ void ST4S(const int index, const uint2 data)
{
extern __shared__ uint2 shared_mem[];
shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data;
}
__device__ __forceinline__
void Gfunc_v5(uint2 &a, uint2 &b, uint2 &c, uint2 &d)
{
a += b; d = eorswap32(a, d);
c += d; b ^= c; b = ROR24(b);
a += b; d ^= a; d = ROR16(d);
c += d; b ^= c; b = ROR2(b, 63);
}
#if __CUDA_ARCH__ >= 300
__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
{
return __shfl(a, b, c);
}
__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
{
return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c));
}
__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c)
{
a1 = WarpShuffle(a1, b1, c);
a2 = WarpShuffle(a2, b2, c);
a3 = WarpShuffle(a3, b3, c);
}
#else
__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
{
extern __shared__ uint2 shared_mem[];
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
uint32_t *_ptr = (uint32_t*)shared_mem;
__threadfence_block();
uint32_t buf = _ptr[thread];
_ptr[thread] = a;
__threadfence_block();
uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))];
__threadfence_block();
_ptr[thread] = buf;
__threadfence_block();
return result;
}
__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
{
extern __shared__ uint2 shared_mem[];
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
__threadfence_block();
uint2 buf = shared_mem[thread];
shared_mem[thread] = a;
__threadfence_block();
uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))];
__threadfence_block();
shared_mem[thread] = buf;
__threadfence_block();
return result;
}
__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c)
{
extern __shared__ uint2 shared_mem[];
const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
__threadfence_block();
uint2 buf = shared_mem[thread];
shared_mem[thread] = a1;
__threadfence_block();
a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))];
__threadfence_block();
shared_mem[thread] = a2;
__threadfence_block();
a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))];
__threadfence_block();
shared_mem[thread] = a3;
__threadfence_block();
a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))];
__threadfence_block();
shared_mem[thread] = buf;
__threadfence_block();
}
#endif
__device__ __forceinline__ void round_lyra_v35(uint2 s[4])
{
Gfunc_v5(s[0], s[1], s[2], s[3]);
WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3, 4);
Gfunc_v5(s[0], s[1], s[2], s[3]);
WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1, 4);
}
__device__ __forceinline__
void round_lyra_v5(uint2x4* s)
{
@@ -45,145 +162,142 @@ void round_lyra_v5(uint2x4* s)
Gfunc_v5(s[0].w, s[1].x, s[2].y, s[3].z);
}
__device__ __forceinline__ void reduceDuplexRowSetupV2(uint2 state[4])
{
int i, j;
uint2 state1[Ncol][3], state0[Ncol][3], state2[3];
#if __CUDA_ARCH__ > 500
#pragma unroll
#endif
for (int i = 0; i < Ncol; i++)
{
#pragma unroll
for (j = 0; j < 3; j++)
state0[Ncol - i - 1][j] = state[j];
round_lyra_v35(state);
}
//#pragma unroll 4
for (i = 0; i < Ncol; i++)
{
#pragma unroll
for (j = 0; j < 3; j++)
state[j] ^= state0[i][j];
round_lyra_v35(state);
#pragma unroll
for (j = 0; j < 3; j++)
state1[Ncol - i - 1][j] = state0[i][j];
#pragma unroll
for (j = 0; j < 3; j++)
state1[Ncol - i - 1][j] ^= state[j];
}
for (i = 0; i < Ncol; i++)
{
const uint32_t s0 = memshift * Ncol * 0 + i * memshift;
const uint32_t s2 = memshift * Ncol * 2 + memshift * (Ncol - 1) - i*memshift;
#pragma unroll
for (j = 0; j < 3; j++)
state[j] ^= state1[i][j] + state0[i][j];
round_lyra_v35(state);
#pragma unroll
for (j = 0; j < 3; j++)
state2[j] = state1[i][j];
#pragma unroll
for (j = 0; j < 3; j++)
state2[j] ^= state[j];
#pragma unroll
for (j = 0; j < 3; j++)
ST4S(s2 + j, state2[j]);
// fetch data from the previous thread (while sending data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state0[i][0] ^= Data2;
state0[i][1] ^= Data0;
state0[i][2] ^= Data1;
}
else
{
state0[i][0] ^= Data0;
state0[i][1] ^= Data1;
state0[i][2] ^= Data2;
}
#pragma unroll
for (j = 0; j < 3; j++)
ST4S(s0 + j, state0[i][j]);
#pragma unroll
for (j = 0; j < 3; j++)
state0[i][j] = state2[j];
}
for (i = 0; i < Ncol; i++)
{
const uint32_t s1 = memshift * Ncol * 1 + i*memshift;
const uint32_t s3 = memshift * Ncol * 3 + memshift * (Ncol - 1) - i*memshift;
#pragma unroll
for (j = 0; j < 3; j++)
state[j] ^= state1[i][j] + state0[Ncol - i - 1][j];
round_lyra_v35(state);
#pragma unroll
for (j = 0; j < 3; j++)
state0[Ncol - i - 1][j] ^= state[j];
#pragma unroll
for (j = 0; j < 3; j++)
ST4S(s3 + j, state0[Ncol - i - 1][j]);
// fetch data from the previous thread (while sending data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state1[i][0] ^= Data2;
state1[i][1] ^= Data0;
state1[i][2] ^= Data1;
}
else
{
state1[i][0] ^= Data0;
state1[i][1] ^= Data1;
state1[i][2] ^= Data2;
}
#pragma unroll
for (j = 0; j < 3; j++)
ST4S(s1 + j, state1[i][j]);
}
}
__device__ void reduceDuplexRowtV2(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4])
{
uint2 state1[3], state2[3];
const uint32_t ps1 = memshift * Ncol * rowIn;
const uint32_t ps2 = memshift * Ncol * rowInOut;
const uint32_t ps3 = memshift * Ncol * rowOut;
for (int i = 0; i < Ncol; i++)
{
@@ -191,190 +305,268 @@ void reduceDuplexRowtV2(const int rowIn, const int rowInOut, const int rowOut, u
const uint32_t s1 = ps1 + i*memshift;
const uint32_t s2 = ps2 + i*memshift;
const uint32_t s3 = ps3 + i*memshift;
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = LD4S(s1 + j);
#pragma unroll
for (int j = 0; j < 3; j++)
state2[j] = LD4S(s2 + j);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra_v35(state);
// fetch data from the previous thread (while sending data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
}
else
{
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
#pragma unroll
for (int j = 0; j < 3; j++)
ST4S(s2 + j, state2[j]);
#pragma unroll
for (int j = 0; j < 3; j++)
ST4S(s3 + j, LD4S(s3 + j) ^ state[j]);
}
}
__device__ void reduceDuplexRowtV2_4(const int rowInOut, uint2 state[4])
{
const int rowIn = 2;
const int rowOut = 3;
int i, j;
uint2 state2[3], state1[3], last[3];
const uint32_t ps1 = memshift * Ncol * rowIn;
const uint32_t ps2 = memshift * Ncol * rowInOut;
const uint32_t ps3 = memshift * Ncol * rowOut;
#pragma unroll
for (int j = 0; j < 3; j++)
last[j] = LD4S(ps2 + j);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= LD4S(ps1 + j) + last[j];
round_lyra_v35(state);
// fetch data from the previous thread (while sending data to the next thread)
uint2 Data0 = state[0];
uint2 Data1 = state[1];
uint2 Data2 = state[2];
WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
if (threadIdx.x == 0)
{
last[0] ^= Data2;
last[1] ^= Data0;
last[2] ^= Data1;
}
else
{
last[0] ^= Data0;
last[1] ^= Data1;
last[2] ^= Data2;
}
if (rowInOut == rowOut)
{
#pragma unroll
for (j = 0; j < 3; j++)
last[j] ^= state[j];
}
for (i = 1; i < Ncol; i++)
{
const uint32_t s1 = ps1 + i*memshift;
const uint32_t s2 = ps2 + i*memshift;
#pragma unroll
for (j = 0; j < 3; j++)
state[j] ^= LD4S(s1 + j) + LD4S(s2 + j);
round_lyra_v35(state);
}
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= last[j];
}
__constant__ uint28 blake2b_IV[2] = {
0xf3bcc908lu, 0x6a09e667lu,
0x84caa73blu, 0xbb67ae85lu,
0xfe94f82blu, 0x3c6ef372lu,
0x5f1d36f1lu, 0xa54ff53alu,
0xade682d1lu, 0x510e527flu,
0x2b3e6c1flu, 0x9b05688clu,
0xfb41bd6blu, 0x1f83d9ablu,
0x137e2179lu, 0x5be0cd19lu
};
__constant__ uint28 Mask[2] = {
0x00000020lu, 0x00000000lu,
0x00000020lu, 0x00000000lu,
0x00000020lu, 0x00000000lu,
0x00000001lu, 0x00000000lu,
0x00000004lu, 0x00000000lu,
0x00000004lu, 0x00000000lu,
0x00000080lu, 0x00000000lu,
0x00000000lu, 0x01000000lu
};
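/* Hedged note: Mask is the Lyra2 parameter block absorbed after the password. The
   three 0x20 words are the 32-byte output/password/salt lengths, 0x01 and the two
   0x04 words are timeCost, nRows and nCols for lyra2v2, and 0x80 / 0x01000000 carry
   the sponge padding bits -- the same constants the previous revision XORed into the
   state word by word. */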
__global__ __launch_bounds__(64, 1)
void lyra2v2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *outputHash)
{
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
uint28 state[4];
if (thread < threads)
{
state[0].x = state[1].x = __ldg(&outputHash[thread + threads * 0]);
state[0].y = state[1].y = __ldg(&outputHash[thread + threads * 1]);
state[0].z = state[1].z = __ldg(&outputHash[thread + threads * 2]);
state[0].w = state[1].w = __ldg(&outputHash[thread + threads * 3]);
state[2] = blake2b_IV[0];
state[3] = blake2b_IV[1];
for (int i = 0; i<12; i++)
round_lyra_v5(state);
state[0] ^= Mask[0];
state[1] ^= Mask[1];
for (int i = 0; i<12; i++)
round_lyra_v5(state);
DState[blockDim.x * gridDim.x * 0 + blockDim.x * blockIdx.x + threadIdx.x] = state[0];
DState[blockDim.x * gridDim.x * 1 + blockDim.x * blockIdx.x + threadIdx.x] = state[1];
DState[blockDim.x * gridDim.x * 2 + blockDim.x * blockIdx.x + threadIdx.x] = state[2];
DState[blockDim.x * gridDim.x * 3 + blockDim.x * blockIdx.x + threadIdx.x] = state[3];
} //thread
}
#if __CUDA_ARCH__ < 300
__global__ __launch_bounds__(TPB20, 1)
#elif __CUDA_ARCH__ < 500
__global__ __launch_bounds__(TPB30, 1)
#elif __CUDA_ARCH__ == 500
__global__ __launch_bounds__(TPB50, 1)
#else
__global__ __launch_bounds__(TPB52, 1)
#endif
void lyra2v2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *outputHash)
{
const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y;
if (thread < threads)
{
uint2 state[4];
state[0] = ((uint2*)DState)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x];
state[1] = ((uint2*)DState)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x];
state[2] = ((uint2*)DState)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x];
state[3] = ((uint2*)DState)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x];
reduceDuplexRowSetupV2(state);
uint32_t rowa;
int prev = 3;
for (int i = 0; i < 3; i++)
{
rowa = WarpShuffle(state[0].x, 0, 4) & 3;
reduceDuplexRowtV2(prev, rowa, i, state);
prev = i;
}
rowa = WarpShuffle(state[0].x, 0, 4) & 3;
reduceDuplexRowtV2_4(rowa, state);
((uint2*)DState)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[0];
((uint2*)DState)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[1];
((uint2*)DState)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[2];
((uint2*)DState)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[3];
} //thread
}
__global__ __launch_bounds__(64, 1)
void lyra2v2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *outputHash)
{
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
uint28 state[4];
if (thread < threads)
{
state[0] = __ldg4(&DState[blockDim.x * gridDim.x * 0 + blockDim.x * blockIdx.x + threadIdx.x]);
state[1] = __ldg4(&DState[blockDim.x * gridDim.x * 1 + blockDim.x * blockIdx.x + threadIdx.x]);
state[2] = __ldg4(&DState[blockDim.x * gridDim.x * 2 + blockDim.x * blockIdx.x + threadIdx.x]);
state[3] = __ldg4(&DState[blockDim.x * gridDim.x * 3 + blockDim.x * blockIdx.x + threadIdx.x]);
for (int i = 0; i < 12; i++)
round_lyra_v5(state);
outputHash[thread + threads * 0] = state[0].x;
outputHash[thread + threads * 1] = state[0].y;
outputHash[thread + threads * 2] = state[0].z;
outputHash[thread + threads * 3] = state[0].w;
} //thread
}
#else
#include "cuda_helper.h"
#if __CUDA_ARCH__ < 200
__device__ void* DState;
#endif
__global__ void lyra2v2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *outputHash) {}
__global__ void lyra2v2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *outputHash) {}
__global__ void lyra2v2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) {}
#endif
__host__
void lyra2v2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix)
{
cuda_get_arch(thr_id);
int dev_id = device_map[thr_id % MAX_GPUS];
// just assign the device pointer allocated in main loop
cudaMemcpyToSymbol(DState, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice);
}
__host__
void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, int order)
{
int dev_id = device_map[thr_id % MAX_GPUS];
uint32_t tpb = TPB52;
if (cuda_arch[dev_id] > 500) tpb = TPB52;
else if (cuda_arch[dev_id] == 500) tpb = TPB50;
else if (cuda_arch[dev_id] >= 300) tpb = TPB30;
else if (cuda_arch[dev_id] >= 200) tpb = TPB20;
dim3 grid1((threads * 4 + tpb - 1) / tpb);
dim3 block1(4, tpb >> 2);
if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500)
lyra2v2_gpu_hash_32 <<<grid, block>>> (threads, startNounce, (uint2*)g_hash);
else
lyra2v2_gpu_hash_32_v3 <<<grid, block>>> (threads, startNounce, (uint2*)g_hash);
dim3 grid2((threads + 64 - 1) / 64);
dim3 block2(64);
if (cuda_arch[dev_id] < 500)
cudaFuncSetCacheConfig(lyra2v2_gpu_hash_32_2, cudaFuncCachePreferShared);
lyra2v2_gpu_hash_32_1 <<<grid2, block2>>> (threads, startNounce, (uint2*)g_hash);
lyra2v2_gpu_hash_32_2 <<<grid1, block1, 48 * sizeof(uint2) * tpb>>> (threads, startNounce, g_hash);
lyra2v2_gpu_hash_32_3 <<<grid2, block2>>> (threads, startNounce, (uint2*)g_hash);
//MyStreamSynchronize(NULL, order, thr_id);
}
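For reference, the launch geometry above in plain terms: grid/block give one thread per nonce, while grid1/block1 reshape the middle kernel into 4 x (tpb/4) blocks so the four x-lanes that cooperate on one Lyra2 state can exchange words via WarpShuffle(..., 4). A sketch with concrete, illustrative numbers:

// tpb = 32, threads = 1000 (illustrative values)
dim3 block(32);                        // one thread per nonce
dim3 grid((1000 + 32 - 1) / 32);       // ceil(1000/32) = 32 blocks
dim3 block1(4, 32 >> 2);               // 4 x-lanes * 8 y-rows = 32 threads
dim3 grid1((1000 * 4 + 32 - 1) / 32);  // 125 blocks, 4 lanes per nonce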

338
lyra2/cuda_lyra2v2_sm3.cuh

@@ -1,338 +0,0 @@
/* SM 2/3/3.5 Variant for lyra2REv2 */
#ifdef __INTELLISENSE__
/* just for vstudio code colors */
#undef __CUDA_ARCH__
#define __CUDA_ARCH__ 350
#endif
#define TPB20 64
#define TPB30 64
#define TPB35 64
#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ < 500
#include "cuda_lyra2_vectors.h"
#define Nrow 4
#define Ncol 4
#define vectype ulonglong4
#define memshift 4
__device__ vectype *DMatrix;
static __device__ __forceinline__
void Gfunc_v35(unsigned long long &a, unsigned long long &b, unsigned long long &c, unsigned long long &d)
{
a += b; d ^= a; d = ROTR64(d, 32);
c += d; b ^= c; b = ROTR64(b, 24);
a += b; d ^= a; d = ROTR64(d, 16);
c += d; b ^= c; b = ROTR64(b, 63);
}
static __device__ __forceinline__
void round_lyra_v35(vectype* s)
{
Gfunc_v35(s[0].x, s[1].x, s[2].x, s[3].x);
Gfunc_v35(s[0].y, s[1].y, s[2].y, s[3].y);
Gfunc_v35(s[0].z, s[1].z, s[2].z, s[3].z);
Gfunc_v35(s[0].w, s[1].w, s[2].w, s[3].w);
Gfunc_v35(s[0].x, s[1].y, s[2].z, s[3].w);
Gfunc_v35(s[0].y, s[1].z, s[2].w, s[3].x);
Gfunc_v35(s[0].z, s[1].w, s[2].x, s[3].y);
Gfunc_v35(s[0].w, s[1].x, s[2].y, s[3].z);
}
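Gfunc_v35 is the Blake2b G quarter-round (rotation counts 32, 24, 16, 63), and round_lyra_v35 applies it to the columns and then the diagonals of the 4x4 state, i.e. one full Blake2b-style round. A host-side reference (a sketch, not part of the patch) that is handy for cross-checking GPU state on the CPU:

#include <cstdint>

static inline uint64_t rotr64(uint64_t x, int n) { return (x >> n) | (x << (64 - n)); }

static void G(uint64_t &a, uint64_t &b, uint64_t &c, uint64_t &d)
{
	a += b; d ^= a; d = rotr64(d, 32);
	c += d; b ^= c; b = rotr64(b, 24);
	a += b; d ^= a; d = rotr64(d, 16);
	c += d; b ^= c; b = rotr64(b, 63);
}

static void round_lyra_host(uint64_t v[16])   // v[row*4 + col]
{
	G(v[0], v[4], v[ 8], v[12]); G(v[1], v[5], v[ 9], v[13]);  // columns
	G(v[2], v[6], v[10], v[14]); G(v[3], v[7], v[11], v[15]);
	G(v[0], v[5], v[10], v[15]); G(v[1], v[6], v[11], v[12]);  // diagonals
	G(v[2], v[7], v[ 8], v[13]); G(v[3], v[4], v[ 9], v[14]);
}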
static __device__ __forceinline__
void reduceDuplexV3(vectype state[4], uint32_t thread)
{
vectype state1[3];
uint32_t ps1 = (Nrow * Ncol * memshift * thread);
uint32_t ps2 = (memshift * (Ncol - 1) * Nrow + memshift * 1 + Nrow * Ncol * memshift * thread);
#pragma unroll 4
for (int i = 0; i < Ncol; i++)
{
uint32_t s1 = ps1 + Nrow * i *memshift;
uint32_t s2 = ps2 - Nrow * i *memshift;
for (int j = 0; j < 3; j++)
state1[j] = __ldg4(&(DMatrix + s1)[j]);
for (int j = 0; j < 3; j++)
state[j] ^= state1[j];
round_lyra_v35(state);
for (int j = 0; j < 3; j++)
state1[j] ^= state[j];
for (int j = 0; j < 3; j++)
(DMatrix + s2)[j] = state1[j];
}
}
static __device__ __forceinline__
void reduceDuplexRowSetupV3(const int rowIn, const int rowInOut, const int rowOut, vectype state[4], uint32_t thread)
{
vectype state2[3], state1[3];
uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread);
uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread);
uint32_t ps3 = (Nrow * memshift * (Ncol - 1) + memshift * rowOut + Nrow * Ncol * memshift * thread);
for (int i = 0; i < Ncol; i++)
{
uint32_t s1 = ps1 + Nrow*i*memshift;
uint32_t s2 = ps2 + Nrow*i*memshift;
uint32_t s3 = ps3 - Nrow*i*memshift;
for (int j = 0; j < 3; j++)
state1[j] = __ldg4(&(DMatrix + s1 )[j]);
for (int j = 0; j < 3; j++)
state2[j] = __ldg4(&(DMatrix + s2 )[j]);
for (int j = 0; j < 3; j++) {
vectype tmp = state1[j] + state2[j];
state[j] ^= tmp;
}
round_lyra_v35(state);
for (int j = 0; j < 3; j++) {
state1[j] ^= state[j];
(DMatrix + s3)[j] = state1[j];
}
((uint2*)state2)[0] ^= ((uint2*)state)[11];
for (int j = 0; j < 11; j++)
((uint2*)state2)[j + 1] ^= ((uint2*)state)[j];
for (int j = 0; j < 3; j++)
(DMatrix + s2)[j] = state2[j];
}
}
static __device__ __forceinline__
void reduceDuplexRowtV3(const int rowIn, const int rowInOut, const int rowOut, vectype* state, uint32_t thread)
{
vectype state1[3], state2[3];
uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread);
uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread);
uint32_t ps3 = (memshift * rowOut + Nrow * Ncol * memshift * thread);
#pragma nounroll
for (int i = 0; i < Ncol; i++)
{
uint32_t s1 = ps1 + Nrow * i*memshift;
uint32_t s2 = ps2 + Nrow * i*memshift;
uint32_t s3 = ps3 + Nrow * i*memshift;
for (int j = 0; j < 3; j++)
state1[j] = __ldg4(&(DMatrix + s1)[j]);
for (int j = 0; j < 3; j++)
state2[j] = __ldg4(&(DMatrix + s2)[j]);
for (int j = 0; j < 3; j++)
state1[j] += state2[j];
for (int j = 0; j < 3; j++)
state[j] ^= state1[j];
round_lyra_v35(state);
((uint2*)state2)[0] ^= ((uint2*)state)[11];
for (int j = 0; j < 11; j++)
((uint2*)state2)[j + 1] ^= ((uint2*)state)[j];
if (rowInOut != rowOut) {
for (int j = 0; j < 3; j++)
(DMatrix + s2)[j] = state2[j];
for (int j = 0; j < 3; j++)
(DMatrix + s3)[j] ^= state[j];
} else {
for (int j = 0; j < 3; j++)
state2[j] ^= state[j];
for (int j = 0; j < 3; j++)
(DMatrix + s2)[j] = state2[j];
}
}
}
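The uint2 casts at the end of the two functions above implement Lyra2's rotated feedback: the 12-word rowInOut block absorbs the sponge state shifted right by one 64-bit word, with wrap-around. The same operation on plain words (a sketch):

static void xor_rotated(uint64_t inout[12], const uint64_t st[12])
{
	inout[0] ^= st[11];          // the wrapped-around word
	for (int j = 0; j < 11; j++)
		inout[j + 1] ^= st[j];   // remaining words, shifted one slot
}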
#if __CUDA_ARCH__ >= 300
__global__ __launch_bounds__(TPB35, 1)
void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
vectype state[4];
vectype blake2b_IV[2];
vectype padding[2];
if (threadIdx.x == 0) {
((uint16*)blake2b_IV)[0] = make_uint16(
0xf3bcc908, 0x6a09e667, 0x84caa73b, 0xbb67ae85,
0xfe94f82b, 0x3c6ef372, 0x5f1d36f1, 0xa54ff53a,
0xade682d1, 0x510e527f, 0x2b3e6c1f, 0x9b05688c,
0xfb41bd6b, 0x1f83d9ab, 0x137e2179, 0x5be0cd19
);
((uint16*)padding)[0] = make_uint16(
0x20, 0x0, 0x20, 0x0, 0x20, 0x0, 0x01, 0x0,
0x04, 0x0, 0x04, 0x0, 0x80, 0x0, 0x0, 0x01000000
);
}
if (thread < threads)
{
((uint2*)state)[0] = __ldg(&outputHash[thread]);
((uint2*)state)[1] = __ldg(&outputHash[thread + threads]);
((uint2*)state)[2] = __ldg(&outputHash[thread + 2 * threads]);
((uint2*)state)[3] = __ldg(&outputHash[thread + 3 * threads]);
state[1] = state[0];
state[2] = shuffle4(((vectype*)blake2b_IV)[0], 0);
state[3] = shuffle4(((vectype*)blake2b_IV)[1], 0);
for (int i = 0; i<12; i++)
round_lyra_v35(state);
state[0] ^= shuffle4(((vectype*)padding)[0], 0);
state[1] ^= shuffle4(((vectype*)padding)[1], 0);
for (int i = 0; i<12; i++)
round_lyra_v35(state);
uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread);
//#pragma unroll 4
for (int i = 0; i < 4; i++)
{
uint32_t s1 = ps1 - 4 * memshift * i;
for (int j = 0; j < 3; j++)
(DMatrix + s1)[j] = (state)[j];
round_lyra_v35(state);
}
reduceDuplexV3(state, thread);
reduceDuplexRowSetupV3(1, 0, 2, state, thread);
reduceDuplexRowSetupV3(2, 1, 3, state, thread);
uint32_t rowa;
int prev = 3;
for (int i = 0; i < 4; i++)
{
rowa = ((uint2*)state)[0].x & 3; reduceDuplexRowtV3(prev, rowa, i, state, thread);
prev = i;
}
uint32_t shift = (memshift * rowa + 16 * memshift * thread);
for (int j = 0; j < 3; j++)
state[j] ^= __ldg4(&(DMatrix + shift)[j]);
for (int i = 0; i < 12; i++)
round_lyra_v35(state);
outputHash[thread] = ((uint2*)state)[0];
outputHash[thread + threads] = ((uint2*)state)[1];
outputHash[thread + 2 * threads] = ((uint2*)state)[2];
outputHash[thread + 3 * threads] = ((uint2*)state)[3];
} //thread
}
#elif __CUDA_ARCH__ >= 200
__global__ __launch_bounds__(TPB20, 1)
void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
vectype state[4];
vectype blake2b_IV[2];
vectype padding[2];
((uint16*)blake2b_IV)[0] = make_uint16(
0xf3bcc908, 0x6a09e667, 0x84caa73b, 0xbb67ae85,
0xfe94f82b, 0x3c6ef372, 0x5f1d36f1, 0xa54ff53a,
0xade682d1, 0x510e527f, 0x2b3e6c1f, 0x9b05688c,
0xfb41bd6b, 0x1f83d9ab, 0x137e2179, 0x5be0cd19
);
((uint16*)padding)[0] = make_uint16(
0x20, 0x0, 0x20, 0x0, 0x20, 0x0, 0x01, 0x0,
0x04, 0x0, 0x04, 0x0, 0x80, 0x0, 0x0, 0x01000000
);
if (thread < threads)
{
((uint2*)state)[0] = outputHash[thread];
((uint2*)state)[1] = outputHash[thread + threads];
((uint2*)state)[2] = outputHash[thread + 2 * threads];
((uint2*)state)[3] = outputHash[thread + 3 * threads];
state[1] = state[0];
state[2] = ((vectype*)blake2b_IV)[0];
state[3] = ((vectype*)blake2b_IV)[1];
for (int i = 0; i<12; i++)
round_lyra_v35(state);
state[0] ^= ((vectype*)padding)[0];
state[1] ^= ((vectype*)padding)[1];
for (int i = 0; i<12; i++)
round_lyra_v35(state);
uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread);
//#pragma unroll 4
for (int i = 0; i < 4; i++)
{
uint32_t s1 = ps1 - 4 * memshift * i;
for (int j = 0; j < 3; j++)
(DMatrix + s1)[j] = (state)[j];
round_lyra_v35(state);
}
reduceDuplexV3(state, thread);
reduceDuplexRowSetupV3(1, 0, 2, state, thread);
reduceDuplexRowSetupV3(2, 1, 3, state, thread);
uint32_t rowa;
int prev = 3;
for (int i = 0; i < 4; i++)
{
rowa = ((uint2*)state)[0].x & 3; reduceDuplexRowtV3(prev, rowa, i, state, thread);
prev = i;
}
uint32_t shift = (memshift * rowa + 16 * memshift * thread);
for (int j = 0; j < 3; j++)
state[j] ^= __ldg4(&(DMatrix + shift)[j]);
for (int i = 0; i < 12; i++)
round_lyra_v35(state);
outputHash[thread] = ((uint2*)state)[0];
outputHash[thread + threads] = ((uint2*)state)[1];
outputHash[thread + 2 * threads] = ((uint2*)state)[2];
outputHash[thread + 3 * threads] = ((uint2*)state)[3];
} //thread
}
#endif
#else
/* host & sm5+ */
__global__ void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) {}
#endif

63
lyra2/lyra2RE.cu

@@ -23,7 +23,7 @@ extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNon
extern void skein256_cpu_init(int thr_id, uint32_t threads);
extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix);
extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, bool gtx750ti);
extern void groestl256_cpu_init(int thr_id, uint32_t threads);
extern void groestl256_cpu_free(int thr_id);
@@ -85,30 +85,49 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 17 : 16;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4;
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark)
ptarget[7] = 0x000f;
ptarget[7] = 0x00ff;
static bool gtx750ti;
static uint32_t throughput[MAX_GPUS];
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
int dev_id = device_map[thr_id];
cudaSetDevice(dev_id);
CUDA_LOG_ERROR();
blake256_cpu_init(thr_id, throughput);
keccak256_cpu_init(thr_id,throughput);
skein256_cpu_init(thr_id, throughput);
groestl256_cpu_init(thr_id, throughput);
int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 17 : 16;
if (device_sm[device_map[thr_id]] == 500) intensity = 15;
int temp = intensity;
throughput[thr_id] = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4;
if (init[thr_id]) throughput[thr_id] = min(throughput[thr_id], max_nonce - first_nonce);
// DMatrix
cudaMalloc(&d_matrix[thr_id], (size_t)16 * 8 * 8 * sizeof(uint64_t) * throughput);
lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]);
cudaDeviceProp props;
cudaGetDeviceProperties(&props, dev_id);
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));
if (strstr(props.name, "750 Ti")) gtx750ti = true;
else gtx750ti = false;
//blake256_cpu_init(thr_id, throughput);
keccak256_cpu_init(thr_id, throughput[thr_id]);
skein256_cpu_init(thr_id, throughput[thr_id]);
groestl256_cpu_init(thr_id, throughput[thr_id]);
if (device_sm[dev_id] >= 500)
{
size_t matrix_sz = device_sm[dev_id] > 500 ? sizeof(uint64_t) * 4 * 4 : sizeof(uint64_t) * 8 * 8 * 3 * 4;
CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput[thr_id]));
lyra2_cpu_init(thr_id, throughput[thr_id], d_matrix[thr_id]);
}
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput[thr_id]));
init[thr_id] = true;
if (temp != intensity){
gpulog(LOG_INFO, thr_id, "Intensity set to %u, %u cuda threads",
intensity, throughput[thr_id]);
}
}
uint32_t _ALIGN(128) endiandata[20];
@@ -122,15 +141,15 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
int order = 0;
uint32_t foundNonce;
blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
blake256_cpu_hash_80(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
keccak256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
lyra2_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], gtx750ti);
skein256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
TRACE("S")
*hashes_done = pdata[19] - first_nonce + throughput;
*hashes_done = pdata[19] - first_nonce + throughput[thr_id];
foundNonce = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
foundNonce = groestl256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
if (foundNonce != UINT32_MAX)
{
uint32_t _ALIGN(64) vhash64[8];
@@ -162,11 +181,11 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
}
}
if ((uint64_t)throughput + pdata[19] >= max_nonce) {
if ((uint64_t)throughput[thr_id] + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}
pdata[19] += throughput;
pdata[19] += throughput[thr_id];
} while (!work_restart[thr_id].restart);
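The main change in this file is that throughput moves from a local into a per-GPU static array, so each miner thread keeps its own value across scan calls and can clamp it against the remaining nonce space. The bookkeeping in isolation (a sketch, helper name illustrative):

static uint32_t throughput[MAX_GPUS] = { 0 };

static inline uint32_t clamp_throughput(int thr_id, uint32_t first_nonce, uint32_t max_nonce)
{
	// never scan past max_nonce on the final pass
	if (throughput[thr_id] > max_nonce - first_nonce)
		throughput[thr_id] = max_nonce - first_nonce;
	return throughput[thr_id];
}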

180
lyra2/lyra2REv2.cu

@@ -10,6 +10,7 @@ extern "C" {
#include "miner.h"
#include "cuda_helper.h"
#include <math.h>
static uint64_t *d_hash[MAX_GPUS];
static uint64_t* d_matrix[MAX_GPUS];
@@ -20,6 +21,9 @@ extern void blake256_cpu_setBlock_80(uint32_t *pdata);
extern void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void keccak256_cpu_init(int thr_id, uint32_t threads);
extern void keccak256_cpu_free(int thr_id);
extern void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
extern void blakeKeccakcube256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void skein256_cpu_init(int thr_id, uint32_t threads);
extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order);
@@ -27,10 +31,11 @@ extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t start
extern void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void lyra2v2_cpu_init(int thr_id, uint32_t threads, uint64_t* d_matrix);
extern void bmw256_setTarget(const void *ptarget);
//extern void bmw256_setTarget(const void *ptarget);
extern void bmw256_cpu_init(int thr_id, uint32_t threads);
extern void bmw256_cpu_free(int thr_id);
extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces);
extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces, uint32_t Target);
extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces, uint32_t Target, uint32_t **result);
void lyra2v2_hash(void *state, const void *input)
{
@@ -79,7 +84,7 @@ void lyra2v2_hash(void *state, const void *input)
uint32_t* debugbuf = NULL; \
cudaMallocHost(&debugbuf, 32); \
cudaMemcpy(debugbuf, d_hash[thr_id], 32, cudaMemcpyDeviceToHost); \
printf("lyra2 %s %08x %08x %08x %08x...%08x... \n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \
printf("lyra2 %s %08x %08x %08x %08x...%08x... ¥n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \
swab32(debugbuf[2]), swab32(debugbuf[3]), swab32(debugbuf[7])); \
cudaFreeHost(debugbuf); \
} \
@@ -89,23 +94,96 @@ void lyra2v2_hash(void *state, const void *input)
#endif
static bool init[MAX_GPUS] = { 0 };
static uint32_t throughput[MAX_GPUS] = { 0 };
extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
int dev_id = device_map[thr_id];
int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 18;
uint32_t throughput = cuda_default_throughput(dev_id, 1UL << intensity);
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark)
ptarget[7] = 0x000f;
if (!init[thr_id])
{
size_t matrix_sz = 16 * sizeof(uint64_t) * 4 * 3;
int dev_id = device_map[thr_id];
cudaDeviceProp props;
cudaGetDeviceProperties(&props, dev_id);
int intensity = 0;
// Pascal
if (strstr(props.name, "1080")) intensity = 22;
else if (strstr(props.name, "1070")) intensity = 21;
// Maxwell
else if (strstr(props.name, "TITAN X")) intensity = 21;
else if (strstr(props.name, "980")) intensity = 21;
else if (strstr(props.name, "970")) intensity = 20;
else if (strstr(props.name, "960")) intensity = 20;
else if (strstr(props.name, "950")) intensity = 19;
else if (strstr(props.name, "750 Ti")) intensity = 19;
else if (strstr(props.name, "750")) intensity = 18;
// Kepler to Fermi
else if (strstr(props.name, "TITAN Z")) intensity = 20;
else if (strstr(props.name, "TITAN")) intensity = 19;
else if (strstr(props.name, "780")) intensity = 19;
else if (strstr(props.name, "760")) intensity = 18;
else if (strstr(props.name, "730")) intensity = 16;
else if (strstr(props.name, "720")) intensity = 15;
else if (strstr(props.name, "710")) intensity = 16;
else if (strstr(props.name, "690")) intensity = 20;
else if (strstr(props.name, "680")) intensity = 19;
else if (strstr(props.name, "660")) intensity = 18;
else if (strstr(props.name, "650 Ti")) intensity = 18;
else if (strstr(props.name, "640")) intensity = 17;
else if (strstr(props.name, "630")) intensity = 16;
else if (strstr(props.name, "620")) intensity = 15;
else if (strstr(props.name, "90")) intensity = 18; //590
else if (strstr(props.name, "80")) intensity = 18; //480 580
else if (strstr(props.name, "70")) intensity = 18; //470 570 670 770
else if (strstr(props.name, "65")) intensity = 17; //465
else if (strstr(props.name, "60")) intensity = 17; //460 560
else if (strstr(props.name, "55")) intensity = 17; //555
else if (strstr(props.name, "50")) intensity = 17; //450 550Ti 650
else if (strstr(props.name, "45")) intensity = 16; //545
else if (strstr(props.name, "40")) intensity = 15; //440
else if (strstr(props.name, "30")) intensity = 15; //430 530
else if (strstr(props.name, "20")) intensity = 14; //420 520
else if (strstr(props.name, "10")) intensity = 14; //510 610
if (intensity != 0 && opt_eco_mode) intensity -= 3;
if (intensity == 0)
{
intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 18;
throughput[thr_id] = cuda_default_throughput(dev_id, 1UL << (int)intensity);
}
else
{
//uint32_t adds = 0;
// double d = floor(intensity);
/* if ((intensity - d) > 0.0) {
adds = (uint32_t)floor((intensity - d) * (1 << (int)(d - 10.0)) * 1024);
throughput = (1 << (int)d) + adds;
gpulog(LOG_INFO, thr_id, "Adding %u threads to intensity %u, %u cuda threads",
adds, (int)d, throughput);
}
else if (gpus_intensity[n] != (1 << (int)intensity)) {
throughput = (1 << (int)intensity);
applog(LOG_INFO, "Intensity set to %u, %u cuda threads",
v, gpus_intensity[n]);
}
*/
uint32_t temp = 1UL << intensity;
throughput[thr_id] = cuda_default_throughput(dev_id, temp);
if (temp == throughput[thr_id])
{
gpulog(LOG_INFO, thr_id, "Intensity set to %u, %u cuda threads",
intensity, throughput[thr_id]);
}
}
cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
@@ -113,52 +191,84 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
CUDA_LOG_ERROR();
}
cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
blake256_cpu_init(thr_id, throughput);
keccak256_cpu_init(thr_id,throughput);
skein256_cpu_init(thr_id, throughput);
bmw256_cpu_init(thr_id, throughput);
//blake256_cpu_init(thr_id, throughput);
//keccak256_cpu_init(thr_id,throughput);
skein256_cpu_init(thr_id, throughput[thr_id]);
bmw256_cpu_init(thr_id, throughput[thr_id]);
// SM 3 implementation requires a bit more memory
if (device_sm[dev_id] < 500 || cuda_arch[dev_id] < 500)
matrix_sz = 16 * sizeof(uint64_t) * 4 * 4;
CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput));
lyra2v2_cpu_init(thr_id, throughput, d_matrix[thr_id]);
//if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300)
// matrix_sz = 16 * sizeof(uint64_t) * 4 * 4;
//else
size_t matrix_sz = sizeof(uint64_t) * 4 * 4;
CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput[thr_id]));
lyra2v2_cpu_init(thr_id, throughput[thr_id], d_matrix[thr_id]);
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput[thr_id]));
api_set_throughput(thr_id, throughput);
api_set_throughput(thr_id, throughput[thr_id]);
init[thr_id] = true;
}
else throughput[thr_id] = min(throughput[thr_id], max_nonce - first_nonce);
uint32_t endiandata[20];
for (int k=0; k < 20; k++)
for (int k = 0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
blake256_cpu_setBlock_80(pdata);
bmw256_setTarget(ptarget);
//bmw256_setTarget(ptarget);
//uint32_t *vhash64[2];
do {
int order = 0;
uint32_t foundNonces[2] = { 0, 0 };
blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
blakeKeccak256_cpu_hash_80(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
//blakeKeccakcube256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
TRACE("blake :");
keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
//keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
TRACE("keccak :");
cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
cubehash256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
TRACE("cube :");
lyra2v2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
lyra2v2_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
TRACE("lyra2 :");
skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
skein256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
TRACE("skein :");
cubehash256_cpu_hash_32(thr_id, throughput,pdata[19], d_hash[thr_id], order++);
cubehash256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++);
TRACE("cube :");
bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonces);
bmw256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], foundNonces, ptarget[7]);
//bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonces, ptarget[7], vhash64);
*hashes_done = pdata[19] - first_nonce + throughput[thr_id];
*hashes_done = pdata[19] - first_nonce + throughput;
/*if (foundNonces[1] != 0)
{
if (fulltest(vhash64[0], ptarget))
{
gpulog(LOG_WARNING, thr_id, "result two foundNonces!");
pdata[19] = foundNonces[1];
pdata[21] = foundNonces[0];
work_set_target_ratio(work, vhash64[0]);
if (bn_hash_target_ratio(vhash64[1], ptarget) > work->shareratio) {
work_set_target_ratio(work, vhash64[1]);
}
return 2;
}
}
if (foundNonces[0] != 0)
{
if (fulltest(vhash64[0], ptarget))
{
gpulog(LOG_WARNING, thr_id, "result one foundNonce!");
pdata[19] = foundNonces[0];
work_set_target_ratio(work, vhash64[0]);
return 1;
}
}*/
if (foundNonces[0] != 0)
{
@@ -176,25 +286,25 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
be32enc(&endiandata[19], foundNonces[1]);
lyra2v2_hash(vhash64, endiandata);
pdata[21] = foundNonces[1];
xchg(pdata[19], pdata[21]);
if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio) {
work_set_target_ratio(work, vhash64);
xchg(pdata[19], pdata[21]);
}
res++;
}
return res;
}
else
else if (vhash64[7] > ptarget[7])
{
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonces[0]);
}
}
if ((uint64_t)throughput + pdata[19] >= max_nonce) {
if ((uint64_t)throughput[thr_id] + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}
pdata[19] += throughput;
pdata[19] += throughput[thr_id];
} while (!work_restart[thr_id].restart && !abort_flag);
@@ -214,7 +324,7 @@ extern "C" void free_lyra2v2(int thr_id)
cudaFree(d_matrix[thr_id]);
bmw256_cpu_free(thr_id);
keccak256_cpu_free(thr_id);
//keccak256_cpu_free(thr_id);
init[thr_id] = false;
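One detail worth noting in the per-device intensity table above: eco mode simply subtracts 3 from the chosen intensity, and since the thread count is 1 << intensity, that launches 8x fewer CUDA threads per pass. A sketch with an illustrative starting value:

int intensity = 20;                 // e.g. the table's pick for a GTX 970
if (opt_eco_mode) intensity -= 3;   // 1<<20 -> 1<<17: 8x fewer threads, less power
uint32_t throughput = 1U << intensity;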

4
miner.h

@@ -445,6 +445,7 @@ struct option {
#endif
extern int options_count();
extern bool opt_eco_mode;
extern bool opt_benchmark;
extern bool opt_debug;
extern bool opt_quiet;
@@ -646,6 +647,9 @@ struct work {
/* pok getwork txs */
uint32_t tx_count;
struct tx txs[POK_MAX_TXS];
char *txs2;
char *workid;
};
#define POK_BOOL_MASK 0x00008000

1834
neoscrypt/cuda_neoscrypt.cu

File diff suppressed because it is too large

4
neoscrypt/cuda_vectors.h

@@ -482,7 +482,7 @@ static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift
// require a uint32_t[9] ret array
// note: djm neoscrypt implementation is near the limits of gpu capabilities
// and weird behaviors can happen when tuning device functions code...
__device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
__device__ __forceinline__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
{
uint8_t *v = (uint8_t*) &vec4.s0;
uint8_t *r = (uint8_t*) ret;
@@ -496,7 +496,7 @@ __device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
#else
// same for SM 3.5+, really faster ?
__device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
__device__ __forceinline__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift)
{
uint32_t truc = 0, truc2 = cuda_swab32(vec4.s7), truc3 = 0;
asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift));

80
neoscrypt/neoscrypt.cpp

@@ -1,11 +1,14 @@
#include <cuda_runtime.h>
#include "miner.h"
#include "neoscrypt/neoscrypt.h"
#include <string.h>
#include <miner.h>
extern void neoscrypt_setBlockTarget(uint32_t * data, const void *ptarget);
extern void neoscrypt_cpu_init(int thr_id, uint32_t threads);
extern void neoscrypt_cpu_free(int thr_id);
extern uint32_t neoscrypt_cpu_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, int have_stratum, int order);
#include "neoscrypt.h"
extern void neoscrypt_setBlockTarget(uint32_t* const data, uint32_t* const ptarget);
extern void neoscrypt_init_2stream(int thr_id, uint32_t threads);
extern void neoscrypt_free_2stream(int thr_id);
extern void neoscrypt_hash_k4_2stream(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum);
static bool init[MAX_GPUS] = { 0 };
@@ -18,6 +21,17 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign
int dev_id = device_map[thr_id];
int intensity = is_windows() ? 18 : 19;
// Pascal
if (strstr(device_name[dev_id], "GTX 10")) intensity = 22;
// Maxwell
else if (strstr(device_name[dev_id], "TITAN X")) intensity = 21;
else if (strstr(device_name[dev_id], "980")) intensity = 21;
else if (strstr(device_name[dev_id], "970")) intensity = 20;
else if (strstr(device_name[dev_id], "960")) intensity = 20;
else if (strstr(device_name[dev_id], "950")) intensity = 19;
else if (strstr(device_name[dev_id], "750 Ti")) intensity = 19;
else if (strstr(device_name[dev_id], "750")) intensity = 19;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
throughput = throughput / 32; /* set for max intensity ~= 20 */
api_set_throughput(thr_id, throughput);
@@ -31,16 +45,20 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign
{
cudaDeviceSynchronize();
cudaSetDevice(dev_id);
cudaDeviceSetCacheConfig(cudaFuncCachePreferL1);
cudaGetLastError(); // reset errors if device is not "reset"
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
cudaGetLastError(); // reset errors if device is not "reset"
}
if (device_sm[dev_id] <= 300) {
applog(LOG_ERR, "Sorry neoscrypt is not supported on SM 3.0 devices");
gpulog(LOG_ERR, thr_id, "Sorry neoscrypt is not supported on SM 3.0 devices");
proper_exit(EXIT_CODE_CUDA_ERROR);
}
applog(LOG_INFO, "GPU #%d: Using %d cuda threads", dev_id, throughput);
neoscrypt_cpu_init(thr_id, throughput);
gpulog(LOG_INFO, thr_id, "Using %d cuda threads", throughput);
neoscrypt_init_2stream(thr_id, throughput);
init[thr_id] = true;
}
@@ -48,34 +66,39 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign
if (have_stratum) {
for (int k = 0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
} else {
}
else {
for (int k = 0; k < 20; k++)
endiandata[k] = pdata[k];
}
neoscrypt_setBlockTarget(endiandata,ptarget);
neoscrypt_setBlockTarget(endiandata, ptarget);
do {
uint32_t foundNonce = neoscrypt_cpu_hash_k4(thr_id, throughput, pdata[19], have_stratum, 0);
if (foundNonce != UINT32_MAX)
{
uint32_t _ALIGN(64) vhash64[8];
uint32_t foundNonces[2] = { UINT32_MAX, UINT32_MAX };
neoscrypt_hash_k4_2stream(thr_id, throughput, pdata[19], foundNonces, have_stratum);
*hashes_done = pdata[19] - first_nonce + 1;
*hashes_done = pdata[19] - first_nonce + throughput;
if (foundNonces[0] != UINT32_MAX)
{
uint32_t _ALIGN(64) vhash[8];
if (have_stratum) {
be32enc(&endiandata[19], foundNonce);
} else {
endiandata[19] = foundNonce;
be32enc(&endiandata[19], foundNonces[0]);
}
else {
endiandata[19] = foundNonces[0];
}
neoscrypt((uchar*)vhash64, (uchar*) endiandata, 0x80000620U);
neoscrypt((uchar*)vhash, (uchar*)endiandata, 0x80000620U);
if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
work_set_target_ratio(work, vhash64);
pdata[19] = foundNonce;
if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
work_set_target_ratio(work, vhash);
pdata[19] = foundNonces[0];
return 1;
} else {
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce);
}
else {
gpulog(LOG_WARNING, thr_id, "nonce %08x does not validate on CPU!", foundNonces[0]);
}
}
@@ -100,8 +123,9 @@ void free_neoscrypt(int thr_id)
cudaThreadSynchronize();
neoscrypt_cpu_free(thr_id);
neoscrypt_free_2stream(thr_id);
init[thr_id] = false;
cudaDeviceSynchronize();
}
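The scan loop above follows the usual ccminer validation idiom: any nonce the GPU flags is recomputed with the CPU reference neoscrypt() (0x80000620 is the profile constant already passed in the code above) and submitted only if the full 256-bit test passes. In isolation (a sketch; `candidate` is an illustrative name):

uint32_t _ALIGN(64) vhash[8];
if (have_stratum) be32enc(&endiandata[19], candidate);
else              endiandata[19] = candidate;
neoscrypt((uchar*)vhash, (uchar*)endiandata, 0x80000620U);
bool valid = (vhash[7] <= ptarget[7]) && fulltest(vhash, ptarget);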

4
nvml.cpp

@@ -49,7 +49,7 @@ uint32_t limit_prev[MAX_GPUS] = { 0 };
static void *wrap_dlopen(const char *filename) {
HMODULE h = LoadLibrary(filename);
if (!h && opt_debug) {
applog(LOG_DEBUG, "dlopen(%d): failed to load %s",
applog(LOG_DEBUG, "dlopen(%d): failed to load %s",
GetLastError(), filename);
}
return (void*)h;
@@ -68,7 +68,7 @@ uint32_t limit_prev[MAX_GPUS] = { 0 };
static void *wrap_dlopen(const char *filename) {
void *h = dlopen(filename, RTLD_NOW);
if (h == NULL && opt_debug) {
applog(LOG_DEBUG, "dlopen(%d): failed to load %s",
applog(LOG_DEBUG, "dlopen(%d): failed to load %s",
errno, filename);
}
return (void*)h;

7
quark/cuda_quark_blake512_sp.cuh

@@ -21,12 +21,7 @@ static __device__ __forceinline__ uint2 cuda_swap(uint2 v) {
v.y = t;
return v;
}
static __device__ __forceinline__ uint2 eorswap32(uint2 u, uint2 v) {
uint2 result;
result.y = u.x ^ v.x;
result.x = u.y ^ v.y;
return result;
}
__constant__ uint2 c_512_u2[16] =
{

2
util.cpp

@@ -559,7 +559,7 @@ static json_t *json_rpc_call(CURL *curl, const char *url,
res_val = json_object_get(val, "result");
err_val = json_object_get(val, "error");
if (!res_val || json_is_null(res_val) ||
if (!res_val || //json_is_null(res_val) ||
(err_val && !json_is_null(err_val))) {
char *s = NULL;
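The util.cpp change relaxes the JSON-RPC response check: a "result" that is JSON null no longer rejects the reply by itself; only a missing "result" or a non-null "error" does (presumably to accept pools that answer {"result":null,"error":null}). The relaxed condition in isolation (jansson API, a sketch):

json_t *res_val = json_object_get(val, "result");
json_t *err_val = json_object_get(val, "error");
// old: bad if result missing, result null, or error set
// new: a null result alone is accepted
bool bad = !res_val || (err_val && !json_is_null(err_val));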
