lyra2z algo (temporary algo)

based on djm34 version, cleaned up and adapted to ccminer 2.0
2025-03-13 05:51:19 +00:00 · 2017-03-07 20:05:03 +01:00 · 2017-03-07 20:05:03 +01:00 · 1b7c2fc296
commit 1b7c2fc296
parent ebef3c8656
14 changed files with 2252 additions and 13 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -34,6 +34,7 @@ ccminer_SOURCES	= elist.h miner.h compat.h \
 			  lyra2/Lyra2.c lyra2/Sponge.c \
 			  lyra2/lyra2RE.cu lyra2/cuda_lyra2.cu \
 		          lyra2/lyra2REv2.cu lyra2/cuda_lyra2v2.cu \
+			  lyra2/Lyra2Z.c lyra2/lyra2Z.cu lyra2/cuda_lyra2Z.cu \
 			  Algo256/cuda_bmw256.cu Algo256/cuda_cubehash256.cu \
 			  Algo256/cuda_blake256.cu Algo256/cuda_groestl256.cu Algo256/cuda_keccak256.cu Algo256/cuda_skein256.cu \
 			  Algo256/blake256.cu Algo256/decred.cu Algo256/vanilla.cu Algo256/keccak256.cu \
--- a/algos.h
+++ b/algos.h
@ -26,6 +26,7 @@ enum sha_algos {
 	ALGO_LUFFA,
 	ALGO_LYRA2,
 	ALGO_LYRA2v2,
+	ALGO_LYRA2Z,
 	ALGO_MJOLLNIR,		/* Hefty hash */
 	ALGO_MYR_GR,
 	ALGO_NEOSCRYPT,
@ -82,6 +83,7 @@ static const char *algo_names[] = {
 	"luffa",
 	"lyra2",
 	"lyra2v2",
+	"lyra2z",
 	"mjollnir",
 	"myr-gr",
 	"neoscrypt",
--- a/bench.cpp
+++ b/bench.cpp
@ -63,6 +63,7 @@ void algo_free_all(int thr_id)
 	free_luffa(thr_id);
 	free_lyra2(thr_id);
 	free_lyra2v2(thr_id);
+	free_lyra2Z(thr_id);
 	free_myriad(thr_id);
 	free_neoscrypt(thr_id);
 	free_nist5(thr_id);
--- a/ccminer.cpp
+++ b/ccminer.cpp
@ -244,6 +244,7 @@ Options:\n\
 			luffa       Joincoin\n\
 			lyra2       CryptoCoin\n\
 			lyra2v2     VertCoin\n\
+			lyra2z      ZeroCoin (3rd impl)\n\
 			mjollnir    Mjollnircoin\n\
 			myr-gr      Myriad-Groestl\n\
 			neoscrypt   FeatherCoin, Phoenix, UFO...\n\
@ -1616,6 +1617,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 		case ALGO_GROESTL:
 		case ALGO_LBRY:
 		case ALGO_LYRA2v2:
+		case ALGO_LYRA2Z:
 			work_set_target(work, sctx->job.diff / (256.0 * opt_difficulty));
 			break;
 		case ALGO_KECCAK:
@ -2131,6 +2133,7 @@ static void *miner_thread(void *userdata)
 				minmax = 0x300000;
 				break;
 			case ALGO_LYRA2:
+			case ALGO_LYRA2Z:
 			case ALGO_NEOSCRYPT:
 			case ALGO_SIB:
 			case ALGO_SCRYPT:
@ -2272,6 +2275,9 @@ static void *miner_thread(void *userdata)
 		case ALGO_LYRA2v2:
 			rc = scanhash_lyra2v2(thr_id, &work, max_nonce, &hashes_done);
 			break;
+		case ALGO_LYRA2Z:
+			rc = scanhash_lyra2Z(thr_id, &work, max_nonce, &hashes_done);
+			break;
 		case ALGO_NEOSCRYPT:
 			rc = scanhash_neoscrypt(thr_id, &work, max_nonce, &hashes_done);
 			break;
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -39,10 +39,10 @@
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings" Condition="'$(Platform)'=='Win32'">
-    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.5.props" />
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 6.5.props" />
  </ImportGroup>
  <ImportGroup Label="ExtensionSettings" Condition="'$(Platform)'=='x64'">
-    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 8.0.props" />
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 6.5.props" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
@ -256,7 +256,7 @@
    <ClCompile Include="myriadgroestl.cpp" />
    <ClCompile Include="lyra2\Lyra2.c" />
    <ClCompile Include="lyra2\Sponge.c" />
-    <ClInclude Include="lyra2\cuda_lyra2_sm2.cuh" />
+    <ClCompile Include="lyra2\Lyra2Z.c" />
    <ClInclude Include="neoscrypt\neoscrypt.h" />
    <ClCompile Include="neoscrypt\neoscrypt.cpp" />
    <ClCompile Include="neoscrypt\neoscrypt-cpu.c" />
@ -383,7 +383,7 @@
    <ClInclude Include="uint256.h" />
    <ClInclude Include="lyra2\Lyra2.h" />
    <ClInclude Include="lyra2\Sponge.h" />
-    <ClInclude Include="lyra2\cuda_lyra2v2_sm3.cuh" />
+    <ClInclude Include="lyra2\Lyra2Z.h" />
    <ClInclude Include="quark\groestl_transf_quad.h" />
    <ClInclude Include="quark\groestl_functions_quad.h" />
    <ClInclude Include="quark\cuda_quark.h" />
@ -505,6 +505,11 @@
    <CudaCompile Include="lyra2\cuda_lyra2.cu" />
    <CudaCompile Include="lyra2\lyra2REv2.cu" />
    <CudaCompile Include="lyra2\cuda_lyra2v2.cu" />
+    <ClInclude Include="lyra2\cuda_lyra2_sm2.cuh" />
+    <ClInclude Include="lyra2\cuda_lyra2v2_sm3.cuh" />
+    <CudaCompile Include="lyra2\lyra2Z.cu" />
+    <CudaCompile Include="lyra2\cuda_lyra2Z.cu" />
+    <ClInclude Include="lyra2\cuda_lyra2Z_sm5.cuh" />
    <CudaCompile Include="sia\sia.cu" />
    <CudaCompile Include="skein.cu">
      <MaxRegCount>64</MaxRegCount>
@ -567,11 +572,8 @@
    <Text Include="README.txt" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
-  <ImportGroup Label="ExtensionTargets" Condition="'$(Platform)'=='Win32'">
-    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 7.5.targets" />
-  </ImportGroup>
-  <ImportGroup Label="ExtensionTargets" Condition="'$(Platform)'=='x64'">
-    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 8.0.targets" />
+  <ImportGroup Label="ExtensionTargets">
+    <Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 6.5.targets" />
  </ImportGroup>
  <!-- Copy the required dlls -->
  <Target Name="AfterBuild">
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@ -255,6 +255,9 @@
    <ClCompile Include="lyra2\Sponge.c">
      <Filter>Source Files\sph</Filter>
    </ClCompile>
+    <ClCompile Include="lyra2\Lyra2Z.c">
+      <Filter>Source Files\sph</Filter>
+    </ClCompile>
    <ClCompile Include="scrypt.cpp">
      <Filter>Source Files\CUDA\scrypt</Filter>
    </ClCompile>
@ -473,6 +476,9 @@
    <ClInclude Include="lyra2\Lyra2.h">
      <Filter>Header Files\lyra2</Filter>
    </ClInclude>
+    <ClInclude Include="lyra2\Lyra2Z.h">
+      <Filter>Header Files\lyra2</Filter>
+    </ClInclude>
    <ClInclude Include="lyra2\Sponge.h">
      <Filter>Header Files\lyra2</Filter>
    </ClInclude>
@ -506,6 +512,9 @@
    <ClInclude Include="lyra2\cuda_lyra2_sm2.cuh">
      <Filter>Source Files\CUDA\lyra2</Filter>
    </ClInclude>
+    <ClInclude Include="lyra2\cuda_lyra2Z_sm5.cuh">
+      <Filter>Source Files\CUDA\lyra2</Filter>
+    </ClInclude>
    <ClInclude Include="quark\cuda_quark_blake512_sp.cuh">
      <Filter>Source Files\CUDA\quark</Filter>
    </ClInclude>
@ -820,6 +829,12 @@
    <CudaCompile Include="lyra2\lyra2REv2.cu">
      <Filter>Source Files\CUDA\lyra2</Filter>
    </CudaCompile>
+    <CudaCompile Include="lyra2\cuda_lyra2Z.cu">
+      <Filter>Source Files\CUDA\lyra2</Filter>
+    </CudaCompile>
+    <CudaCompile Include="lyra2\lyra2Z.cu">
+      <Filter>Source Files\CUDA\lyra2</Filter>
+    </CudaCompile>
    <CudaCompile Include="Algo256\blake2s.cu">
      <Filter>Source Files\CUDA\Algo256</Filter>
    </CudaCompile>
--- a/lyra2/Lyra2Z.c
+++ b/lyra2/Lyra2Z.c
@ -0,0 +1,215 @@
+/**
+ * Implementation of the Lyra2 Password Hashing Scheme (PHS).
+ *
+ * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
+ *
+ * This software is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "Lyra2Z.h"
+#include "Sponge.h"
+
+/**
+ * Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
+ * whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits,
+ * where "b" is the underlying sponge's bitrate). In this implementation, the "basil" is composed by all
+ * integer parameters (treated as type "unsigned int") in the order they are provided, plus the value
+ * of nCols, (i.e., basil = kLen || pwdlen || saltlen || timeCost || nRows || nCols).
+ *
+ * @param K The derived key to be output by the algorithm
+ * @param kLen Desired key length
+ * @param pwd User password
+ * @param pwdlen Password length
+ * @param salt Salt
+ * @param saltlen Salt length
+ * @param timeCost Parameter to determine the processing time (T)
+ * @param nRows Number of rows of the memory matrix (R)
+ * @param nCols Number of columns of the memory matrix (C)
+ *
+ * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
+ */
+int LYRA2Z(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols)
+{
+	//============================= Basic variables ============================//
+	int64_t row = 2; //index of row to be processed
+	int64_t prev = 1; //index of prev (last row ever computed/modified)
+	int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
+	int64_t tau; //Time Loop iterator
+	int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
+	int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
+	int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
+	int64_t i; //auxiliary iteration counter
+	int64_t v64; // 64bit var for memcpy
+	//==========================================================================/
+
+	//========== Initializing the Memory Matrix and pointers to it =============//
+	//Tries to allocate enough space for the whole memory matrix
+
+	const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
+	const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
+	//Length (in 64-bit words) of the blocks absorbed while feeding pad(pwd || salt || basil)
+	const int64_t BLOCK_LEN = BLOCK_LEN_BLAKE2_SAFE_INT64;
+
+	size_t sz = (size_t)ROW_LEN_BYTES * nRows;
+	uint64_t *wholeMatrix = malloc(sz);
+	if (wholeMatrix == NULL) {
+		return -1;
+	}
+	memset(wholeMatrix, 0, sz);
+
+	//Allocates pointers to each row of the matrix
+	uint64_t **memMatrix = malloc(sizeof(uint64_t*) * nRows);
+	if (memMatrix == NULL) {
+		free(wholeMatrix); //the matrix was already allocated: release it instead of leaking it
+		return -1;
+	}
+	//Places the pointers in the correct positions
+	uint64_t *ptrWord = wholeMatrix;
+	for (i = 0; i < nRows; i++) {
+		memMatrix[i] = ptrWord;
+		ptrWord += ROW_LEN_INT64;
+	}
+	//==========================================================================/
+
+	//============= Getting the password + salt + basil padded with 10*1 ===============//
+	//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
+	//but this ensures that the password copied locally will be overwritten as soon as possible
+
+	//First, we clean enough blocks for the password, salt, basil and padding
+	int64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof(uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
+
+	byte *ptrByte = (byte*) wholeMatrix;
+
+	//Prepends the password
+	memcpy(ptrByte, pwd, pwdlen);
+	ptrByte += pwdlen;
+
+	//Concatenates the salt
+	memcpy(ptrByte, salt, saltlen);
+	ptrByte += saltlen;
+
+	//Zeroes the remainder of the input blocks (basil and padding are written over it below)
+	memset(ptrByte, 0, (size_t) (nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - (saltlen + pwdlen)));
+
+	//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
+	//(each one is stored as a 64-bit value)
+	memcpy(ptrByte, &kLen, sizeof(int64_t));
+	ptrByte += sizeof(uint64_t);
+	v64 = pwdlen;
+	memcpy(ptrByte, &v64, sizeof(int64_t));
+	ptrByte += sizeof(uint64_t);
+	v64 = saltlen;
+	memcpy(ptrByte, &v64, sizeof(int64_t));
+	ptrByte += sizeof(uint64_t);
+	v64 = timeCost;
+	memcpy(ptrByte, &v64, sizeof(int64_t));
+	ptrByte += sizeof(uint64_t);
+	v64 = nRows;
+	memcpy(ptrByte, &v64, sizeof(int64_t));
+	ptrByte += sizeof(uint64_t);
+	v64 = nCols;
+	memcpy(ptrByte, &v64, sizeof(int64_t));
+	ptrByte += sizeof(uint64_t);
+
+	//Now comes the padding
+	*ptrByte = 0x80; //first byte of padding: right after the basil
+	ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
+	ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
+	*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
+	//==========================================================================/
+
+	//======================= Initializing the Sponge State ====================//
+	//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
+	uint64_t state[16];
+	initState(state);
+	//==========================================================================/
+
+	//================================ Setup Phase =============================//
+	//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
+	ptrWord = wholeMatrix;
+	for (i = 0; i < nBlocksInput; i++) {
+		absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil)
+		ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
+	}
+
+	//Initializes M[0] and M[1]
+	reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here
+
+	reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols);
+
+	do {
+		//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
+
+		reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
+
+		//updates the value of row* (deterministically picked during Setup))
+		rowa = (rowa + step) & (window - 1);
+		//update prev: it now points to the last row ever computed
+		prev = row;
+		//updates row: goes to the next row to be computed
+		row++;
+
+		//Checks if all rows in the window were visited.
+		if (rowa == 0) {
+			step = window + gap; //changes the step: approximately doubles its value
+			window *= 2; //doubles the size of the re-visitation window
+			gap = -gap; //inverts the modifier to the step
+		}
+
+	} while (row < nRows);
+	//==========================================================================/
+
+	//============================ Wandering Phase =============================//
+	row = 0; //Resets the visitation to the first row of the memory matrix
+	for (tau = 1; tau <= timeCost; tau++) {
+		//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
+		step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
+		do {
+			//Selects a pseudorandom index row*
+			//------------------------------------------------------------------------------------------
+			rowa = state[0] & (unsigned int)(nRows-1);  //(USE THIS IF nRows IS A POWER OF 2)
+			//rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+			//------------------------------------------------------------------------------------------
+
+			//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
+			reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
+
+			//update prev: it now points to the last row ever computed
+			prev = row;
+
+			//updates row: goes to the next row to be computed
+			//------------------------------------------------------------------------------------------
+			row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
+			//row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
+			//------------------------------------------------------------------------------------------
+
+		} while (row != 0);
+	}
+
+	//============================ Wrap-up Phase ===============================//
+	//Absorbs the first block of the last row* picked during the Wandering phase
+	absorbBlock(state, memMatrix[rowa]);
+
+	//Squeezes the key
+	squeeze(state, K, (unsigned int) kLen);
+
+	//========================= Freeing the memory =============================//
+	free(memMatrix);
+	free(wholeMatrix);
+
+	return 0;
+}
+
--- a/lyra2/Lyra2Z.h
+++ b/lyra2/Lyra2Z.h
@ -0,0 +1,42 @@
+/**
+ * Header file for the Lyra2 Password Hashing Scheme (PHS).
+ *
+ * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
+ *
+ * This software is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef LYRA2Z_H_
+#define LYRA2Z_H_
+
+#include <stdint.h>
+
+//Byte type used for byte-wise access to the memory matrix
+typedef unsigned char byte;
+
+//Block length required so Blake2's Initialization Vector (IV) is not overwritten (THIS SHOULD NOT BE MODIFIED)
+#define BLOCK_LEN_BLAKE2_SAFE_INT64 8                                   //512 bits (=64 bytes, =8 uint64_t)
+#define BLOCK_LEN_BLAKE2_SAFE_BYTES (BLOCK_LEN_BLAKE2_SAFE_INT64 * 8)   //same as above, in bytes
+
+
+//Sponge bitrate block length: taken from BLOCK_LEN_BITS when the build defines it, 768 bits otherwise
+#ifdef BLOCK_LEN_BITS
+        #define BLOCK_LEN_INT64 (BLOCK_LEN_BITS/64)      //Block length, in uint64_t words (derived from BLOCK_LEN_BITS)
+        #define BLOCK_LEN_BYTES (BLOCK_LEN_BITS/8)       //Block length, in bytes
+#else   //default block length: 768 bits
+        #define BLOCK_LEN_INT64 12                       //Block length: 768 bits (=96 bytes, =12 uint64_t)
+        #define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8)    //Block length, in bytes
+#endif
+
+//Derives a kLen-byte key K from pwd and salt; returns 0 on success, -1 when memory allocation fails
+int LYRA2Z(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
+
+#endif /* LYRA2Z_H_ */
--- a/lyra2/cuda_lyra2Z.cu
+++ b/lyra2/cuda_lyra2Z.cu
@ -0,0 +1,966 @@
+/**
+ * Lyra2 (v1) cuda implementation based on djm34 work
+ * tpruvot@github 2015, Nanashi 08/2016 (from 1.8-r2)
+ * Lyra2Z implementation for Zcoin based on all the previous
+ * djm34 2017
+ **/
+
+#include <stdio.h>
+#include <memory.h>
+
+// Threads-per-block constants; TPB30 is the launch bound of lyra2Z_gpu_hash_32_sm2.
+// NOTE(review): the 52/30/20 suffixes presumably name compute capabilities - confirm against the launcher.
+#define TPB52 32
+#define TPB30 160
+#define TPB20 160
+
+#include "cuda_lyra2Z_sm5.cuh"
+
+#ifdef __INTELLISENSE__
+/* Visual Studio IntelliSense only (syntax colouring); never seen by a real compile */
+__device__ uint32_t __shfl(uint32_t a, uint32_t b, uint32_t c);
+#define atomicMin()
+#define __CUDA_ARCH__ 520
+#endif
+
+// Nonce result buffers, one slot per array index (16 slots).
+// NOTE(review): presumably h_ = host copy, d_ = device copy, indexed by GPU thread id - confirm.
+static uint32_t *h_GNonces[16]; // TODO: this needs to be fixed along with the rest of that routine
+static uint32_t *d_GNonces[16];
+
+// reduceDuplexRow(rowIn, rowInOut, rowOut): duplexes one row of the per-thread matrix.
+// Expects `uint2 state[]` and `uint2 Matrix[96][8]` to be visible at the expansion site.
+// For each of the 8 twelve-word blocks i:
+//   1. state[0..11] ^= Matrix[..][rowIn] + Matrix[..][rowInOut]
+//   2. one round_lyra_sm2(state)
+//   3. Matrix[..][rowOut]   ^= state[0..11]
+//   4. Matrix[..][rowInOut] ^= state rotated by one word (word k receives state[(k + 11) % 12])
+// The macro declares its own loop variables `i` and `j`, so the arguments must not be
+// expressions that use identifiers with those names.
+#define reduceDuplexRow(rowIn, rowInOut, rowOut) { \
+	for (int i = 0; i < 8; i++) { \
+		for (int j = 0; j < 12; j++) \
+			state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \
+		round_lyra_sm2(state); \
+		for (int j = 0; j < 12; j++) \
+			Matrix[j + 12 * i][rowOut] ^= state[j]; \
+		Matrix[0 + 12 * i][rowInOut] ^= state[11]; \
+		Matrix[1 + 12 * i][rowInOut] ^= state[0]; \
+		Matrix[2 + 12 * i][rowInOut] ^= state[1]; \
+		Matrix[3 + 12 * i][rowInOut] ^= state[2]; \
+		Matrix[4 + 12 * i][rowInOut] ^= state[3]; \
+		Matrix[5 + 12 * i][rowInOut] ^= state[4]; \
+		Matrix[6 + 12 * i][rowInOut] ^= state[5]; \
+		Matrix[7 + 12 * i][rowInOut] ^= state[6]; \
+		Matrix[8 + 12 * i][rowInOut] ^= state[7]; \
+		Matrix[9 + 12 * i][rowInOut] ^= state[8]; \
+		Matrix[10+ 12 * i][rowInOut] ^= state[9]; \
+		Matrix[11+ 12 * i][rowInOut] ^= state[10]; \
+	} \
+  }
+
+// absorbblock(in): XORs the first 12 words of matrix row `in` into state[0..11] and then
+// applies 12 consecutive round_lyra_sm2 rounds.
+// Expects `uint2 state[]` and `uint2 Matrix[96][8]` to be visible at the expansion site.
+#define absorbblock(in)  { \
+	state[0] ^= Matrix[0][in]; \
+	state[1] ^= Matrix[1][in]; \
+	state[2] ^= Matrix[2][in]; \
+	state[3] ^= Matrix[3][in]; \
+	state[4] ^= Matrix[4][in]; \
+	state[5] ^= Matrix[5][in]; \
+	state[6] ^= Matrix[6][in]; \
+	state[7] ^= Matrix[7][in]; \
+	state[8] ^= Matrix[8][in]; \
+	state[9] ^= Matrix[9][in]; \
+	state[10] ^= Matrix[10][in]; \
+	state[11] ^= Matrix[11][in]; \
+	round_lyra_sm2(state); \
+	round_lyra_sm2(state); \
+	round_lyra_sm2(state); \
+	round_lyra_sm2(state); \
+	round_lyra_sm2(state); \
+	round_lyra_sm2(state); \
+	round_lyra_sm2(state); \
+	round_lyra_sm2(state); \
+	round_lyra_sm2(state); \
+	round_lyra_sm2(state); \
+	round_lyra_sm2(state); \
+	round_lyra_sm2(state); \
+  }
+
+/**
+ * One sponge round over the full 16-word state held by a single thread.
+ * The state is treated as a 4x4 grid of words (row r, column c at s[4*r + c]):
+ * Gfunc is applied to the four columns first and to the four diagonals second.
+ */
+__device__ __forceinline__
+static void round_lyra_sm2(uint2 *s)
+{
+	// column step: (0,4,8,12) (1,5,9,13) (2,6,10,14) (3,7,11,15)
+	#pragma unroll
+	for (int c = 0; c < 4; c++)
+		Gfunc(s[c], s[4 + c], s[8 + c], s[12 + c]);
+
+	// diagonal step: (0,5,10,15) (1,6,11,12) (2,7,8,13) (3,4,9,14)
+	#pragma unroll
+	for (int c = 0; c < 4; c++)
+		Gfunc(s[c], s[4 + ((c + 1) & 3)], s[8 + ((c + 2) & 3)], s[12 + ((c + 3) & 3)]);
+}
+
+/**
+ * Setup-phase duplexing of one matrix row (per-thread matrix variant).
+ *
+ * For each of the 8 twelve-word blocks: absorbs rowIn + rowInOut into the state,
+ * runs one round, writes rowIn ^ state into rowOut with the block order reversed,
+ * and XORs the state rotated by one word into rowInOut.
+ *
+ * @param rowIn    index of the row that is only read
+ * @param rowInOut index of the row that is read and then updated in place
+ * @param rowOut   index of the row that is initialised
+ * @param state    16-word sponge state (only words 0..11 touch the matrix)
+ * @param Matrix   per-thread memory matrix, indexed [word][row]
+ */
+__device__ __forceinline__
+void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2 state[16], uint2 Matrix[96][8])
+{
+#if __CUDA_ARCH__ > 500
+#pragma unroll
+#endif
+	for (int blk = 0; blk < 8; blk++)
+	{
+		const int src = 12 * blk;      // first word of the block being read
+		const int dst = 84 - 12 * blk; // first word of the mirrored block in rowOut
+
+		#pragma unroll
+		for (int w = 0; w < 12; w++)
+			state[w] ^= Matrix[src + w][rowIn] + Matrix[src + w][rowInOut];
+
+		round_lyra_sm2(state);
+
+		#pragma unroll
+		for (int w = 0; w < 12; w++)
+			Matrix[dst + w][rowOut] = Matrix[src + w][rowIn] ^ state[w];
+
+		// rowInOut ^= state rotated by one word: word w receives state[w - 1], word 0 receives state[11]
+		#pragma unroll
+		for (int w = 0; w < 12; w++)
+			Matrix[src + w][rowInOut] ^= state[(w + 11) % 12];
+	}
+}
+
+#if __CUDA_ARCH__ < 350
+
+// Blake2b initialization vector, each 64-bit word stored as { low 32 bits, high 32 bits }
+__constant__ static uint2 blake2b_IV_sm2[8] = {
+	{ 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 },
+	{ 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a },
+	{ 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c },
+	{ 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 }
+};
+
+/**
+ * Lyra2Z kernel in which every thread computes one complete hash on a private matrix
+ * (compiled when __CUDA_ARCH__ < 350).
+ *
+ * Layout: 1D grid, 1D blocks; global thread id = blockDim.x * blockIdx.x + threadIdx.x.
+ * No shared memory is used; the 96x8 matrix is a per-thread local array.
+ *
+ * @param threads     number of hashes to compute (threads above this index do nothing)
+ * @param startNounce nonce associated with thread 0; thread t tests startNounce + t
+ * @param g_hash      input: four 64-bit words per hash, word i of thread t at g_hash[threads*i + t]
+ * @param resNonces   output: resNonces[0] gets the smallest qualifying nonce via atomicMin,
+ *                    the value it held before is folded into resNonces[1]
+ *
+ * The comparison target is read from pTarget, which is declared outside this file section.
+ */
+__global__ __launch_bounds__(TPB30, 1)
+void lyra2Z_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resNonces)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread >= threads)
+		return;
+
+	// Second absorbed block: the six 64-bit parameter words followed by the padding,
+	// in the same order the CPU implementation appends them
+	// (kLen, pwdlen, saltlen, timeCost, nRows, nCols, 0x80 marker, final 0x01 in the last byte).
+	const uint2 Mask[8] = {
+		{ 0x00000020, 0x00000000 },
+		{ 0x00000020, 0x00000000 },
+		{ 0x00000020, 0x00000000 },
+		{ 0x00000008, 0x00000000 },
+		{ 0x00000008, 0x00000000 },
+		{ 0x00000008, 0x00000000 },
+		{ 0x00000080, 0x00000000 },
+		{ 0x00000000, 0x01000000 }
+	};
+
+	uint2 state[16];
+
+	// words 0..3: password (the 32-byte input hash)
+	#pragma unroll
+	for (int w = 0; w < 4; w++) {
+		LOHI(state[w].x, state[w].y, g_hash[threads*w + thread]);
+	}
+
+	// words 4..7: salt (same bytes as the password)
+	#pragma unroll
+	for (int w = 0; w < 4; w++) {
+		state[w + 4] = state[w];
+	}
+
+	// words 8..15: Blake2b IV
+	#pragma unroll
+	for (int w = 0; w < 8; w++) {
+		state[w + 8] = blake2b_IV_sm2[w];
+	}
+
+	// absorb the first block: 12 rounds
+	for (int r = 0; r < 12; r++) {
+		round_lyra_sm2(state);
+	}
+
+	// absorb the parameter/padding block: XOR, then 12 more rounds
+	for (int w = 0; w < 8; w++)
+		state[w] ^= Mask[w];
+
+	for (int r = 0; r < 12; r++) {
+		round_lyra_sm2(state);
+	}
+
+	uint2 Matrix[96][8]; // per-thread matrix, indexed [word][row] (large local array)
+
+	// reducedSqueezeRow0: fill row 0, blocks written in reverse order
+	#pragma unroll 8
+	for (int blk = 0; blk < 8; blk++)
+	{
+		#pragma unroll 12
+		for (int w = 0; w < 12; w++) {
+			Matrix[w + 84 - 12 * blk][0] = state[w];
+		}
+		round_lyra_sm2(state);
+	}
+
+	// reducedDuplexRow1: derive row 1 from row 0
+	#pragma unroll 8
+	for (int blk = 0; blk < 8; blk++)
+	{
+		#pragma unroll 12
+		for (int w = 0; w < 12; w++) {
+			state[w] ^= Matrix[w + 12 * blk][0];
+		}
+		round_lyra_sm2(state);
+		#pragma unroll 12
+		for (int w = 0; w < 12; w++) {
+			Matrix[w + 84 - 12 * blk][1] = Matrix[w + 12 * blk][0] ^ state[w];
+		}
+	}
+
+	// setup phase: rows 2..7
+	reduceDuplexRowSetup(1, 0, 2, state, Matrix);
+	reduceDuplexRowSetup(2, 1, 3, state, Matrix);
+	reduceDuplexRowSetup(3, 0, 4, state, Matrix);
+	reduceDuplexRowSetup(4, 3, 5, state, Matrix);
+	reduceDuplexRowSetup(5, 2, 6, state, Matrix);
+	reduceDuplexRowSetup(6, 1, 7, state, Matrix);
+
+	// wandering phase: 8 passes of 8 row visits each.
+	// Even passes advance the row index by 3, odd passes step it back by 1 (both modulo 8).
+	uint32_t rowa;
+	uint32_t prev = 7;
+	uint32_t iterator = 0;
+	for (uint32_t tau = 0; tau < 8; tau++) {
+		const uint32_t step = (tau & 1) ? 7u : 3u; // +7 is the same as -1 modulo 8
+		for (uint32_t visit = 0; visit < 8; visit++) {
+			rowa = state[0].x & 7;
+			reduceDuplexRow(prev, rowa, iterator);
+			prev = iterator;
+			iterator = (iterator + step) & 7;
+		}
+	}
+
+	// wrap-up: absorb the last row* picked, then test 64-bit word 3 against the target
+	absorbblock(rowa);
+	const uint32_t nonce = startNounce + thread;
+	if (((uint64_t*)state)[3] <= ((uint64_t*)pTarget)[3]) {
+		atomicMin(&resNonces[1], resNonces[0]);
+		atomicMin(&resNonces[0], nonce);
+	}
+}
+#else
+/* empty stub: the kernel body above is only compiled when __CUDA_ARCH__ < 350 */
+__global__ void lyra2Z_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resNonces) {}
+#endif
+
+#if __CUDA_ARCH__ > 500
+
+#include "cuda_lyra2_vectors.h"
+//#include "cuda_vector_uint2x4.h"
+
+// Matrix geometry used by LD4S/ST4S and the duplexing loops below
+#define Nrow 8
+#define Ncol 8
+// Number of uint2 words each thread stores per matrix cell (multiplier in the LD4S/ST4S index)
+#define memshift 3
+
+// Number of leading matrix rows kept in DMatrix instead of shared memory:
+// 0 = every row in shared memory, 8 = every row in DMatrix (see LD4S/ST4S)
+#define BUF_COUNT 0
+
+// Device-side pointer to the matrix storage used for rows below BUF_COUNT
+// NOTE(review): not assigned in this part of the file - presumably set by the host init code; confirm.
+__device__ uint2 *DMatrix;
+
+/**
+ * Loads this thread's three uint2 words of matrix cell (row, col) into res.
+ *
+ * BUF_COUNT selects the backing store at compile time: rows below BUF_COUNT are read
+ * from DMatrix, the remaining rows from the dynamic shared-memory array.
+ *
+ * @param res     receives the three words
+ * @param row     matrix row
+ * @param col     matrix column
+ * @param thread  thread index used for DMatrix addressing (unused when BUF_COUNT == 0)
+ * @param threads thread count used for DMatrix addressing (unused when BUF_COUNT == 0)
+ */
+__device__ __forceinline__
+void LD4S(uint2 res[3], const int row, const int col, const int thread, const int threads)
+{
+#if BUF_COUNT == 0
+	// every row lives in shared memory
+	extern __shared__ uint2 shared_mem[];
+	const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift;
+
+	#pragma unroll
+	for (int w = 0; w < 3; w++)
+		res[w] = shared_mem[((s0 + w) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x];
+#elif BUF_COUNT == 8
+	// every row lives in DMatrix
+	const int d0 = (memshift *(Ncol * row + col) * threads + thread)*blockDim.x + threadIdx.x;
+
+	#pragma unroll
+	for (int w = 0; w < 3; w++)
+		res[w] = DMatrix[d0 + w * threads * blockDim.x];
+#else
+	// mixed: rows [0, BUF_COUNT) in DMatrix, the rest in shared memory
+	extern __shared__ uint2 shared_mem[];
+	const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift;
+	const int d0 = (memshift *(Ncol * row + col) * threads + thread)*blockDim.x + threadIdx.x;
+
+	if (row < BUF_COUNT) {
+		#pragma unroll
+		for (int w = 0; w < 3; w++)
+			res[w] = DMatrix[d0 + w * threads * blockDim.x];
+	} else {
+		#pragma unroll
+		for (int w = 0; w < 3; w++)
+			res[w] = shared_mem[((s0 + w) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x];
+	}
+#endif
+}
+
+/**
+ * Stores this thread's three uint2 words into matrix cell (row, col).
+ *
+ * Counterpart of LD4S with identical addressing: rows below BUF_COUNT are written
+ * to DMatrix, the remaining rows to the dynamic shared-memory array.
+ *
+ * @param row     matrix row
+ * @param col     matrix column
+ * @param data    the three words to store
+ * @param thread  thread index used for DMatrix addressing (unused when BUF_COUNT == 0)
+ * @param threads thread count used for DMatrix addressing (unused when BUF_COUNT == 0)
+ */
+__device__ __forceinline__
+void ST4S(const int row, const int col, const uint2 data[3], const int thread, const int threads)
+{
+#if BUF_COUNT == 0
+	// every row lives in shared memory
+	extern __shared__ uint2 shared_mem[];
+	const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift;
+
+	#pragma unroll
+	for (int w = 0; w < 3; w++)
+		shared_mem[((s0 + w) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data[w];
+#elif BUF_COUNT == 8
+	// every row lives in DMatrix
+	const int d0 = (memshift *(Ncol * row + col) * threads + thread)*blockDim.x + threadIdx.x;
+
+	#pragma unroll
+	for (int w = 0; w < 3; w++)
+		DMatrix[d0 + w * threads * blockDim.x] = data[w];
+#else
+	// mixed: rows [0, BUF_COUNT) in DMatrix, the rest in shared memory
+	extern __shared__ uint2 shared_mem[];
+	const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift;
+	const int d0 = (memshift *(Ncol * row + col) * threads + thread)*blockDim.x + threadIdx.x;
+
+	if (row < BUF_COUNT) {
+	#pragma unroll
+		for (int w = 0; w < 3; w++)
+			DMatrix[d0 + w * threads * blockDim.x] = data[w];
+	} else {
+	#pragma unroll
+		for (int w = 0; w < 3; w++)
+			shared_mem[((s0 + w) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data[w];
+	}
+#endif
+}
+
+#if __CUDA_ARCH__ >= 300
+/**
+ * Returns the value of `a` held by lane `b` of the caller's `c`-lane sub-group.
+ *
+ * The mask-less __shfl intrinsic is deprecated since CUDA 9 and unavailable when
+ * compiling for Volta and newer, so toolkits >= 9.0 use __shfl_sync instead.
+ * Older toolkits keep the original intrinsic, so behaviour there is unchanged.
+ * NOTE(review): the full-warp mask assumes every lane that has not exited reaches
+ * this call together - confirm against the calling kernels.
+ */
+__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
+{
+#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
+	return __shfl_sync(0xFFFFFFFFu, a, b, c);
+#else
+	return __shfl(a, b, c);
+#endif
+}
+
+/**
+ * 64-bit variant: shuffles both 32-bit halves of `a` from lane `b` of the caller's
+ * `c`-lane sub-group.
+ *
+ * The mask-less __shfl intrinsic is deprecated since CUDA 9 and unavailable when
+ * compiling for Volta and newer, so toolkits >= 9.0 use __shfl_sync instead.
+ * Older toolkits keep the original intrinsic, so behaviour there is unchanged.
+ * NOTE(review): the full-warp mask assumes every lane that has not exited reaches
+ * this call together - confirm against the calling kernels.
+ */
+__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
+{
+#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
+	return make_uint2(__shfl_sync(0xFFFFFFFFu, a.x, b, c), __shfl_sync(0xFFFFFFFFu, a.y, b, c));
+#else
+	return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c));
+#endif
+}
+
+// Shuffles three values at once: a1, a2 and a3 are replaced in place by the values held
+// by lanes b1, b2 and b3 respectively, all within sub-groups of width c.
+__device__ __forceinline__
+void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c)
+{
+	a1 = WarpShuffle(a1, b1, c);
+	a2 = WarpShuffle(a2, b2, c);
+	a3 = WarpShuffle(a3, b3, c);
+}
+
+#else
+// Shared-memory emulation of the 32-bit shuffle for targets without the shuffle intrinsic.
+// Each thread borrows one 32-bit slot at the start of shared_mem: it saves the slot's
+// current content, publishes `a`, reads the slot of lane (b mod c) inside its c-aligned
+// group, then restores the saved content.
+// NOTE(review): only __threadfence_block() (a memory fence, not a barrier) separates the
+// steps, so this presumably relies on the exchanging threads running in lockstep - confirm.
+// NOTE(review): this section is opened under `#if __CUDA_ARCH__ > 500`, so this
+// `< 300` fallback looks unreachable here - confirm.
+__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
+{
+	extern __shared__ uint2 shared_mem[];
+
+	// flat index of this thread inside the block
+	const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
+	uint32_t *_ptr = (uint32_t*)shared_mem;
+
+	__threadfence_block();
+	uint32_t buf = _ptr[thread]; // remember what the borrowed slot contained
+
+	_ptr[thread] = a;
+	__threadfence_block();
+	// group base (thread with the low bits cleared) + source lane inside the group
+	uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))];
+
+	__threadfence_block();
+	_ptr[thread] = buf; // put the original content back
+
+	__threadfence_block();
+	return result;
+}
+
+// Shared-memory emulation of the 64-bit (uint2) shuffle; same save / publish / read /
+// restore sequence as the 32-bit fallback, using one uint2 slot per thread.
+// NOTE(review): separated only by __threadfence_block(), not a barrier - presumably
+// relies on the exchanging threads running in lockstep; confirm.
+__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
+{
+	extern __shared__ uint2 shared_mem[];
+
+	// flat index of this thread inside the block
+	const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
+
+	__threadfence_block();
+	uint2 buf = shared_mem[thread]; // remember what the borrowed slot contained
+
+	shared_mem[thread] = a;
+	__threadfence_block();
+	// group base (thread with the low bits cleared) + source lane inside the group
+	uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))];
+
+	__threadfence_block();
+	shared_mem[thread] = buf; // put the original content back
+
+	__threadfence_block();
+	return result;
+}
+
+// Shared-memory emulation of WarpShuffle3: exchanges a1, a2 and a3 one after another
+// through the same borrowed uint2 slot, saving the slot once before the first exchange
+// and restoring it once after the last.
+// NOTE(review): separated only by __threadfence_block(), not a barrier - presumably
+// relies on the exchanging threads running in lockstep; confirm.
+__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c)
+{
+	extern __shared__ uint2 shared_mem[];
+
+	// flat index of this thread inside the block
+	const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
+
+	__threadfence_block();
+	uint2 buf = shared_mem[thread]; // remember what the borrowed slot contained
+
+	// exchange a1 with lane b1 of the group
+	shared_mem[thread] = a1;
+	__threadfence_block();
+	a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))];
+	__threadfence_block();
+	// exchange a2 with lane b2 of the group
+	shared_mem[thread] = a2;
+	__threadfence_block();
+	a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))];
+	__threadfence_block();
+	// exchange a3 with lane b3 of the group
+	shared_mem[thread] = a3;
+	__threadfence_block();
+	a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))];
+
+	__threadfence_block();
+	shared_mem[thread] = buf; // put the original content back
+	__threadfence_block();
+}
+#endif
+
+// One sponge round with the state spread over a group of 4 lanes (shuffle width 4):
+// each lane holds four words s[0..3].
+//  1. Gfunc on the lane's own four words.
+//  2. s[1], s[2], s[3] are fetched from the lanes at offsets +1, +2, +3 within the group.
+//  3. Gfunc again on the re-aligned words.
+//  4. s[1], s[2], s[3] are fetched from offsets +3, +2, +1, which undoes step 2.
+__device__ __forceinline__ void round_lyra(uint2 s[4])
+{
+	Gfunc(s[0], s[1], s[2], s[3]);
+	WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3, 4);
+	Gfunc(s[0], s[1], s[2], s[3]);
+	WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1, 4);
+}
+
+// One sponge round with the whole state held by a single thread as four uint2x4 vectors.
+// The first four Gfunc calls use the same component (x, y, z, w) of all four vectors;
+// the last four use the components shifted by one position per vector
+// (x,y,z,w / y,z,w,x / z,w,x,y / w,x,y,z).
+static __device__ __forceinline__
+void round_lyra(uint2x4* s)
+{
+	Gfunc(s[0].x, s[1].x, s[2].x, s[3].x);
+	Gfunc(s[0].y, s[1].y, s[2].y, s[3].y);
+	Gfunc(s[0].z, s[1].z, s[2].z, s[3].z);
+	Gfunc(s[0].w, s[1].w, s[2].w, s[3].w);
+	Gfunc(s[0].x, s[1].y, s[2].z, s[3].w);
+	Gfunc(s[0].y, s[1].z, s[2].w, s[3].x);
+	Gfunc(s[0].z, s[1].w, s[2].x, s[3].y);
+	Gfunc(s[0].w, s[1].x, s[2].y, s[3].z);
+}
+
+/**
+ * Initialises matrix rows 0 and 1 from the sponge state.
+ *
+ * Pass 1 squeezes row 0: the state is stored column by column in reverse column
+ * order, with one round after each store.
+ * Pass 2 derives row 1: each column of row 0 is absorbed, one round is run, and
+ * (column ^ state) is stored into row 1, again in reverse column order.
+ *
+ * @param state   this lane's four state words (the first three are stored/absorbed)
+ * @param thread  forwarded to LD4S/ST4S
+ * @param threads forwarded to LD4S/ST4S
+ */
+static __device__ __forceinline__
+void reduceDuplex(uint2 state[4], uint32_t thread, const uint32_t threads)
+{
+	uint2 cell[3];
+
+	// pass 1: fill row 0
+#if __CUDA_ARCH__ > 500
+#pragma unroll
+#endif
+	for (int col = 0; col < Nrow; col++)
+	{
+		ST4S(0, Ncol - 1 - col, state, thread, threads);
+
+		round_lyra(state);
+	}
+
+	// pass 2: row 1 = duplex of row 0
+	#pragma unroll 4
+	for (int col = 0; col < Nrow; col++)
+	{
+		LD4S(cell, 0, col, thread, threads);
+		for (int w = 0; w < 3; w++)
+			state[w] ^= cell[w];
+
+		round_lyra(state);
+
+		for (int w = 0; w < 3; w++)
+			cell[w] ^= state[w];
+		ST4S(1, Ncol - 1 - col, cell, thread, threads);
+	}
+}
+
+// Setup step that creates row `rowOut` from rows `rowIn` and `rowInOut`.
+// For every column i it
+//   1. absorbs (rowIn[i] + rowInOut[i]) into the state and runs one round,
+//   2. writes rowIn[i] ^ state to rowOut in reverse column order,
+//   3. XORs the state, rotated by one 64-bit word, back into rowInOut[i].
+// The (row, column) meaning of the LD4S/ST4S arguments is inferred from the
+// call sites - confirm against their definitions.
+static __device__ __forceinline__
+void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], uint32_t thread, const uint32_t threads)
+{
+	uint2 state1[3], state2[3];
+
+	#pragma unroll 1
+	for (int i = 0; i < Nrow; i++)
+	{
+		LD4S(state1, rowIn, i, thread, threads);
+		LD4S(state2, rowInOut, i, thread, threads);
+		for (int j = 0; j < 3; j++)
+			state[j] ^= state1[j] + state2[j];
+
+		round_lyra(state);
+
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			state1[j] ^= state[j];
+
+		ST4S(rowOut, Ncol - i - 1, state1, thread, threads);
+
+		// Receive data from the previous thread (and at the same time send data to the next thread)
+		uint2 Data0 = state[0];
+		uint2 Data1 = state[1];
+		uint2 Data2 = state[2];
+		WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
+
+		// Lane 0 receives from lane 3 and additionally shifts the words by one,
+		// so the 12 words (3 per lane x 4 lanes) end up rotated by one position.
+		if (threadIdx.x == 0)
+		{
+			state2[0] ^= Data2;
+			state2[1] ^= Data0;
+			state2[2] ^= Data1;
+		} else {
+			state2[0] ^= Data0;
+			state2[1] ^= Data1;
+			state2[2] ^= Data2;
+		}
+
+		ST4S(rowInOut, i, state2, thread, threads);
+	}
+}
+
+// Row update that modifies two existing rows in place.
+// For every column i it
+//   1. absorbs (rowIn[i] + rowInOut[i]) into the state and runs one round,
+//   2. XORs the state, rotated by one 64-bit word, into rowInOut[i] and stores it,
+//   3. reloads rowOut[i], XORs the (unrotated) state into it and stores it.
+// Step 3 reloads after step 2 has stored, so when rowOut == rowInOut the value
+// written in step 2 is the one that gets updated.
+// The (row, column) meaning of the LD4S/ST4S arguments is inferred from the
+// call sites - confirm against their definitions.
+static __device__ __forceinline__
+void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads)
+{
+	for (int i = 0; i < Nrow; i++)
+	{
+		uint2 state1[3], state2[3];
+
+		LD4S(state1, rowIn, i, thread, threads);
+		LD4S(state2, rowInOut, i, thread, threads);
+
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			state[j] ^= state1[j] + state2[j];
+
+		round_lyra(state);
+
+		// Receive data from the previous thread (and at the same time send data to the next thread)
+		uint2 Data0 = state[0];
+		uint2 Data1 = state[1];
+		uint2 Data2 = state[2];
+		WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
+
+		// Lane 0 receives from lane 3 and additionally shifts the words by one,
+		// so the 12 words (3 per lane x 4 lanes) end up rotated by one position.
+		if (threadIdx.x == 0)
+		{
+			state2[0] ^= Data2;
+			state2[1] ^= Data0;
+			state2[2] ^= Data1;
+		}
+		else
+		{
+			state2[0] ^= Data0;
+			state2[1] ^= Data1;
+			state2[2] ^= Data2;
+		}
+
+		ST4S(rowInOut, i, state2, thread, threads);
+
+		LD4S(state1, rowOut, i, thread, threads);
+
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			state1[j] ^= state[j];
+
+		ST4S(rowOut, i, state1, thread, threads);
+	}
+}
+
+#if 0
+// Disabled (enclosed in #if 0) variant of the final row step with the input
+// row hard-coded to 2 and the aliasing test hard-coded to row 5.
+// Presumably superseded by reduceDuplexRowt_8_v2, which takes both as
+// parameters - confirm before removing.
+static __device__ __forceinline__
+void reduceDuplexRowt_8(const int rowInOut, uint2* state, const uint32_t thread, const uint32_t threads)
+{
+	uint2 state1[3], state2[3], last[3];
+
+	LD4S(state1, 2, 0, thread, threads);
+	LD4S(last, rowInOut, 0, thread, threads);
+
+	#pragma unroll
+	for (int j = 0; j < 3; j++)
+		state[j] ^= state1[j] + last[j];
+
+	round_lyra(state);
+
+	// Receive data from the previous thread (and at the same time send data to the next thread)
+	uint2 Data0 = state[0];
+	uint2 Data1 = state[1];
+	uint2 Data2 = state[2];
+	WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
+
+	if (threadIdx.x == 0)
+	{
+		last[0] ^= Data2;
+		last[1] ^= Data0;
+		last[2] ^= Data1;
+	} else {
+		last[0] ^= Data0;
+		last[1] ^= Data1;
+		last[2] ^= Data2;
+	}
+
+	if (rowInOut == 5)
+	{
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			last[j] ^= state[j];
+	}
+
+	for (int i = 1; i < Nrow; i++)
+	{
+		LD4S(state1, 2, i, thread, threads);
+		LD4S(state2, rowInOut, i, thread, threads);
+
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			state[j] ^= state1[j] + state2[j];
+
+		round_lyra(state);
+	}
+
+	#pragma unroll
+	for (int j = 0; j < 3; j++)
+		state[j] ^= last[j];
+}
+#endif
+
+// Final row step: same absorb/round sequence as reduceDuplexRowt, but nothing
+// is written back to the matrix. Only the would-be new value of column 0 of
+// rowInOut is tracked (in `last`) and XORed into the state at the end.
+// Note the parameter order (rowIn, rowOut, rowInOut) differs from
+// reduceDuplexRowt (rowIn, rowInOut, rowOut).
+static __device__ __forceinline__
+void reduceDuplexRowt_8_v2(const int rowIn, const int rowOut, const int rowInOut, uint2* state, const uint32_t thread, const uint32_t threads)
+{
+	uint2 state1[3], state2[3], last[3];
+
+	LD4S(state1, rowIn, 0, thread, threads);
+	LD4S(last, rowInOut, 0, thread, threads);
+
+	#pragma unroll
+	for (int j = 0; j < 3; j++)
+		state[j] ^= state1[j] + last[j];
+
+	round_lyra(state);
+
+	// Receive data from the previous thread (and at the same time send data to the next thread)
+	uint2 Data0 = state[0];
+	uint2 Data1 = state[1];
+	uint2 Data2 = state[2];
+	WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
+
+	// Lane 0 receives from lane 3 and additionally shifts the words by one,
+	// so the 12 words (3 per lane x 4 lanes) end up rotated by one position.
+	if (threadIdx.x == 0)
+	{
+		last[0] ^= Data2;
+		last[1] ^= Data0;
+		last[2] ^= Data1;
+	}
+	else {
+		last[0] ^= Data0;
+		last[1] ^= Data1;
+		last[2] ^= Data2;
+	}
+
+	// when the output row is the same row, it would also have received the state
+	if (rowInOut == rowOut) {
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			last[j] ^= state[j];
+	}
+
+	// remaining columns: absorb only, no write-back
+	for (int i = 1; i < Nrow; i++)
+	{
+		LD4S(state1, rowIn, i, thread, threads);
+		LD4S(state2, rowInOut, i, thread, threads);
+
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			state[j] ^= state1[j] + state2[j];
+
+		round_lyra(state);
+	}
+
+	#pragma unroll
+	for (int j = 0; j < 3; j++)
+		state[j] ^= last[j];
+}
+
+// Stage 1: one thread per hash, 1-D launch.
+// Reads four uint2 words per thread from g_hash (stride `threads`), places
+// them in both state[0] and state[1], fills state[2..3] with blake2b_IV,
+// runs 12 rounds, XORs Mask into state[0..1], runs 12 more rounds and stores
+// the four uint2x4 rows to DMatrix (stride `threads`).
+// startNounce is not used by this stage.
+__global__
+__launch_bounds__(64, 1)
+void lyra2Z_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	// constant block XORed into state[0..1] between the two 12-round passes
+	const uint2x4 Mask[2] = {
+		0x00000020UL, 0x00000000UL, 0x00000020UL, 0x00000000UL,
+		0x00000020UL, 0x00000000UL, 0x00000008UL, 0x00000000UL,
+		0x00000008UL, 0x00000000UL, 0x00000008UL, 0x00000000UL,
+		0x00000080UL, 0x00000000UL, 0x00000000UL, 0x01000000UL
+	};
+	// initial values for state[2..3], listed as (low, high) 32-bit pairs
+	const uint2x4 blake2b_IV[2] = {
+		0xf3bcc908lu, 0x6a09e667lu,
+		0x84caa73blu, 0xbb67ae85lu,
+		0xfe94f82blu, 0x3c6ef372lu,
+		0x5f1d36f1lu, 0xa54ff53alu,
+		0xade682d1lu, 0x510e527flu,
+		0x2b3e6c1flu, 0x9b05688clu,
+		0xfb41bd6blu, 0x1f83d9ablu,
+		0x137e2179lu, 0x5be0cd19lu
+	};
+	if (thread < threads)
+	{
+		uint2x4 state[4];
+
+		// the input hash goes into both state[0] and state[1]
+		state[0].x = state[1].x = __ldg(&g_hash[thread + threads * 0]);
+		state[0].y = state[1].y = __ldg(&g_hash[thread + threads * 1]);
+		state[0].z = state[1].z = __ldg(&g_hash[thread + threads * 2]);
+		state[0].w = state[1].w = __ldg(&g_hash[thread + threads * 3]);
+		state[2] = blake2b_IV[0];
+		state[3] = blake2b_IV[1];
+
+		for (int i = 0; i<12; i++)
+			round_lyra(state);
+
+		state[0] ^= Mask[0];
+		state[1] ^= Mask[1];
+
+		for (int i = 0; i<12; i++)
+			round_lyra(state); //because 12 is not enough
+
+		// hand the state over to stage 2 through DMatrix
+		((uint2x4*)DMatrix)[threads * 0 + thread] = state[0];
+		((uint2x4*)DMatrix)[threads * 1 + thread] = state[1];
+		((uint2x4*)DMatrix)[threads * 2 + thread] = state[2];
+		((uint2x4*)DMatrix)[threads * 3 + thread] = state[3];
+	}
+}
+
+// Stage 2: matrix setup and row-wandering phase.
+// Launched with a 2-D block of (4, tpb / 4): threadIdx.x selects one of the
+// four lanes cooperating on a hash, threadIdx.y selects the hash in the block.
+// The per-lane state is read from DMatrix, the matrix rows are built, 63
+// reduceDuplexRowt steps plus one final reduceDuplexRowt_8_v2 step are
+// applied, and the state is written back to DMatrix.
+// startNounce and g_hash are not used by this stage.
+__global__
+__launch_bounds__(TPB52, 1)
+void lyra2Z_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
+{
+	const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y;
+
+	if (thread < threads)
+	{
+		uint2 state[4];
+
+		#pragma unroll
+		for (uint32_t w = 0; w < 4; w++)
+			state[w] = __ldg(&DMatrix[(w * threads + thread) * blockDim.x + threadIdx.x]);
+
+		// rows 0 and 1, then rows 2..7
+		reduceDuplex(state, thread, threads);
+		reduceDuplexRowSetup(1, 0, 2, state, thread, threads);
+		reduceDuplexRowSetup(2, 1, 3, state, thread, threads);
+		reduceDuplexRowSetup(3, 0, 4, state, thread, threads);
+		reduceDuplexRowSetup(4, 3, 5, state, thread, threads);
+		reduceDuplexRowSetup(5, 2, 6, state, thread, threads);
+		reduceDuplexRowSetup(6, 1, 7, state, thread, threads);
+
+		uint32_t rowa;
+		uint32_t prev = 7;
+		uint32_t iterator = 0;
+
+		// Eight passes over the rows. Even passes advance the output row by +3,
+		// odd passes by -1 (7 == -1 modulo 8). The last pass stops one visit
+		// early because its final visit is done by reduceDuplexRowt_8_v2 below.
+		for (uint32_t pass = 0; pass < 8; pass++)
+		{
+			const uint32_t step = (pass & 1) ? 7 : 3;
+			const uint32_t visits = (pass == 7) ? 7 : 8;
+
+			for (uint32_t i = 0; i < visits; i++) {
+				// the row to mix in is chosen by lane 0's low state word
+				rowa = WarpShuffle(state[0].x, 0, 4) & 7;
+				reduceDuplexRowt(prev, rowa, iterator, state, thread, threads);
+				prev = iterator;
+				iterator = (iterator + step) & 7;
+			}
+		}
+
+		rowa = WarpShuffle(state[0].x, 0, 4) & 7;
+		reduceDuplexRowt_8_v2(prev,iterator,rowa, state, thread, threads);
+
+		#pragma unroll
+		for (uint32_t w = 0; w < 4; w++)
+			DMatrix[(w * threads + thread) * blockDim.x + threadIdx.x] = state[w];
+	}
+}
+
+// Stage 3: one thread per hash, 1-D launch.
+// Reloads the four uint2x4 state rows from DMatrix, runs 12 final rounds and
+// compares the fourth 64-bit word of the state with the fourth 64-bit word of
+// pTarget. Qualifying nonces are recorded in resNonces:
+//   resNonces[0] = smallest qualifying nonce,
+//   resNonces[1] = second smallest qualifying nonce.
+// Both slots are expected to hold UINT32_MAX on entry (the host memsets them
+// to 0xff before launching). g_hash is not written.
+__global__
+__launch_bounds__(64, 1)
+void lyra2Z_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash, uint32_t *resNonces)
+{
+	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
+
+	uint28 state[4];
+
+	if (thread < threads)
+	{
+		state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]);
+		state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]);
+		state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]);
+		state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]);
+
+		for (int i = 0; i < 12; i++)
+			round_lyra(state);
+
+		const uint32_t nonce = startNounce + thread;
+		if (((uint64_t*)state)[3] <= ((uint64_t*)pTarget)[3]) {
+			// atomicMin returns the value that was in slot 0 before this call.
+			// Whichever of (previous, nonce) did NOT end up in slot 0 is a
+			// candidate for slot 1. The former code read resNonces[0] without
+			// an atomic and lost a second nonce larger than the first one.
+			const uint32_t previous = atomicMin(&resNonces[0], nonce);
+			const uint32_t runnerUp = (previous > nonce) ? previous : nonce;
+			// if slot 0 was still empty, runnerUp is UINT32_MAX and this is a no-op
+			atomicMin(&resNonces[1], runnerUp);
+		}
+/*
+		g_hash[thread + threads * 0] = state[0].x;
+		g_hash[thread + threads * 1] = state[0].y;
+		g_hash[thread + threads * 2] = state[0].z;
+		g_hash[thread + threads * 3] = state[0].w;
+*/
+	}
+}
+#else
+#if __CUDA_ARCH__ < 350
+__device__ void* DMatrix;
+#endif
+// Empty kernel bodies for the #else side of the preceding architecture guard:
+// they keep the three symbols launched by lyra2Z_cpu_hash_32 defined when the
+// real implementations are compiled out.
+__global__ void lyra2Z_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
+__global__ void lyra2Z_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {}
+__global__ void lyra2Z_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash, uint32_t *resNonces) {}
+#endif
+
+// Per-GPU-thread initialisation.
+//   thr_id   - index into the d_GNonces / h_GNonces tables
+//   threads  - unused here
+//   d_matrix - device buffer allocated by the caller; its address is copied
+//              into the DMatrix device symbol so the kernels can reach it
+// Also allocates the two-slot nonce result buffer on the device and its
+// page-locked host counterpart. Return codes of the CUDA calls are not checked.
+__host__
+void lyra2Z_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix)
+{
+	const size_t nonceBytes = 2 * sizeof(uint32_t);
+
+	// the symbol receives the pointer value itself, not the data it points to
+	cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice);
+
+	cudaMalloc(&d_GNonces[thr_id], nonceBytes);
+	cudaMallocHost(&h_GNonces[thr_id], nonceBytes);
+}
+
+// Initialisation variant that does not set the DMatrix symbol: it only
+// allocates the two-slot nonce result buffer on the device and its page-locked
+// host counterpart for `thr_id`. `threads` is unused. Return codes of the CUDA
+// calls are not checked.
+__host__
+void lyra2Z_cpu_init_sm2(int thr_id, uint32_t threads)
+{
+	const size_t nonceBytes = 2 * sizeof(uint32_t);
+
+	cudaMalloc(&d_GNonces[thr_id], nonceBytes);
+	cudaMallocHost(&h_GNonces[thr_id], nonceBytes);
+}
+
+// Reads the two nonce result slots of `thr_id` back from the device and
+// returns slot `num` (0 or 1).
+// Returns UINT32_MAX when
+//   - `num` is not a valid slot index,
+//   - both slots hold the same value (nothing found, or no distinct second nonce),
+//   - the copy fails (the local buffer is pre-filled with 0xFF, so both slots
+//     then compare equal).
+__host__
+uint32_t lyra2Z_getSecNonce(int thr_id, int num)
+{
+	// only two slots exist; never index past results[]
+	if (num < 0 || num > 1)
+		return UINT32_MAX;
+
+	uint32_t results[2];
+	memset(results, 0xFF, sizeof(results));
+	cudaMemcpy(results, d_GNonces[thr_id], sizeof(results), cudaMemcpyDeviceToHost);
+	if (results[1] == results[0])
+		return UINT32_MAX;
+	return results[num];
+}
+
+// Copies the 32-byte target pointed to by pTargetIn into the pTarget constant
+// symbol that the kernels compare against. The return code is not checked.
+__host__
+void lyra2Z_setTarget(const void *pTargetIn)
+{
+	const size_t targetBytes = 8 * sizeof(uint32_t); // 32 bytes
+	cudaMemcpyToSymbol(pTarget, pTargetIn, targetBytes, 0, cudaMemcpyHostToDevice);
+}
+
+// Runs the kernels for `threads` nonces starting at startNounce and returns
+// the content of result slot 0 (UINT32_MAX when no nonce qualified).
+//   thr_id   - miner thread index; selects the device and the result buffers
+//   d_hash   - device buffer with the input hashes
+//   gtx750ti - selects the shared-memory reservation on the sm_50 / sm_35 path
+// Kernel selection by device_sm: >= 520 uses the three-stage kernels,
+// 500 and 350 use the *_sm5 kernels, everything else the single sm2 kernel.
+// No launch or API error is checked here.
+// NOTE(review): on sm_35 the block size stays TPB52 while the *_sm5 stage-2
+// kernel is bounded by TPB50 and gets 6144/8192 bytes of shared memory;
+// confirm TPB52 is not larger than TPB50.
+__host__
+uint32_t lyra2Z_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, bool gtx750ti)
+{
+	const int dev_id = device_map[thr_id % MAX_GPUS];
+	const int sm = device_sm[dev_id];
+
+	// mark both result slots as "nothing found"
+	cudaMemset(d_GNonces[thr_id], 0xff, 2 * sizeof(uint32_t));
+
+	uint32_t tpb;
+	switch (sm) {
+	case 200: tpb = TPB20; break;
+	case 500: tpb = TPB50; break;
+	default:  tpb = TPB52; break;
+	}
+
+	// stage 2: four lanes per hash, 2-D block of (4, tpb / 4)
+	const dim3 grid1((threads * 4 + tpb - 1) / tpb);
+	const dim3 block1(4, tpb >> 2);
+
+	// stages 1 and 3: one thread per hash, 64 threads per block
+	const dim3 grid2((threads + 64 - 1) / 64);
+	const dim3 block2(64);
+
+	// sm2 fallback: one thread per hash, tpb threads per block
+	const dim3 grid3((threads + tpb - 1) / tpb);
+	const dim3 block3(tpb);
+
+	if (sm >= 520)
+	{
+		const size_t matrixBytes = 24 * (8 - 0) * sizeof(uint2) * tpb;
+
+		lyra2Z_gpu_hash_32_1 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash);
+
+		lyra2Z_gpu_hash_32_2 <<< grid1, block1, matrixBytes >>> (threads, startNounce, d_hash);
+
+		lyra2Z_gpu_hash_32_3 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash, d_GNonces[thr_id]);
+	}
+	else if (sm == 500 || sm == 350)
+	{
+		// 8192 bytes are reserved to tune for 8 warps (GTX 750 Ti),
+		// 6144 bytes to tune for 10 warps (other devices)
+		const size_t shared_mem = gtx750ti ? 8192 : 6144;
+
+		lyra2Z_gpu_hash_32_1_sm5 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash);
+
+		lyra2Z_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, startNounce, (uint2*)d_hash);
+
+		lyra2Z_gpu_hash_32_3_sm5 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash, d_GNonces[thr_id]);
+	}
+	else
+	{
+		lyra2Z_gpu_hash_32_sm2 <<< grid3, block3 >>> (threads, startNounce, d_hash, d_GNonces[thr_id]);
+	}
+
+	// fetch the first result slot; this blocking copy also waits for the kernels
+	cudaMemcpy(h_GNonces[thr_id], d_GNonces[thr_id], 1 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	return *h_GNonces[thr_id];
+}
--- a/lyra2/cuda_lyra2Z_sm5.cuh
+++ b/lyra2/cuda_lyra2Z_sm5.cuh
@ -0,0 +1,819 @@
+#include <memory.h>
+
+#ifdef __INTELLISENSE__
+/* just for vstudio code colors */
+//#define __CUDA_ARCH__ 500
+#define __threadfence_block()
+#define __ldg(x) *(x)
+#define atomicMin(p,y) y
+#endif
+
+#include "cuda_helper.h"
+
+#define TPB50 32
+
+__constant__ uint32_t pTarget[8];
+
+// Add / xor / rotate mixing of four 64-bit words (each held as a uint2),
+// in four steps with rotation amounts 32, 24, 16 and 63.
+// Above compute 5.0 the first step writes the 32-bit rotation out by hand
+// (the x and y halves of a ^ d are swapped) and uses the ROR24 / ROR16 helpers;
+// the other path uses SWAPUINT2 and the generic ROR2 - presumably equivalent,
+// verify against the helper definitions in cuda_helper.h.
+static __device__ __forceinline__
+void Gfunc(uint2 & a, uint2 &b, uint2 &c, uint2 &d)
+{
+#if __CUDA_ARCH__ > 500
+	a += b; uint2 tmp = d; d.y = a.x ^ tmp.x; d.x = a.y ^ tmp.y;
+	c += d; b ^= c; b = ROR24(b);
+	a += b; d ^= a; d = ROR16(d);
+	c += d; b ^= c; b = ROR2(b, 63);
+#else
+	a += b; d ^= a; d = SWAPUINT2(d);
+	c += d; b ^= c; b = ROR2(b, 24);
+	a += b; d ^= a; d = ROR2(d, 16);
+	c += d; b ^= c; b = ROR2(b, 63);
+#endif
+}
+
+#if __CUDA_ARCH__ == 500 || __CUDA_ARCH__ == 350
+#include "cuda_lyra2_vectors.h"
+
+#define Nrow 8
+#define Ncol 8
+#define memshift 3
+
+__device__ uint2 *DMatrix;
+
+// Loads word `index` of the calling thread from dynamic shared memory.
+// Layout: shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x],
+// i.e. consecutive threads of the block are adjacent for a given index.
+__device__ __forceinline__ uint2 LD4S(const int index)
+{
+	extern __shared__ uint2 shared_mem[];
+
+	const uint32_t slot = (index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
+	return shared_mem[slot];
+}
+
+// Stores `data` as word `index` of the calling thread in dynamic shared memory.
+// Layout: shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x],
+// matching LD4S.
+__device__ __forceinline__ void ST4S(const int index, const uint2 data)
+{
+	extern __shared__ uint2 shared_mem[];
+
+	const uint32_t slot = (index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x;
+	shared_mem[slot] = data;
+}
+
+#if __CUDA_ARCH__ == 300
+// Hardware shuffle: returns `a` as held by lane `b` of the caller's c-wide group.
+// NOTE(review): this branch sits inside `#if __CUDA_ARCH__ == 500 || __CUDA_ARCH__ == 350`,
+// so its `== 300` condition looks unreachable in this header - confirm.
+__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
+{
+	const uint32_t fetched = __shfl(a, b, c);
+	return fetched;
+}
+
+// Hardware shuffle of a 64-bit value held as uint2: both 32-bit halves are
+// fetched from lane `b` of the caller's c-wide group.
+__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
+{
+	uint2 fetched;
+	fetched.x = __shfl(a.x, b, c);
+	fetched.y = __shfl(a.y, b, c);
+	return fetched;
+}
+
+// Hardware shuffle of three uint2 values in place: aN is replaced by the aN
+// held by lane bN of the caller's c-wide group.
+__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c)
+{
+	a1 = make_uint2(__shfl(a1.x, b1, c), __shfl(a1.y, b1, c));
+	a2 = make_uint2(__shfl(a2.x, b2, c), __shfl(a2.y, b2, c));
+	a3 = make_uint2(__shfl(a3.x, b3, c), __shfl(a3.y, b3, c));
+}
+#else // != 300
+
+// 32-bit lane exchange emulated through dynamic shared memory (viewed as an
+// array of uint32_t). The thread publishes `a` in its own slot, reads the slot
+// of lane (b & (c - 1)) of its c-wide group and restores the slot's previous
+// content afterwards. The mask arithmetic requires c to be a power of two.
+// NOTE(review): only __threadfence_block() separates the write from the read;
+// this appears to rely on the lanes of a group running in lockstep - confirm.
+__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c)
+{
+	extern __shared__ uint2 shared_mem[];
+
+	const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
+	uint32_t *_ptr = (uint32_t*)shared_mem;
+
+	__threadfence_block();
+	// save whatever currently lives in the scratch slot
+	uint32_t buf = _ptr[thread];
+
+	_ptr[thread] = a;
+	__threadfence_block();
+	uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))];
+
+	__threadfence_block();
+	// put the saved content back
+	_ptr[thread] = buf;
+
+	__threadfence_block();
+	return result;
+}
+
+// uint2 lane exchange emulated through dynamic shared memory. The thread
+// publishes `a` in its own slot, reads the slot of lane (b & (c - 1)) of its
+// c-wide group and restores the slot's previous content afterwards.
+// The mask arithmetic requires c to be a power of two.
+// NOTE(review): only __threadfence_block() separates the write from the read;
+// this appears to rely on the lanes of a group running in lockstep - confirm.
+__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c)
+{
+	extern __shared__ uint2 shared_mem[];
+
+	const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
+
+	__threadfence_block();
+	// save whatever currently lives in the scratch slot
+	uint2 buf = shared_mem[thread];
+
+	shared_mem[thread] = a;
+	__threadfence_block();
+	uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))];
+
+	__threadfence_block();
+	// put the saved content back
+	shared_mem[thread] = buf;
+
+	__threadfence_block();
+	return result;
+}
+
+// Three-value lane exchange emulated through dynamic shared memory.
+// Each thread publishes a1, a2 and a3 in turn in its own slot and reads back
+// the slot of lane (bN & (c - 1)) inside its c-wide group. The mask arithmetic
+// requires c to be a power of two. The previous content of the slot is saved
+// first and restored last, so the scratch slot is left as it was found.
+// NOTE(review): only __threadfence_block() separates each write from the
+// following read; this appears to rely on the lanes of a group running in
+// lockstep inside one warp - confirm.
+__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c)
+{
+	extern __shared__ uint2 shared_mem[];
+
+	const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x;
+
+	__threadfence_block();
+	// save whatever currently lives in the scratch slot
+	uint2 buf = shared_mem[thread];
+
+	shared_mem[thread] = a1;
+	__threadfence_block();
+	a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))];
+	__threadfence_block();
+	shared_mem[thread] = a2;
+	__threadfence_block();
+	a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))];
+	__threadfence_block();
+	shared_mem[thread] = a3;
+	__threadfence_block();
+	a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))];
+
+	__threadfence_block();
+	// put the saved content back
+	shared_mem[thread] = buf;
+	__threadfence_block();
+}
+
+#endif // != 300
+
+// One sponge round with the state spread over a group of four lanes; every
+// lane owns the four words s[0..3]. Gfunc is applied to the words as they
+// are, then s[1], s[2], s[3] are rotated by 1, 2 and 3 lanes so that the
+// second Gfunc call mixes a different combination of words, and finally the
+// rotation is undone.
+__device__ __forceinline__ void round_lyra(uint2 s[4])
+{
+	const uint32_t lane = threadIdx.x;
+
+	// first mixing step on the lane-local words
+	Gfunc(s[0], s[1], s[2], s[3]);
+
+	// rotate s[1] / s[2] / s[3] by 1 / 2 / 3 lanes inside the group of 4
+	WarpShuffle3(s[1], s[2], s[3], lane + 1, lane + 2, lane + 3, 4);
+
+	// second mixing step on the rotated words
+	Gfunc(s[0], s[1], s[2], s[3]);
+
+	// rotate back by 3 / 2 / 1 lanes (the inverse modulo 4)
+	WarpShuffle3(s[1], s[2], s[3], lane + 3, lane + 2, lane + 1, 4);
+}
+
+// One sponge round on a state held entirely by a single thread
+// (four uint2x4 rows, i.e. a 4x4 matrix of 64-bit words):
+// Gfunc on the four columns, then Gfunc on the four diagonals.
+static __device__ __forceinline__
+void round_lyra(uint2x4* s)
+{
+	uint2x4 &r0 = s[0];
+	uint2x4 &r1 = s[1];
+	uint2x4 &r2 = s[2];
+	uint2x4 &r3 = s[3];
+
+	// columns
+	Gfunc(r0.x, r1.x, r2.x, r3.x);
+	Gfunc(r0.y, r1.y, r2.y, r3.y);
+	Gfunc(r0.z, r1.z, r2.z, r3.z);
+	Gfunc(r0.w, r1.w, r2.w, r3.w);
+
+	// diagonals
+	Gfunc(r0.x, r1.y, r2.z, r3.w);
+	Gfunc(r0.y, r1.z, r2.w, r3.x);
+	Gfunc(r0.z, r1.w, r2.x, r3.y);
+	Gfunc(r0.w, r1.x, r2.y, r3.z);
+}
+
+// Builds all eight matrix rows for the sm_50 / sm_35 path.
+// Row 0 is kept in dynamic shared memory (LD4S / ST4S) until the "3, 0, 4"
+// step moves it to global memory; rows 1..7 live in DMatrix.
+// DMatrix element address used throughout:
+//   ((memshift * Ncol * row + memshift * col + j) * threads + thread) * blockDim.x + threadIdx.x
+// psN is that address for row N, column 0, word 0.
+// The "a, b, c" comments below name (rowIn, rowInOut, rowOut) of each step:
+// the step absorbs rowIn + rowInOut, writes rowIn ^ state to rowOut in reverse
+// column order and XORs the word-rotated state back into rowInOut.
+static __device__ __forceinline__
+void reduceDuplexV5(uint2 state[4], const uint32_t thread, const uint32_t threads)
+{
+	uint2 state1[3], state2[3];
+
+	const uint32_t ps0 = (memshift * Ncol * 0 * threads + thread)*blockDim.x + threadIdx.x;
+	const uint32_t ps1 = (memshift * Ncol * 1 * threads + thread)*blockDim.x + threadIdx.x;
+	const uint32_t ps2 = (memshift * Ncol * 2 * threads + thread)*blockDim.x + threadIdx.x;
+	const uint32_t ps3 = (memshift * Ncol * 3 * threads + thread)*blockDim.x + threadIdx.x;
+	const uint32_t ps4 = (memshift * Ncol * 4 * threads + thread)*blockDim.x + threadIdx.x;
+	const uint32_t ps5 = (memshift * Ncol * 5 * threads + thread)*blockDim.x + threadIdx.x;
+	const uint32_t ps6 = (memshift * Ncol * 6 * threads + thread)*blockDim.x + threadIdx.x;
+	const uint32_t ps7 = (memshift * Ncol * 7 * threads + thread)*blockDim.x + threadIdx.x;
+
+	// row 0 (shared memory): store the state, columns in reverse order
+	for (int i = 0; i < 8; i++)
+	{
+		const uint32_t s0 = memshift * Ncol * 0 + (Ncol - 1 - i) * memshift;
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			ST4S(s0 + j, state[j]);
+		round_lyra(state);
+	}
+
+	// row 1 (global): row 0 column ^ state, columns in reverse order
+	for (int i = 0; i < 8; i++)
+	{
+		const uint32_t s0 = memshift * Ncol * 0 + i * memshift;
+		const uint32_t s1 = ps1 + (7 - i)*memshift* threads*blockDim.x;
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			state1[j] = LD4S(s0 + j);
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			state[j] ^= state1[j];
+
+		round_lyra(state);
+
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			*(DMatrix + s1 + j*threads*blockDim.x) = state1[j] ^ state[j];
+	}
+
+	// 1, 0, 2
+	for (int i = 0; i < 8; i++)
+	{
+		const uint32_t s0 = memshift * Ncol * 0 + i * memshift;
+		const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x;
+		const uint32_t s2 = ps2 + (7 - i)*memshift* threads*blockDim.x;
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			state1[j] = *(DMatrix + s1 + j*threads*blockDim.x);
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			state2[j] = LD4S(s0 + j);
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			state[j] ^= state1[j] + state2[j];
+
+		round_lyra(state);
+
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			*(DMatrix + s2 + j*threads*blockDim.x) = state1[j] ^ state[j];
+
+		// Receive data from the previous thread (and at the same time send data to the next thread)
+		uint2 Data0 = state[0];
+		uint2 Data1 = state[1];
+		uint2 Data2 = state[2];
+		WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
+
+		if (threadIdx.x == 0)
+		{
+			state2[0] ^= Data2;
+			state2[1] ^= Data0;
+			state2[2] ^= Data1;
+		}
+		else
+		{
+			state2[0] ^= Data0;
+			state2[1] ^= Data1;
+			state2[2] ^= Data2;
+		}
+
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			ST4S(s0 + j, state2[j]);
+	}
+
+	// 2, 1, 3
+	for (int i = 0; i < 8; i++)
+	{
+		const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x;
+		const uint32_t s2 = ps2 + i * memshift* threads*blockDim.x;
+		const uint32_t s3 = ps3 + (7 - i)*memshift* threads*blockDim.x;
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			state1[j] = *(DMatrix + s2 + j*threads*blockDim.x);
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			state2[j] = *(DMatrix + s1 + j*threads*blockDim.x);
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			state[j] ^= state1[j] + state2[j];
+
+		round_lyra(state);
+
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			*(DMatrix + s3 + j*threads*blockDim.x) = state1[j] ^ state[j];
+
+		// Receive data from the previous thread (and at the same time send data to the next thread)
+		uint2 Data0 = state[0];
+		uint2 Data1 = state[1];
+		uint2 Data2 = state[2];
+		WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
+
+		if (threadIdx.x == 0)
+		{
+			state2[0] ^= Data2;
+			state2[1] ^= Data0;
+			state2[2] ^= Data1;
+		} else  {
+			state2[0] ^= Data0;
+			state2[1] ^= Data1;
+			state2[2] ^= Data2;
+		}
+
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			*(DMatrix + s1 + j*threads*blockDim.x) = state2[j];
+	}
+
+	// 3, 0, 4
+	// row 0 is read from shared memory (ls0) and written back to global memory (s0)
+	for (int i = 0; i < 8; i++)
+	{
+		const uint32_t ls0 = memshift * Ncol * 0 + i * memshift;
+		const uint32_t s0 = ps0 + i * memshift* threads*blockDim.x;
+		const uint32_t s3 = ps3 + i * memshift* threads*blockDim.x;
+		const uint32_t s4 = ps4 + (7 - i)*memshift* threads*blockDim.x;
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			state1[j] = *(DMatrix + s3 + j*threads*blockDim.x);
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			state2[j] = LD4S(ls0 + j);
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			state[j] ^= state1[j] + state2[j];
+
+		round_lyra(state);
+
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			*(DMatrix + s4 + j*threads*blockDim.x) = state1[j] ^ state[j];
+
+		// Receive data from the previous thread (and at the same time send data to the next thread)
+		uint2 Data0 = state[0];
+		uint2 Data1 = state[1];
+		uint2 Data2 = state[2];
+		WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
+
+		if (threadIdx.x == 0)
+		{
+			state2[0] ^= Data2;
+			state2[1] ^= Data0;
+			state2[2] ^= Data1;
+		} else {
+			state2[0] ^= Data0;
+			state2[1] ^= Data1;
+			state2[2] ^= Data2;
+		}
+
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			*(DMatrix + s0 + j*threads*blockDim.x) = state2[j];
+	}
+
+	// 4, 3, 5
+	for (int i = 0; i < 8; i++)
+	{
+		const uint32_t s3 = ps3 + i * memshift* threads*blockDim.x;
+		const uint32_t s4 = ps4 + i * memshift* threads*blockDim.x;
+		const uint32_t s5 = ps5 + (7 - i)*memshift* threads*blockDim.x;
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			state1[j] = *(DMatrix + s4 + j*threads*blockDim.x);
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			state2[j] = *(DMatrix + s3 + j*threads*blockDim.x);
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			state[j] ^= state1[j] + state2[j];
+
+		round_lyra(state);
+
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			*(DMatrix + s5 + j*threads*blockDim.x) = state1[j] ^ state[j];
+
+		// Receive data from the previous thread (and at the same time send data to the next thread)
+		uint2 Data0 = state[0];
+		uint2 Data1 = state[1];
+		uint2 Data2 = state[2];
+		WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
+
+		if (threadIdx.x == 0)
+		{
+			state2[0] ^= Data2;
+			state2[1] ^= Data0;
+			state2[2] ^= Data1;
+		}
+		else
+		{
+			state2[0] ^= Data0;
+			state2[1] ^= Data1;
+			state2[2] ^= Data2;
+		}
+
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			*(DMatrix + s3 + j*threads*blockDim.x) = state2[j];
+	}
+
+	// 5, 2, 6
+	for (int i = 0; i < 8; i++)
+	{
+		const uint32_t s2 = ps2 + i * memshift* threads*blockDim.x;
+		const uint32_t s5 = ps5 + i * memshift* threads*blockDim.x;
+		const uint32_t s6 = ps6 + (7 - i)*memshift* threads*blockDim.x;
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			state1[j] = *(DMatrix + s5 + j*threads*blockDim.x);
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			state2[j] = *(DMatrix + s2 + j*threads*blockDim.x);
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			state[j] ^= state1[j] + state2[j];
+
+		round_lyra(state);
+
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			*(DMatrix + s6 + j*threads*blockDim.x) = state1[j] ^ state[j];
+
+		// Receive data from the previous thread (and at the same time send data to the next thread)
+		uint2 Data0 = state[0];
+		uint2 Data1 = state[1];
+		uint2 Data2 = state[2];
+		WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
+
+		if (threadIdx.x == 0)
+		{
+			state2[0] ^= Data2;
+			state2[1] ^= Data0;
+			state2[2] ^= Data1;
+		}
+		else
+		{
+			state2[0] ^= Data0;
+			state2[1] ^= Data1;
+			state2[2] ^= Data2;
+		}
+
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			*(DMatrix + s2 + j*threads*blockDim.x) = state2[j];
+	}
+
+	// 6, 1, 7
+	for (int i = 0; i < 8; i++)
+	{
+		const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x;
+		const uint32_t s6 = ps6 + i * memshift* threads*blockDim.x;
+		const uint32_t s7 = ps7 + (7 - i)*memshift* threads*blockDim.x;
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			state1[j] = *(DMatrix + s6 + j*threads*blockDim.x);
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			state2[j] = *(DMatrix + s1 + j*threads*blockDim.x);
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			state[j] ^= state1[j] + state2[j];
+
+		round_lyra(state);
+
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			*(DMatrix + s7 + j*threads*blockDim.x) = state1[j] ^ state[j];
+
+		// Receive data from the previous thread (and at the same time send data to the next thread)
+		uint2 Data0 = state[0];
+		uint2 Data1 = state[1];
+		uint2 Data2 = state[2];
+		WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
+
+		if (threadIdx.x == 0)
+		{
+			state2[0] ^= Data2;
+			state2[1] ^= Data0;
+			state2[2] ^= Data1;
+		} else {
+			state2[0] ^= Data0;
+			state2[1] ^= Data1;
+			state2[2] ^= Data2;
+		}
+
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			*(DMatrix + s1 + j*threads*blockDim.x) = state2[j];
+	}
+}
+
+// Row update on the global-memory matrix (sm_50 / sm_35 path).
+// For every column i it absorbs (rowIn[i] + rowInOut[i]) into the state, runs
+// one round, stores rowInOut[i] ^ (state rotated by one 64-bit word) and XORs
+// the unrotated state into rowOut[i].
+// In the final loop the rowInOut store precedes the rowOut read-modify-write,
+// so when both name the same row the freshly stored value is the one updated.
+// psN is the DMatrix address of column 0, word 0 of the respective row.
+static __device__ __forceinline__
+void reduceDuplexRowV50(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads)
+{
+	const uint32_t ps1 = (memshift * Ncol * rowIn*threads + thread)*blockDim.x + threadIdx.x;
+	const uint32_t ps2 = (memshift * Ncol * rowInOut *threads + thread)*blockDim.x + threadIdx.x;
+	const uint32_t ps3 = (memshift * Ncol * rowOut*threads + thread)*blockDim.x + threadIdx.x;
+
+	#pragma unroll 1
+	for (int i = 0; i < 8; i++)
+	{
+		uint2 state1[3], state2[3];
+
+		const uint32_t s1 = ps1 + i*memshift*threads *blockDim.x;
+		const uint32_t s2 = ps2 + i*memshift*threads *blockDim.x;
+		const uint32_t s3 = ps3 + i*memshift*threads *blockDim.x;
+
+		#pragma unroll
+		for (int j = 0; j < 3; j++) {
+			state1[j] = *(DMatrix + s1 + j*threads*blockDim.x);
+			state2[j] = *(DMatrix + s2 + j*threads*blockDim.x);
+		}
+
+		#pragma unroll
+		for (int j = 0; j < 3; j++) {
+			state1[j] += state2[j];
+			state[j] ^= state1[j];
+		}
+
+		round_lyra(state);
+
+		// Receive data from the previous thread (and at the same time send data to the next thread)
+		uint2 Data0 = state[0];
+		uint2 Data1 = state[1];
+		uint2 Data2 = state[2];
+		WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
+
+		// Lane 0 receives from lane 3 and additionally shifts the words by one,
+		// so the 12 words (3 per lane x 4 lanes) end up rotated by one position.
+		if (threadIdx.x == 0)
+		{
+			state2[0] ^= Data2;
+			state2[1] ^= Data0;
+			state2[2] ^= Data1;
+		} else {
+			state2[0] ^= Data0;
+			state2[1] ^= Data1;
+			state2[2] ^= Data2;
+		}
+
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+		{
+			*(DMatrix + s2 + j*threads*blockDim.x) = state2[j];
+			*(DMatrix + s3 + j*threads*blockDim.x) ^= state[j];
+		}
+	}
+}
+
+// Final row step with the input row hard-coded to 2 and the aliasing test
+// hard-coded to row 5. Nothing is written to DMatrix; only the would-be new
+// value of column 0 of rowInOut is tracked (in `last`) and XORed into the
+// state at the end.
+// NOTE(review): reduceDuplexRowV50_8_v2 takes both rows as parameters; whether
+// this fixed-row variant is still called cannot be seen from here - confirm.
+static __device__ __forceinline__
+void reduceDuplexRowV50_8(const int rowInOut, uint2 state[4], const uint32_t thread, const uint32_t threads)
+{
+	const uint32_t ps1 = (memshift * Ncol * 2*threads + thread)*blockDim.x + threadIdx.x;
+	const uint32_t ps2 = (memshift * Ncol * rowInOut *threads + thread)*blockDim.x + threadIdx.x;
+	// const uint32_t ps3 = (memshift * Ncol * 5*threads + thread)*blockDim.x + threadIdx.x;
+
+	uint2 state1[3], last[3];
+
+	#pragma unroll
+	for (int j = 0; j < 3; j++) {
+		state1[j] = *(DMatrix + ps1 + j*threads*blockDim.x);
+		last[j] = *(DMatrix + ps2 + j*threads*blockDim.x);
+	}
+
+	#pragma unroll
+	for (int j = 0; j < 3; j++) {
+		state1[j] += last[j];
+		state[j] ^= state1[j];
+	}
+
+	round_lyra(state);
+
+	// Receive data from the previous thread (and at the same time send data to the next thread)
+	uint2 Data0 = state[0];
+	uint2 Data1 = state[1];
+	uint2 Data2 = state[2];
+	WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
+
+	if (threadIdx.x == 0)
+	{
+		last[0] ^= Data2;
+		last[1] ^= Data0;
+		last[2] ^= Data1;
+	} else {
+		last[0] ^= Data0;
+		last[1] ^= Data1;
+		last[2] ^= Data2;
+	}
+
+	if (rowInOut == 5)
+	{
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			last[j] ^= state[j];
+	}
+
+	// remaining columns: absorb only, no write-back
+	for (int i = 1; i < 8; i++)
+	{
+		const uint32_t s1 = ps1 + i*memshift*threads *blockDim.x;
+		const uint32_t s2 = ps2 + i*memshift*threads *blockDim.x;
+
+		#pragma unroll
+		for (int j = 0; j < 3; j++)
+			state[j] ^= *(DMatrix + s1 + j*threads*blockDim.x) + *(DMatrix + s2 + j*threads*blockDim.x);
+
+		round_lyra(state);
+	}
+
+	#pragma unroll
+	for (int j = 0; j < 3; j++)
+		state[j] ^= last[j];
+}
+
+// Final row step on the global-memory matrix: same absorb/round sequence as
+// reduceDuplexRowV50, but nothing is written to DMatrix. Only the would-be new
+// value of column 0 of rowInOut is tracked (in `last`) and XORed into the
+// state at the end.
+// Note the parameter order (rowIn, rowOut, rowInOut) differs from
+// reduceDuplexRowV50 (rowIn, rowInOut, rowOut).
+static __device__ __forceinline__
+void reduceDuplexRowV50_8_v2(const int rowIn, const int rowOut,const int rowInOut, uint2 state[4], const uint32_t thread, const uint32_t threads)
+{
+	const uint32_t ps1 = (memshift * Ncol * rowIn * threads + thread)*blockDim.x + threadIdx.x;
+	const uint32_t ps2 = (memshift * Ncol * rowInOut *threads + thread)*blockDim.x + threadIdx.x;
+	// const uint32_t ps3 = (memshift * Ncol * 5*threads + thread)*blockDim.x + threadIdx.x;
+
+	uint2 state1[3], last[3];
+
+	#pragma unroll
+	for (int j = 0; j < 3; j++) {
+		state1[j] = *(DMatrix + ps1 + j*threads*blockDim.x);
+		last[j] = *(DMatrix + ps2 + j*threads*blockDim.x);
+	}
+
+	#pragma unroll
+	for (int j = 0; j < 3; j++) {
+		state1[j] += last[j];
+		state[j] ^= state1[j];
+	}
+
+	round_lyra(state);
+
+	// Receive data from the previous thread (and at the same time send data to the next thread)
+	uint2 Data0 = state[0];
+	uint2 Data1 = state[1];
+	uint2 Data2 = state[2];
+	WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4);
+
+	// Lane 0 receives from lane 3 and additionally shifts the words by one,
+	// so the 12 words (3 per lane x 4 lanes) end up rotated by one position.
+	if (threadIdx.x == 0)
+	{
+		last[0] ^= Data2;
+		last[1] ^= Data0;
+		last[2] ^= Data1;
+	}
+	else {
+		last[0] ^= Data0;
+		last[1] ^= Data1;
+		last[2] ^= Data2;
+	}
+
+	// when the output row is the same row, it would also have received the state
+	if (rowInOut == rowOut)
+	{
+#pragma unroll
+		for (int j = 0; j < 3; j++)
+			last[j] ^= state[j];
+	}
+
+	// remaining columns: absorb only, no write-back
+	for (int i = 1; i < 8; i++)
+	{
+		const uint32_t s1 = ps1 + i*memshift*threads *blockDim.x;
+		const uint32_t s2 = ps2 + i*memshift*threads *blockDim.x;
+
+#pragma unroll
+		for (int j = 0; j < 3; j++)
+			state[j] ^= *(DMatrix + s1 + j*threads*blockDim.x) + *(DMatrix + s2 + j*threads*blockDim.x);
+
+		round_lyra(state);
+	}
+
+
+#pragma unroll
+	for (int j = 0; j < 3; j++)
+		state[j] ^= last[j];
+
+}
+
+
+// Stage 1 for the sm_50 / sm_35 path: one thread per hash, 1-D launch.
+// Reads four uint2 words per thread from g_hash (stride `threads`) into
+// state[0], copies state[0] to state[1], fills state[2..3] with blake2b_IV,
+// runs 12 rounds, XORs Mask into state[0..1], runs 12 more rounds and stores
+// the four uint2x4 rows to DMatrix (stride `threads`).
+// startNounce is not used by this stage.
+__global__ __launch_bounds__(64, 1)
+void lyra2Z_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+
+	// initial values for state[2..3], listed as { low, high } 32-bit pairs
+	const uint2x4 blake2b_IV[2] = {
+		{ { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } },
+		{ { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } }
+	};
+	// constant block XORed into state[0..1] between the two 12-round passes
+	const uint2x4 Mask[2] = {
+		0x00000020UL, 0x00000000UL, 0x00000020UL, 0x00000000UL,
+		0x00000020UL, 0x00000000UL, 0x00000008UL, 0x00000000UL,
+		0x00000008UL, 0x00000000UL, 0x00000008UL, 0x00000000UL,
+		0x00000080UL, 0x00000000UL, 0x00000000UL, 0x01000000UL
+	};
+	if (thread < threads)
+	{
+		uint2x4 state[4];
+
+		// the four uint2 words written here make up state[0]
+		((uint2*)state)[0] = __ldg(&g_hash[thread]);
+		((uint2*)state)[1] = __ldg(&g_hash[thread + threads]);
+		((uint2*)state)[2] = __ldg(&g_hash[thread + threads * 2]);
+		((uint2*)state)[3] = __ldg(&g_hash[thread + threads * 3]);
+
+		state[1] = state[0];
+		state[2] = blake2b_IV[0];
+		state[3] = blake2b_IV[1];
+
+		for (int i = 0; i < 12; i++)
+			round_lyra(state); //because 12 is not enough
+
+		state[0] ^= Mask[0];
+		state[1] ^= Mask[1];
+
+		for (int i = 0; i < 12; i++)
+			round_lyra(state); //because 12 is not enough
+
+
+		// hand the state over to stage 2 through DMatrix
+		((uint2x4*)DMatrix)[0 * threads + thread] = state[0];
+		((uint2x4*)DMatrix)[1 * threads + thread] = state[1];
+		((uint2x4*)DMatrix)[2 * threads + thread] = state[2];
+		((uint2x4*)DMatrix)[3 * threads + thread] = state[3];
+	}
+}
+
+/*
+ * Second stage: the nonce index comes from the y dimension of the block while
+ * threadIdx.x selects one uint2 slice of the state; values are exchanged with
+ * shuffles of width 4. Loads the state left in DMatrix, runs the row
+ * reductions and stores the state back to the same DMatrix slots.
+ * startNounce and g_hash are not used by this kernel.
+ */
+__global__ __launch_bounds__(TPB50, 1)
+void lyra2Z_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
+{
+	const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y);
+
+	if (thread < threads)
+	{
+		uint2 state[4];
+
+		#pragma unroll
+		for (int k = 0; k < 4; k++)
+			state[k] = __ldg(&DMatrix[(k * threads + thread)*blockDim.x + threadIdx.x]);
+
+		reduceDuplexV5(state, thread, threads);
+
+		uint32_t rowa;
+		uint32_t prev = 7;
+		uint32_t iterator = 0;
+
+		// Eight passes over the rows. Even passes advance the row index by +3,
+		// odd passes by -1 (both modulo 8). The last pass stops one row early
+		// because its final row is handled by reduceDuplexRowV50_8_v2 below.
+		for (uint32_t pass = 0; pass < 8; pass++) {
+			const bool stepBack = (pass & 1) != 0;
+			const uint32_t rows = (pass == 7) ? 7 : 8;
+			for (uint32_t i = 0; i < rows; i++) {
+				// pseudo-random row taken from the low bits of state[0].x
+				// as held by the first thread of each group of 4
+				rowa = WarpShuffle(state[0].x, 0, 4) & 7;
+				reduceDuplexRowV50(prev, rowa, iterator, state, thread, threads);
+				prev = iterator;
+				iterator = stepBack ? ((iterator - 1) & 7) : ((iterator + 3) & 7);
+			}
+		}
+
+		rowa = WarpShuffle(state[0].x, 0, 4) & 7;
+		reduceDuplexRowV50_8_v2(prev,iterator,rowa, state, thread, threads);
+
+		#pragma unroll
+		for (int k = 0; k < 4; k++)
+			DMatrix[(k * threads + thread)*blockDim.x + threadIdx.x] = state[k];
+	}
+}
+
+/*
+ * Final stage: one thread per nonce. Reloads the state from DMatrix, applies
+ * 12 more round_lyra() calls and compares the 4th 64-bit word of the state
+ * with the 4th 64-bit word of pTarget. A nonce that passes is merged into
+ * resNonces[0] with atomicMin, after the value previously read from
+ * resNonces[0] has been merged into resNonces[1].
+ * g_hash is not used by this kernel.
+ */
+__global__ __launch_bounds__(64, 1)
+void lyra2Z_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash, uint32_t *resNonces)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+
+	if (thread >= threads)
+		return;
+
+	uint2x4 state[4];
+	uint2x4 *planes = (uint2x4*)DMatrix;
+
+	#pragma unroll
+	for (int k = 0; k < 4; k++)
+		state[k] = __ldg4(&planes[k * threads + thread]);
+
+	for (int r = 0; r < 12; r++)
+		round_lyra(state);
+
+	if (((uint64_t*)state)[3] <= ((uint64_t*)pTarget)[3]) {
+		const uint32_t nonce = startNounce + thread;
+		atomicMin(&resNonces[1], resNonces[0]);
+		atomicMin(&resNonces[0], nonce);
+	}
+}
+
+#else
+/* if __CUDA_ARCH__ != 500 .. host
+ * Empty kernels with the same names and parameter lists as the three kernels
+ * defined in the other branch of this conditional, so that the symbols exist
+ * in both branches. */
+__global__ void lyra2Z_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
+__global__ void lyra2Z_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
+__global__ void lyra2Z_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash, uint32_t *resNonces) {}
+#endif
--- a/lyra2/cuda_lyra2_vectors.h
+++ b/lyra2/cuda_lyra2_vectors.h
@ -36,11 +36,11 @@ typedef struct __align__(128) ulonglong8to16 {
 	ulonglong2to8 lo, hi;
 } ulonglong8to16;

-typedef struct __align__(256) ulonglong16to32 {
+typedef struct __align__(128) ulonglong16to32{
 	ulonglong8to16 lo, hi;
 } ulonglong16to32;

-typedef struct __align__(512) ulonglong32to64 {
+typedef struct __align__(128) ulonglong32to64{
 	ulonglong16to32 lo, hi;
 } ulonglong32to64;

@ -79,7 +79,7 @@ struct __align__(128) ulong8 {
 };
 typedef __device_builtin__ struct ulong8 ulong8;

-typedef struct __align__(256) ulonglong16 {
+typedef struct __align__(128) ulonglong16{
 	ulonglong4 s0, s1, s2, s3, s4, s5, s6, s7;
 } ulonglong16;

@ -92,7 +92,7 @@ typedef struct __builtin_align__(32) uint48 {
 		uint4 s0,s1;
 } uint48;

-typedef struct __align__(256) uint4x16 {
+typedef struct __builtin_align__(128) uint4x16{
 	uint4 s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
 } uint4x16;

--- a/lyra2/lyra2Z.cu
+++ b/lyra2/lyra2Z.cu
@ -0,0 +1,164 @@
+extern "C" {
+#include <sph/sph_blake.h>
+#include "Lyra2Z.h"
+}
+
+#include <miner.h>
+#include <cuda_helper.h>
+
+static uint64_t* d_hash[MAX_GPUS]; // per-thr_id device buffer, 32 bytes per nonce (allocated in scanhash_lyra2Z)
+static uint64_t* d_matrix[MAX_GPUS]; // per-thr_id device matrix buffer, only allocated when device_sm >= 350
+
+extern void blake256_cpu_init(int thr_id, uint32_t threads);
+extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
+extern void blake256_cpu_setBlock_80(uint32_t *pdata);
+
+extern void lyra2Z_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix); // used for device_sm >= 350
+extern void lyra2Z_cpu_init_sm2(int thr_id, uint32_t threads); // used for device_sm < 350
+extern uint32_t lyra2Z_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, bool gtx750ti); // result is compared against UINT32_MAX by the caller (presumably "no nonce found" - confirm)
+
+extern void lyra2Z_setTarget(const void *ptarget);
+extern uint32_t lyra2Z_getSecNonce(int thr_id, int num); // caller treats UINT32_MAX as "no second nonce"
+
+/*
+ * CPU version of the hash used to validate GPU results: blake256 with
+ * 14 rounds over the first 80 bytes of `input`, then LYRA2Z keyed and salted
+ * with that digest. Writes 32 bytes to `state`.
+ */
+extern "C" void lyra2Z_hash(void *state, const void *input)
+{
+	uint32_t _ALIGN(64) blakeDigest[8], lyraDigest[8];
+	sph_blake256_context blakeCtx;
+
+	// stage 1: blake256
+	sph_blake256_set_rounds(14);
+	sph_blake256_init(&blakeCtx);
+	sph_blake256(&blakeCtx, input, 80);
+	sph_blake256_close(&blakeCtx, blakeDigest);
+
+	// stage 2: LYRA2Z, the blake digest is passed as both of its inputs
+	LYRA2Z(lyraDigest, 32, blakeDigest, 32, blakeDigest, 32, 8, 8, 8);
+
+	memcpy(state, lyraDigest, sizeof(lyraDigest));
+}
+
+static bool init[MAX_GPUS] = { 0 }; // set once scanhash_lyra2Z has allocated the device buffers for a thr_id; cleared by free_lyra2Z
+static __thread uint32_t throughput = 0; // nonces hashed per GPU batch, chosen on first call of scanhash_lyra2Z
+static __thread bool gtx750ti = false; // true when the device name contains "750 Ti"
+
+/*
+ * Scan nonces for lyra2z on the GPU mapped to thr_id.
+ *
+ * Starts at work->data[19] and processes `throughput` nonces per batch until
+ * a candidate validates on the CPU, max_nonce is reached or a work restart is
+ * requested. On first use per thread it selects the intensity and allocates
+ * the device buffers.
+ *
+ * work         in/out: data[19] is the nonce cursor; nonces[], valid_nonces
+ *              and the target ratios are filled in when a solution is found
+ * max_nonce    upper bound of the range to scan
+ * hashes_done  out: number of nonces covered by this call
+ *
+ * Returns the number of valid nonces found (work->valid_nonces), or 0.
+ */
+extern "C" int scanhash_lyra2Z(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+{
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	uint32_t _ALIGN(64) endiandata[20];
+	const uint32_t first_nonce = pdata[19];
+	int dev_id = device_map[thr_id];
+
+	if (opt_benchmark)
+		ptarget[7] = 0x00ff;
+
+	if (!init[thr_id])
+	{
+		cudaSetDevice(dev_id);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
+
+		int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 17 : 16;
+		if (device_sm[dev_id] <= 500) intensity = 15;
+		throughput = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4;
+
+		cudaDeviceProp props;
+		cudaGetDeviceProperties(&props, dev_id);
+		gtx750ti = (strstr(props.name, "750 Ti") != NULL);
+
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
+
+		blake256_cpu_init(thr_id, throughput);
+
+		if (device_sm[dev_id] >= 350)
+		{
+			// per-nonce matrix size depends on the kernel family used for this SM
+			size_t matrix_sz = device_sm[dev_id] > 500 ? sizeof(uint64_t) * 4 * 4 : sizeof(uint64_t) * 8 * 8 * 3 * 4;
+			CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput));
+			lyra2Z_cpu_init(thr_id, throughput, d_matrix[thr_id]);
+		}
+		else
+			lyra2Z_cpu_init_sm2(thr_id, throughput);
+
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));
+
+		init[thr_id] = true;
+	}
+
+	// big-endian copy of the header, used for the CPU validation below
+	for (int k=0; k < 20; k++)
+		be32enc(&endiandata[k], pdata[k]);
+
+	blake256_cpu_setBlock_80(pdata);
+	lyra2Z_setTarget(ptarget);
+
+	do {
+		int order = 0;
+
+		blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+
+		*hashes_done = pdata[19] - first_nonce + throughput;
+
+		work->nonces[0] = lyra2Z_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti);
+
+		if (work->nonces[0] != UINT32_MAX)
+		{
+			uint32_t _ALIGN(64) vhash[8];
+
+			// re-check the GPU candidate on the CPU
+			be32enc(&endiandata[19], work->nonces[0]);
+			lyra2Z_hash(vhash, endiandata);
+
+			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work->nonces[1] = lyra2Z_getSecNonce(thr_id, 1);
+				work_set_target_ratio(work, vhash);
+				pdata[19] = work->nonces[0] + 1;
+				if (work->nonces[1] != UINT32_MAX)
+				{
+					be32enc(&endiandata[19], work->nonces[1]);
+					lyra2Z_hash(vhash, endiandata);
+					if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
+						bn_set_target_ratio(work, vhash, 1);
+						work->valid_nonces++;
+					}
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; // cursor
+				}
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > ptarget[7]) {
+				gpu_increment_reject(thr_id);
+				if (!opt_quiet)	gpulog(LOG_WARNING, thr_id,
+					"result for %08x does not validate on CPU!", work->nonces[0]);
+				// Resume right after the rejected nonce. Restarting at the
+				// nonce itself would make the GPU report it again on every
+				// pass until the work is restarted.
+				pdata[19] = work->nonces[0] + 1;
+				continue;
+			}
+		}
+
+		// 64-bit sum so the range check cannot wrap around
+		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
+		pdata[19] += throughput;
+
+	} while (!work_restart[thr_id].restart);
+
+	*hashes_done = pdata[19] - first_nonce;
+	return 0;
+}
+
+// cleanup: release the device buffers allocated by scanhash_lyra2Z for this
+// thread; does nothing when the thread was never initialised
+extern "C" void free_lyra2Z(int thr_id)
+{
+	int dev_id = device_map[thr_id];
+	if (!init[thr_id])
+		return;
+
+	// wait for pending GPU work before freeing (cudaThreadSynchronize() is
+	// deprecated, cudaDeviceSynchronize() is its replacement)
+	cudaDeviceSynchronize();
+
+	cudaFree(d_hash[thr_id]);
+	// d_matrix is only allocated for devices with device_sm >= 350
+	if (device_sm[dev_id] >= 350)
+		cudaFree(d_matrix[thr_id]);
+	init[thr_id] = false;
+
+	cudaDeviceSynchronize();
+}
--- a/miner.h
+++ b/miner.h
@ -292,6 +292,7 @@ extern int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, unsi
 extern int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_lyra2v2(int thr_id,struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
+extern int scanhash_lyra2Z(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_myriad(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_neoscrypt(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
@ -344,6 +345,7 @@ extern void free_lbry(int thr_id);
 extern void free_luffa(int thr_id);
 extern void free_lyra2(int thr_id);
 extern void free_lyra2v2(int thr_id);
+extern void free_lyra2Z(int thr_id);
 extern void free_myriad(int thr_id);
 extern void free_neoscrypt(int thr_id);
 extern void free_nist5(int thr_id);
@ -863,6 +865,7 @@ void groestlhash(void *state, const void *input);
 void lbry_hash(void *output, const void *input);
 void lyra2re_hash(void *state, const void *input);
 void lyra2v2_hash(void *state, const void *input);
+void lyra2Z_hash(void *state, const void *input);
 void myriadhash(void *state, const void *input);
 void neoscrypt(uchar *output, const uchar *input, uint32_t profile);
 void nist5hash(void *state, const void *input);
--- a/util.cpp
+++ b/util.cpp
@ -2201,6 +2201,9 @@ void print_hash_tests(void)
 	lyra2v2_hash(&hash[0], &buf[0]);
 	printpfx("lyra2v2", hash);

+	lyra2Z_hash(&hash[0], &buf[0]);
+	printpfx("lyra2z", hash);
+
 	myriadhash(&hash[0], &buf[0]);
 	printpfx("myriad", hash);