Add Lyra2 algo, based on Vertcoin published code

Seems to be djm34 work, i recognize the code style ;) Code was cleaned/indented and adapted to my fork... Only usable on the test pool until 16 december 2014!
10 years ago · c5b349e079
21 changed files with 2900 additions and 59 deletions
--- a/Algo256/blake256.cu
+++ b/Algo256/blake256.cu
--- a/Algo256/cuda_blake256.cu
+++ b/Algo256/cuda_blake256.cu
@ -0,0 +1,250 @@
 /**
 * Blake-256 Cuda Kernel (Tested on SM 5.0)
 *
 * Tanguy Pruvot - Nov. 2014
 */
 extern "C" {
 #include "sph/sph_blake.h"
 }
 #include "cuda_helper.h"
 #include <memory.h>
 static __device__ uint64_t cuda_swab32ll(uint64_t x) {
 	return MAKE_ULONGLONG(cuda_swab32(_LOWORD(x)), cuda_swab32(_HIWORD(x)));
 }
 __constant__ static uint32_t  c_data[20];
 __constant__ static uint32_t sigma[16][16];
 static uint32_t  c_sigma[16][16] = {
 	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
 	{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
 	{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
 	{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
 	{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
 	{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
 	{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
 	{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
 	{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
 	{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
 	{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
 	{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
 	{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
 	{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
 	{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
 	{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
 };
 static const uint32_t  c_IV256[8] = {
 	0x6A09E667, 0xBB67AE85,
 	0x3C6EF372, 0xA54FF53A,
 	0x510E527F, 0x9B05688C,
 	0x1F83D9AB, 0x5BE0CD19
 };
 __device__ __constant__ static uint32_t cpu_h[8];
 __device__ __constant__ static  uint32_t  u256[16];
 static const uint32_t  c_u256[16] = {
 	0x243F6A88, 0x85A308D3,
 	0x13198A2E, 0x03707344,
 	0xA4093822, 0x299F31D0,
 	0x082EFA98, 0xEC4E6C89,
 	0x452821E6, 0x38D01377,
 	0xBE5466CF, 0x34E90C6C,
 	0xC0AC29B7, 0xC97C50DD,
 	0x3F84D5B5, 0xB5470917
 };
 #define GS2(a,b,c,d,x) { \
 	const uint32_t idx1 = sigma[r][x]; \
 	const uint32_t idx2 = sigma[r][x+1]; \
 	v[a] += (m[idx1] ^ u256[idx2]) + v[b]; \
 	v[d] = SPH_ROTL32(v[d] ^ v[a], 16); \
 	v[c] += v[d]; \
 	v[b] = SPH_ROTR32(v[b] ^ v[c], 12); \
 \
 	v[a] += (m[idx2] ^ u256[idx1]) + v[b]; \
 	v[d] = SPH_ROTR32(v[d] ^ v[a], 8); \
 	v[c] += v[d]; \
 	v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \
 }
 //#define ROTL32(x, n) ((x) << (n)) | ((x) >> (32 - (n)))
 #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
 #define hostGS(a,b,c,d,x) { \
 	const uint32_t idx1 = c_sigma[r][x]; \
 	const uint32_t idx2 = c_sigma[r][x+1]; \
 	v[a] += (m[idx1] ^ c_u256[idx2]) + v[b]; \
 	v[d] = ROTR32(v[d] ^ v[a], 16); \
 	v[c] += v[d]; \
 	v[b] = ROTR32(v[b] ^ v[c], 12); \
 \
 	v[a] += (m[idx2] ^ c_u256[idx1]) + v[b]; \
 	v[d] = ROTR32(v[d] ^ v[a], 8); \
 	v[c] += v[d]; \
 	v[b] = ROTR32(v[b] ^ v[c], 7); \
 	}
 /* Second part (64-80) msg never change, store it */
 __device__ __constant__ static const uint32_t  c_Padding[16] = {
 	0, 0, 0, 0,
 	0x80000000, 0, 0, 0,
 	0, 0, 0, 0,
 	0, 1, 0, 640,
 };
 __host__ __forceinline__
 static void blake256_compress1st(uint32_t *h, const uint32_t *block, const uint32_t T0)
 {
 	uint32_t m[16];
 	uint32_t v[16];
 	for (int i = 0; i < 16; i++) {
 		m[i] = block[i];
 	}
 	for (int i = 0; i < 8; i++)
 		v[i] = h[i];
 	v[8] = c_u256[0];
 	v[9] = c_u256[1];
 	v[10] = c_u256[2];
 	v[11] = c_u256[3];
 	v[12] = c_u256[4] ^ T0;
 	v[13] = c_u256[5] ^ T0;
 	v[14] = c_u256[6];
 	v[15] = c_u256[7];
 	for (int r = 0; r < 14; r++) {
 		/* column step */
 		hostGS(0, 4, 0x8, 0xC, 0x0);
 		hostGS(1, 5, 0x9, 0xD, 0x2);
 		hostGS(2, 6, 0xA, 0xE, 0x4);
 		hostGS(3, 7, 0xB, 0xF, 0x6);
 		/* diagonal step */
 		hostGS(0, 5, 0xA, 0xF, 0x8);
 		hostGS(1, 6, 0xB, 0xC, 0xA);
 		hostGS(2, 7, 0x8, 0xD, 0xC);
 		hostGS(3, 4, 0x9, 0xE, 0xE);
 	}
 	for (int i = 0; i < 16; i++) {
 		int j = i & 7;
 		h[j] ^= v[i];
 	}
 }
 __device__ __forceinline__
 static void blake256_compress2nd(uint32_t *h, const uint32_t *block, const uint32_t T0)
 {
 	uint32_t m[16];
 	uint32_t v[16];
 	m[0] = block[0];
 	m[1] = block[1];
 	m[2] = block[2];
 	m[3] = block[3];
 	#pragma unroll
 	for (int i = 4; i < 16; i++) {
 		m[i] = c_Padding[i];
 	}
 	#pragma unroll 8
 	for (int i = 0; i < 8; i++)
 		v[i] = h[i];
 	v[8] =  u256[0];
 	v[9] =  u256[1];
 	v[10] = u256[2];
 	v[11] = u256[3];
 	v[12] = u256[4] ^ T0;
 	v[13] = u256[5] ^ T0;
 	v[14] = u256[6];
 	v[15] = u256[7];
 	#pragma unroll 14
 	for (int r = 0; r < 14; r++) {
 		/* column step */
 		GS2(0, 4, 0x8, 0xC, 0x0);
 		GS2(1, 5, 0x9, 0xD, 0x2);
 		GS2(2, 6, 0xA, 0xE, 0x4);
 		GS2(3, 7, 0xB, 0xF, 0x6);
 		/* diagonal step */
 		GS2(0, 5, 0xA, 0xF, 0x8);
 		GS2(1, 6, 0xB, 0xC, 0xA);
 		GS2(2, 7, 0x8, 0xD, 0xC);
 		GS2(3, 4, 0x9, 0xE, 0xE);
 	}
 	#pragma unroll 16
 	for (int i = 0; i < 16; i++) {
 		int j = i & 7;
 		h[j] ^= v[i];
 	}
 }
 __global__ __launch_bounds__(256,3)
 void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t * Hash)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		const uint32_t nonce = startNonce + thread;
 		uint32_t h[8];
 		uint32_t input[4];
 		#pragma unroll 8
 		for (int i = 0; i<8; i++) { h[i] = cpu_h[i];}
 		#pragma unroll 3
 		for (int i = 0; i < 3; ++i) input[i] = c_data[16 + i];
 		input[3] = nonce;
 		blake256_compress2nd(h, input, 640);
        #pragma unroll
 		for (int i = 0; i<4; i++) {
 			Hash[i*threads + thread] = cuda_swab32ll(MAKE_ULONGLONG(h[2 * i], h[2*i+1]));
 		}
 	}
 }
 __host__
 void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order)
 {
 	const int threadsperblock = 256;
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 	blake256_gpu_hash_80 <<<grid, block>>> (threads, startNonce, Hash);
 	MyStreamSynchronize(NULL, order, thr_id);
 }
 __host__
 void blake256_cpu_setBlock_80(uint32_t *pdata)
 {
 	uint32_t h[8];
 	uint32_t data[20];
 	memcpy(data, pdata, 80);
 	for (int i = 0; i<8; i++) {
 		h[i] = c_IV256[i];
 	}
 	blake256_compress1st(h, pdata, 512);
 	cudaMemcpyToSymbol(cpu_h, h, sizeof(h), 0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(c_data, data, sizeof(data), 0, cudaMemcpyHostToDevice);
 }
 __host__
 void blake256_cpu_init(int thr_id, int threads)
 {
 	cudaMemcpyToSymbol(u256, c_u256, sizeof(c_u256), 0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(sigma, c_sigma, sizeof(c_sigma), 0, cudaMemcpyHostToDevice);
 }
--- a/Algo256/cuda_fugue256.cu
+++ b/Algo256/cuda_fugue256.cu
@ -571,7 +571,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas
 		for(int i=0;i<30;i++)
 			sc[i] = GPUstate[i];
-		uint32_t nounce = startNounce + thread; // muss noch ermittelt werden	
+		uint32_t nounce = startNounce + thread; // muss noch ermittelt werden
 		uint32_t q;
@ -687,7 +687,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas
 		int i;
 		bool rc = true;
-	
+
 		for (i = 7; i >= 0; i--) {
 			if (hash[i] > pTarget[i]) {
 				rc = false;
@ -730,7 +730,7 @@ void fugue256_cpu_init(int thr_id, int threads)
 	// Speicher für alle Ergebnisse belegen
 	cudaMalloc(&d_fugue256_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads);
-	cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); 
+	cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
 }
 __host__ void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
--- a/Algo256/cuda_groestl256.cu
+++ b/Algo256/cuda_groestl256.cu
@ -0,0 +1,309 @@
 #include <memory.h>
 #include "cuda_helper.h"
 uint32_t *d_gnounce[8];
 uint32_t *d_GNonce[8];
 __constant__ uint32_t pTarget[8];
 #define SPH_C32(x)    ((uint32_t)(x ## U))
 #define SPH_T32(x)    ((x) & SPH_C32(0xFFFFFFFF))
 #define C32e(x) \
 	  ((SPH_C32(x) >> 24) \
 	| ((SPH_C32(x) >>  8) & SPH_C32(0x0000FF00)) \
 	| ((SPH_C32(x) <<  8) & SPH_C32(0x00FF0000)) \
 	| ((SPH_C32(x) << 24) & SPH_C32(0xFF000000)))
 #define PC32up(j, r)   ((uint32_t)((j) + (r)))
 #define PC32dn(j, r)   0
 #define QC32up(j, r)   0xFFFFFFFF
 #define QC32dn(j, r)   (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))
 #define B32_0(x)    __byte_perm(x, 0, 0x4440)
 //((x) & 0xFF)
 #define B32_1(x)    __byte_perm(x, 0, 0x4441)
 //(((x) >> 8) & 0xFF)
 #define B32_2(x)    __byte_perm(x, 0, 0x4442)
 //(((x) >> 16) & 0xFF)
 #define B32_3(x)    __byte_perm(x, 0, 0x4443)
 //((x) >> 24)
 #define MAXWELL_OR_FERMI 1
 #if MAXWELL_OR_FERMI
 	#define USE_SHARED 1
 	// Maxwell and Fermi cards get the best speed with SHARED access it seems.
 	#if USE_SHARED
 	#define T0up(x) (*((uint32_t*)mixtabs + (    (x))))
 	#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x))))
 	#define T1up(x) (*((uint32_t*)mixtabs + (512+(x))))
 	#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
 	#define T2up(x) (*((uint32_t*)mixtabs + (1024+(x))))
 	#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
 	#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
 	#define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x))))
 	#else
 	#define T0up(x) tex1Dfetch(t0up2, x)
 	#define T0dn(x) tex1Dfetch(t0dn2, x)
 	#define T1up(x) tex1Dfetch(t1up2, x)
 	#define T1dn(x) tex1Dfetch(t1dn2, x)
 	#define T2up(x) tex1Dfetch(t2up2, x)
 	#define T2dn(x) tex1Dfetch(t2dn2, x)
 	#define T3up(x) tex1Dfetch(t3up2, x)
 	#define T3dn(x) tex1Dfetch(t3dn2, x)
 	#endif
 #else
 	#define USE_SHARED 1
 	// a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5!
 	#define T0up(x) (*((uint32_t*)mixtabs + (    (x))))
 	#define T0dn(x) tex1Dfetch(t0dn2, x)
 	#define T1up(x) tex1Dfetch(t1up2, x)
 	#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
 	#define T2up(x) tex1Dfetch(t2up2, x)
 	#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
 	#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
 	#define T3dn(x) tex1Dfetch(t3dn2, x)
 #endif
 texture<unsigned int, 1, cudaReadModeElementType> t0up2;
 texture<unsigned int, 1, cudaReadModeElementType> t0dn2;
 texture<unsigned int, 1, cudaReadModeElementType> t1up2;
 texture<unsigned int, 1, cudaReadModeElementType> t1dn2;
 texture<unsigned int, 1, cudaReadModeElementType> t2up2;
 texture<unsigned int, 1, cudaReadModeElementType> t2dn2;
 texture<unsigned int, 1, cudaReadModeElementType> t3up2;
 texture<unsigned int, 1, cudaReadModeElementType> t3dn2;
 #define RSTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \
 	t[d0] = T0up(B32_0(a[b0])) \
 		^ T1up(B32_1(a[b1])) \
 		^ T2up(B32_2(a[b2])) \
 		^ T3up(B32_3(a[b3])) \
 		^ T0dn(B32_0(a[b4])) \
 		^ T1dn(B32_1(a[b5])) \
 		^ T2dn(B32_2(a[b6])) \
 		^ T3dn(B32_3(a[b7])); \
 	t[d1] = T0dn(B32_0(a[b0])) \
 		^ T1dn(B32_1(a[b1])) \
 		^ T2dn(B32_2(a[b2])) \
 		^ T3dn(B32_3(a[b3])) \
 		^ T0up(B32_0(a[b4])) \
 		^ T1up(B32_1(a[b5])) \
 		^ T2up(B32_2(a[b6])) \
 		^ T3up(B32_3(a[b7])); \
 	} while (0)
 extern uint32_t T0up_cpu[];
 extern uint32_t T0dn_cpu[];
 extern uint32_t T1up_cpu[];
 extern uint32_t T1dn_cpu[];
 extern uint32_t T2up_cpu[];
 extern uint32_t T2dn_cpu[];
 extern uint32_t T3up_cpu[];
 extern uint32_t T3dn_cpu[];
 __device__ __forceinline__
 void groestl256_perm_P(int thread,uint32_t *a, char *mixtabs)
 {
 	#pragma unroll 10
 	for (int r = 0; r<10; r++)
 	{
 		uint32_t t[16];
 		a[0x0] ^= PC32up(0x00, r);
 		a[0x2] ^= PC32up(0x10, r);
 		a[0x4] ^= PC32up(0x20, r);
 		a[0x6] ^= PC32up(0x30, r);
 		a[0x8] ^= PC32up(0x40, r);
 		a[0xA] ^= PC32up(0x50, r);
 		a[0xC] ^= PC32up(0x60, r);
 		a[0xE] ^= PC32up(0x70, r);
 		RSTT(0x0, 0x1, a, 0x0, 0x2, 0x4, 0x6, 0x9, 0xB, 0xD, 0xF);
 		RSTT(0x2, 0x3, a, 0x2, 0x4, 0x6, 0x8, 0xB, 0xD, 0xF, 0x1);
 		RSTT(0x4, 0x5, a, 0x4, 0x6, 0x8, 0xA, 0xD, 0xF, 0x1, 0x3);
 		RSTT(0x6, 0x7, a, 0x6, 0x8, 0xA, 0xC, 0xF, 0x1, 0x3, 0x5);
 		RSTT(0x8, 0x9, a, 0x8, 0xA, 0xC, 0xE, 0x1, 0x3, 0x5, 0x7);
 		RSTT(0xA, 0xB, a, 0xA, 0xC, 0xE, 0x0, 0x3, 0x5, 0x7, 0x9);
 		RSTT(0xC, 0xD, a, 0xC, 0xE, 0x0, 0x2, 0x5, 0x7, 0x9, 0xB);
 		RSTT(0xE, 0xF, a, 0xE, 0x0, 0x2, 0x4, 0x7, 0x9, 0xB, 0xD);
 		#pragma unroll 16
 		for (int k = 0; k<16; k++)
 			a[k] = t[k];
 	}
 }
 __device__ __forceinline__
 void groestl256_perm_Q(int thread, uint32_t *a, char *mixtabs)
 {
 	#pragma unroll
 	for (int r = 0; r<10; r++)
 	{
 		uint32_t t[16];
 		a[0x0] ^= QC32up(0x00, r);
 		a[0x1] ^= QC32dn(0x00, r);
 		a[0x2] ^= QC32up(0x10, r);
 		a[0x3] ^= QC32dn(0x10, r);
 		a[0x4] ^= QC32up(0x20, r);
 		a[0x5] ^= QC32dn(0x20, r);
 		a[0x6] ^= QC32up(0x30, r);
 		a[0x7] ^= QC32dn(0x30, r);
 		a[0x8] ^= QC32up(0x40, r);
 		a[0x9] ^= QC32dn(0x40, r);
 		a[0xA] ^= QC32up(0x50, r);
 		a[0xB] ^= QC32dn(0x50, r);
 		a[0xC] ^= QC32up(0x60, r);
 		a[0xD] ^= QC32dn(0x60, r);
 		a[0xE] ^= QC32up(0x70, r);
 		a[0xF] ^= QC32dn(0x70, r);
 		RSTT(0x0, 0x1, a, 0x2, 0x6, 0xA, 0xE, 0x1, 0x5, 0x9, 0xD);
 		RSTT(0x2, 0x3, a, 0x4, 0x8, 0xC, 0x0, 0x3, 0x7, 0xB, 0xF);
 		RSTT(0x4, 0x5, a, 0x6, 0xA, 0xE, 0x2, 0x5, 0x9, 0xD, 0x1);
 		RSTT(0x6, 0x7, a, 0x8, 0xC, 0x0, 0x4, 0x7, 0xB, 0xF, 0x3);
 		RSTT(0x8, 0x9, a, 0xA, 0xE, 0x2, 0x6, 0x9, 0xD, 0x1, 0x5);
 		RSTT(0xA, 0xB, a, 0xC, 0x0, 0x4, 0x8, 0xB, 0xF, 0x3, 0x7);
 		RSTT(0xC, 0xD, a, 0xE, 0x2, 0x6, 0xA, 0xD, 0x1, 0x5, 0x9);
 		RSTT(0xE, 0xF, a, 0x0, 0x4, 0x8, 0xC, 0xF, 0x3, 0x7, 0xB);
 		#pragma unroll
 		for (int k = 0; k<16; k++)
 			a[k] = t[k];
 	}
 }
 __global__ __launch_bounds__(256,1)
 void groestl256_gpu_hash32(int threads, uint32_t startNounce, uint64_t *outputHash, uint32_t *nonceVector)
 {
 #if USE_SHARED
 	extern __shared__ char mixtabs[];
 	if (threadIdx.x < 256) {
 		*((uint32_t*)mixtabs + (threadIdx.x)) = tex1Dfetch(t0up2, threadIdx.x);
 		*((uint32_t*)mixtabs + (256 + threadIdx.x)) = tex1Dfetch(t0dn2, threadIdx.x);
 		*((uint32_t*)mixtabs + (512 + threadIdx.x)) = tex1Dfetch(t1up2, threadIdx.x);
 		*((uint32_t*)mixtabs + (768 + threadIdx.x)) = tex1Dfetch(t1dn2, threadIdx.x);
 		*((uint32_t*)mixtabs + (1024 + threadIdx.x)) = tex1Dfetch(t2up2, threadIdx.x);
 		*((uint32_t*)mixtabs + (1280 + threadIdx.x)) = tex1Dfetch(t2dn2, threadIdx.x);
 		*((uint32_t*)mixtabs + (1536 + threadIdx.x)) = tex1Dfetch(t3up2, threadIdx.x);
 		*((uint32_t*)mixtabs + (1792 + threadIdx.x)) = tex1Dfetch(t3dn2, threadIdx.x);
 	}
 	__syncthreads();
 #endif
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		// GROESTL
 		uint32_t message[16];
 		uint32_t state[16];
 		#pragma unroll
 		for (int k = 0; k<4; k++)
 			LOHI(message[2*k], message[2*k+1], outputHash[k*threads+thread]);
 		#pragma unroll
 		for (int k = 9; k<15; k++)
 			message[k] = 0;
 		message[8] = 0x80;
 		message[15] = 0x01000000;
 		#pragma unroll 16
 		for (int u = 0; u<16; u++)
 			state[u] = message[u];
 		state[15] ^= 0x10000;
 		// Perm
 #if USE_SHARED
 		groestl256_perm_P(thread, state, mixtabs);
 		state[15] ^= 0x10000;
 		groestl256_perm_Q(thread, message, mixtabs);
 #else
 		groestl256_perm_P(thread, state, NULL);
 		state[15] ^= 0x10000;
 		groestl256_perm_P(thread, message, NULL);
 #endif
 		#pragma unroll 16
 		for (int u = 0; u<16; u++) state[u] ^= message[u];
 		#pragma unroll 16
 		for (int u = 0; u<16; u++) message[u] = state[u];
 #if USE_SHARED
 		groestl256_perm_P(thread, message, mixtabs);
 #else
 		groestl256_perm_P(thread, message, NULL);
 #endif
 		state[14] ^= message[14];
 		state[15] ^= message[15];
 		uint32_t nonce = startNounce + thread;
 		if (state[15] <= pTarget[7]) {
 			nonceVector[0] = nonce;
 		}
 	}
 }
 #define texDef(texname, texmem, texsource, texsize) \
 	unsigned int *texmem; \
 	cudaMalloc(&texmem, texsize); \
 	cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
 	texname.normalized = 0; \
 	texname.filterMode = cudaFilterModePoint; \
 	texname.addressMode[0] = cudaAddressModeClamp; \
 	{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
 	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
 __host__
 void groestl256_cpu_init(int thr_id, int threads)
 {
 	// Texturen mit obigem Makro initialisieren
 	texDef(t0up2, d_T0up, T0up_cpu, sizeof(uint32_t) * 256);
 	texDef(t0dn2, d_T0dn, T0dn_cpu, sizeof(uint32_t) * 256);
 	texDef(t1up2, d_T1up, T1up_cpu, sizeof(uint32_t) * 256);
 	texDef(t1dn2, d_T1dn, T1dn_cpu, sizeof(uint32_t) * 256);
 	texDef(t2up2, d_T2up, T2up_cpu, sizeof(uint32_t) * 256);
 	texDef(t2dn2, d_T2dn, T2dn_cpu, sizeof(uint32_t) * 256);
 	texDef(t3up2, d_T3up, T3up_cpu, sizeof(uint32_t) * 256);
 	texDef(t3dn2, d_T3dn, T3dn_cpu, sizeof(uint32_t) * 256);
 	cudaMalloc(&d_GNonce[thr_id], sizeof(uint32_t));
 	cudaMallocHost(&d_gnounce[thr_id], 1*sizeof(uint32_t));
 }
 __host__
 uint32_t groestl256_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
 {
 	uint32_t result = 0xffffffff;
 	cudaMemset(d_GNonce[thr_id], 0xff, sizeof(uint32_t));
 	const int threadsperblock = 256;
 	// berechne wie viele Thread Blocks wir brauchen
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 #if USE_SHARED
 	size_t shared_size = 8 * 256 * sizeof(uint32_t);
 #else
 	size_t shared_size = 0;
 #endif
 	groestl256_gpu_hash32<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash, d_GNonce[thr_id]);
 	MyStreamSynchronize(NULL, order, thr_id);
 	cudaMemcpy(d_gnounce[thr_id], d_GNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
 	cudaThreadSynchronize();
 	result = *d_gnounce[thr_id];
 	return result;
 }
 __host__
 void groestl256_setTarget(const void *pTargetIn)
 {
 	cudaMemcpyToSymbol(pTarget, pTargetIn, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
 }
--- a/Algo256/cuda_keccak256.cu
+++ b/Algo256/cuda_keccak256.cu
@ -27,11 +27,81 @@ uint32_t *d_KNonce[8];
 __constant__ uint32_t pTarget[8];
 __constant__ uint64_t keccak_round_constants[24];
-__constant__ uint64_t c_PaddedMessage80[10]; // padded message (80 bytes + padding)
+__constant__ uint64_t c_PaddedMessage80[10]; // padded message (80 bytes + padding?)
 #if __CUDA_ARCH__ >= 350
 __device__ __forceinline__
 static void keccak_blockv35(uint2 *s, const uint64_t *keccak_round_constants)
 {
 	size_t i;
 	uint2 t[5], u[5], v, w;
 	#pragma unroll
 	for (i = 0; i < 24; i++) {
 		/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
 		t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
 		t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
 		t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
 		t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
 		t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
 		/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
 		u[0] = t[4] ^ ROL2(t[1], 1);
 		u[1] = t[0] ^ ROL2(t[2], 1);
 		u[2] = t[1] ^ ROL2(t[3], 1);
 		u[3] = t[2] ^ ROL2(t[4], 1);
 		u[4] = t[3] ^ ROL2(t[0], 1);
 		/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
 		s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0];
 		s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1];
 		s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2];
 		s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3];
 		s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4];
 		/* rho pi: b[..] = rotl(a[..], ..) */
 		v = s[1];
 		s[1] = ROL2(s[6], 44);
 		s[6] = ROL2(s[9], 20);
 		s[9] = ROL2(s[22], 61);
 		s[22] = ROL2(s[14], 39);
 		s[14] = ROL2(s[20], 18);
 		s[20] = ROL2(s[2], 62);
 		s[2] = ROL2(s[12], 43);
 		s[12] = ROL2(s[13], 25);
 		s[13] = ROL2(s[19], 8);
 		s[19] = ROL2(s[23], 56);
 		s[23] = ROL2(s[15], 41);
 		s[15] = ROL2(s[4], 27);
 		s[4] = ROL2(s[24], 14);
 		s[24] = ROL2(s[21], 2);
 		s[21] = ROL2(s[8], 55);
 		s[8] = ROL2(s[16], 45);
 		s[16] = ROL2(s[5], 36);
 		s[5] = ROL2(s[3], 28);
 		s[3] = ROL2(s[18], 21);
 		s[18] = ROL2(s[17], 15);
 		s[17] = ROL2(s[11], 10);
 		s[11] = ROL2(s[7], 6);
 		s[7] = ROL2(s[10], 3);
 		s[10] = ROL2(v, 1);
 		/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
 		v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w;
 		v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w;
 		v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w;
 		v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w;
 		v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w;
-static __device__ __forceinline__
+		/* iota: a[0,0] ^= round constant */
-void keccak_block(uint64_t *s, const uint64_t *keccak_round_constants) {
+		s[0] ^= vectorize(keccak_round_constants[i]);
 	}
 }
 #endif
 __device__ __forceinline__
 static void keccak_blockv30(uint64_t *s, const uint64_t *keccak_round_constants)
 {
 	size_t i;
 	uint64_t t[5], u[5], v, w;
@ -109,14 +179,16 @@ void keccak256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash,
 		//#pragma unroll 25
 		for (int i=0; i<25; i++) {
-			if(i<9) {keccak_gpu_state[i] = c_PaddedMessage80[i];}
+			if (i < 9)
-			else    {keccak_gpu_state[i] = 0;}
+				keccak_gpu_state[i] = c_PaddedMessage80[i];
 			else
 				keccak_gpu_state[i] = 0;
 		}
-		keccak_gpu_state[9]=REPLACE_HIWORD(c_PaddedMessage80[9],cuda_swab32(nounce));
+		keccak_gpu_state[9]  = REPLACE_HIWORD(c_PaddedMessage80[9], cuda_swab32(nounce));
-		keccak_gpu_state[10]=0x0000000000000001;
+		keccak_gpu_state[10] = 0x0000000000000001;
-		keccak_gpu_state[16]=0x8000000000000000;
+		keccak_gpu_state[16] = 0x8000000000000000;
-		keccak_block(keccak_gpu_state,keccak_round_constants);
+		keccak_blockv30(keccak_gpu_state, keccak_round_constants);
 		bool rc = false;
 		if (keccak_gpu_state[3] <= ((uint64_t*)pTarget)[3]) {rc = true;}
@ -125,18 +197,7 @@ void keccak256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash,
 			if(resNounce[0] > nounce)
 				resNounce[0] = nounce;
 		}
-	} //thread
+	}
 }
 void keccak256_cpu_init(int thr_id, int threads)
 {
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(keccak_round_constants,
 						host_keccak_round_constants,
 						sizeof(host_keccak_round_constants),
 						0, cudaMemcpyHostToDevice));
 	CUDA_SAFE_CALL(cudaMalloc(&d_KNonce[thr_id], sizeof(uint32_t)));
 	CUDA_SAFE_CALL(cudaMallocHost(&d_nounce[thr_id], 1*sizeof(uint32_t)));
 }
 __host__
@ -161,6 +222,66 @@ uint32_t keccak256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, ui
 	return result;
 }
 #ifdef _MSC_VER
 #define UINT2(a, b) { a, b }
 #else
 #define UINT2(a, b) (uint2) { a, b }
 #endif
 __global__ __launch_bounds__(256,3)
 void keccak256_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHash)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 #if __CUDA_ARCH__ >= 350 /* tpr: to double check if faster on SM5+ */
 		uint2 keccak_gpu_state[25];
 		#pragma unroll 25
 		for (int i = 0; i<25; i++) {
 			if (i < 4)
 				keccak_gpu_state[i] = vectorize(outputHash[i*threads+thread]);
 			else
 				keccak_gpu_state[i] = UINT2(0, 0);
 		}
 		keccak_gpu_state[4]  = UINT2(1, 0);
 		keccak_gpu_state[16] = UINT2(0, 0x80000000);
 		keccak_blockv35(keccak_gpu_state, keccak_round_constants);
 		#pragma unroll 4
 		for (int i=0; i<4;i++)
 			outputHash[i*threads+thread]=devectorize(keccak_gpu_state[i]);
 #else
 		uint64_t keccak_gpu_state[25];
 		#pragma unroll 25
 		for (int i = 0; i<25; i++) {
 			if (i<4)
 				keccak_gpu_state[i] = outputHash[i*threads+thread];
 			else
 				keccak_gpu_state[i] = 0;
 		}
 		keccak_gpu_state[4] = 0x0000000000000001;
 		keccak_gpu_state[16] = 0x8000000000000000;
 		keccak_blockv30(keccak_gpu_state, keccak_round_constants);
 		#pragma unroll 4
 		for (int i = 0; i<4; i++)
 			outputHash[i*threads + thread] = keccak_gpu_state[i];
 #endif
 	}
 }
 __host__
 void keccak256_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
 {
 	const int threadsperblock = 256;
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 	keccak256_gpu_hash_32 <<<grid, block>>> (threads, startNounce, d_outputHash);
 	MyStreamSynchronize(NULL, order, thr_id);
 }
 __host__
 void keccak256_setBlock_80(void *pdata,const void *pTargetIn)
 {
@ -168,4 +289,13 @@ void keccak256_setBlock_80(void *pdata,const void *pTargetIn)
 	memcpy(PaddedMessage, pdata, 80);
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, pTargetIn, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 10*sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
-}
+}
 __host__
 void keccak256_cpu_init(int thr_id, int threads)
 {
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(keccak_round_constants, host_keccak_round_constants,
 				sizeof(host_keccak_round_constants), 0, cudaMemcpyHostToDevice));
 	CUDA_SAFE_CALL(cudaMalloc(&d_KNonce[thr_id], sizeof(uint32_t)));
 	CUDA_SAFE_CALL(cudaMallocHost(&d_nounce[thr_id], 1*sizeof(uint32_t)));
 }
--- a/Algo256/cuda_skein256.cu
+++ b/Algo256/cuda_skein256.cu
@ -0,0 +1,196 @@
 #include <memory.h>
 #include "cuda_helper.h"
 #if 0
 static __constant__ uint64_t SKEIN_IV512_256[8] = {
 	0xCCD044A12FDB3E13, 0xE83590301A79A9EB,
 	0x55AEA0614F816E6F, 0x2A2767A4AE9B94DB,
 	0xEC06025E74DD7683, 0xE7A436CDC4746251,
 	0xC36FBAF9393AD185, 0x3EEDBA1833EDFC13
 };
 #endif
 static __constant__ uint2 vSKEIN_IV512_256[8] = {
 	{ 0x2FDB3E13, 0xCCD044A1 },
 	{ 0x1A79A9EB, 0xE8359030 },
 	{ 0x4F816E6F, 0x55AEA061 },
 	{ 0xAE9B94DB, 0x2A2767A4 },
 	{ 0x74DD7683, 0xEC06025E },
 	{ 0xC4746251, 0xE7A436CD },
 	{ 0x393AD185, 0xC36FBAF9 },
 	{ 0x33EDFC13, 0x3EEDBA18 }
 };
 static __constant__ int ROT256[8][4] =
 {
 	46,36, 19, 37,
 	33,27, 14, 42,
 	17,49, 36, 39,
 	44, 9, 54, 56,
 	39,30, 34, 24,
 	13,50, 10, 17,
 	25,29, 39, 43,
 	8, 35, 56, 22,
 };
 static __constant__ uint2 skein_ks_parity = { 0xA9FC1A22,0x1BD11BDA};
 static __constant__ uint2 t12[6] = {
 	{ 0x20,	0 },
 	{ 0,	0xf0000000 },
 	{ 0x20,	0xf0000000 },
 	{ 0x08,	0 },
 	{ 0,	0xff000000 },
 	{ 0x08,	0xff000000 }
 };
 #if 0
 static __constant__ uint64_t t12_30[6] = {
 	0x20,
 	0xf000000000000000,
 	0xf000000000000020,
 	0x08,
 	0xff00000000000000,
 	0xff00000000000008
 };
 #endif
 static __forceinline__ __device__
 void Round512v35(uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, int ROT)
 {
 	p0 += p1; p1 = ROL2(p1, ROT256[ROT][0]);  p1 ^= p0;
 	p2 += p3; p3 = ROL2(p3, ROT256[ROT][1]);  p3 ^= p2;
 	p4 += p5; p5 = ROL2(p5, ROT256[ROT][2]);  p5 ^= p4;
 	p6 += p7; p7 = ROL2(p7, ROT256[ROT][3]);  p7 ^= p6;
 }
 static __forceinline__ __device__
 void Round_8_512v35(uint2 *ks, uint2 *ts,
                    uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3,
                    uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, int R)
 {
 	Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 0);
 	Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 1);
 	Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 2);
 	Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 3);
 	p0 += ks[((R)+0) % 9];   /* inject the key schedule value */
 	p1 += ks[((R)+1) % 9];
 	p2 += ks[((R)+2) % 9];
 	p3 += ks[((R)+3) % 9];
 	p4 += ks[((R)+4) % 9];
 	p5 += ks[((R)+5) % 9] + ts[((R)+0) % 3];
 	p6 += ks[((R)+6) % 9] + ts[((R)+1) % 3];
 	p7 += ks[((R)+7) % 9] + make_uint2((R),0);
 	Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 4);
 	Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 5);
 	Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 6);
 	Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 7);
 	p0 += ks[((R)+1) % 9];   /* inject the key schedule value */
 	p1 += ks[((R)+2) % 9];
 	p2 += ks[((R)+3) % 9];
 	p3 += ks[((R)+4) % 9];
 	p4 += ks[((R)+5) % 9];
 	p5 += ks[((R)+6) % 9] + ts[((R)+1) % 3];
 	p6 += ks[((R)+7) % 9] + ts[((R)+2) % 3];
 	p7 += ks[((R)+8) % 9] + make_uint2((R)+1, 0);
 }
 __global__ __launch_bounds__(256,3)
 void skein256_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHash)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint2 h[9];
 		uint2 t[3];
 		uint2 dt0,dt1,dt2,dt3;
 		uint2 p0, p1, p2, p3, p4, p5, p6, p7;
 		h[8] = skein_ks_parity;
 		for (int i = 0; i<8; i++) {
 			h[i] = vSKEIN_IV512_256[i];
 			h[8] ^= h[i];
 		}
 		t[0]=t12[0];
 		t[1]=t12[1];
 		t[2]=t12[2];
 		LOHI(dt0.x,dt0.y,outputHash[thread]);
 		LOHI(dt1.x,dt1.y,outputHash[threads+thread]);
 		LOHI(dt2.x,dt2.y,outputHash[2*threads+thread]);
 		LOHI(dt3.x,dt3.y,outputHash[3*threads+thread]);
 		p0 = h[0] + dt0;
 		p1 = h[1] + dt1;
 		p2 = h[2] + dt2;
 		p3 = h[3] + dt3;
 		p4 = h[4];
 		p5 = h[5] + t[0];
 		p6 = h[6] + t[1];
 		p7 = h[7];
 		#pragma unroll
 		for (int i = 1; i<19; i+=2) {
 			Round_8_512v35(h,t,p0,p1,p2,p3,p4,p5,p6,p7,i);
 		}
 		p0 ^= dt0;
 		p1 ^= dt1;
 		p2 ^= dt2;
 		p3 ^= dt3;
 		h[0] = p0;
 		h[1] = p1;
 		h[2] = p2;
 		h[3] = p3;
 		h[4] = p4;
 		h[5] = p5;
 		h[6] = p6;
 		h[7] = p7;
 		h[8] = skein_ks_parity;
 		#pragma unroll 8
 		for (int i = 0; i<8; i++) {
 			h[8] ^= h[i];
 		}
 		t[0] = t12[3];
 		t[1] = t12[4];
 		t[2] = t12[5];
 		p5 += t[0];  //p5 already equal h[5]
 		p6 += t[1];
 		#pragma unroll
 		for (int i = 1; i<19; i+=2) {
 			Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, i);
 		}
 		outputHash[thread]           = devectorize(p0);
 		outputHash[threads+thread]   = devectorize(p1);
 		outputHash[2*threads+thread] = devectorize(p2);
 		outputHash[3*threads+thread] = devectorize(p3);
 	}
 }
 __host__
 void skein256_cpu_init(int thr_id, int threads)
 {
 	//empty
 }
 __host__
 void skein256_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
 {
 	const int threadsperblock = 256;
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 	skein256_gpu_hash_32<<<grid, block>>>(threads, startNounce, d_outputHash);
 	MyStreamSynchronize(NULL, order, thr_id);
 }
--- a/Algo256/keccak256.cu
+++ b/Algo256/keccak256.cu
--- a/Makefile.am
+++ b/Makefile.am
@ -10,11 +10,11 @@ EXTRA_DIST		= autogen.sh README.txt LICENSE.txt \
 			  cudaminer.sln cudaminer.vcxproj cudaminer.vcxproj.filters \
 			  compat/gettimeofday.c compat/getopt/getopt_long.c cpuminer-config.h.in
-SUBDIRS		= compat
+SUBDIRS = compat
-bin_PROGRAMS	= ccminer
+bin_PROGRAMS = ccminer
-ccminer_SOURCES		= elist.h miner.h compat.h \
+ccminer_SOURCES	= elist.h miner.h compat.h \
 			  compat/inttypes.h compat/stdbool.h compat/unistd.h \
 			  compat/sys/time.h compat/getopt/getopt.h \
 			  crc32.c hefty1.c scrypt.c \
@ -27,17 +27,20 @@ ccminer_SOURCES		= elist.h miner.h compat.h \
 			  heavy/cuda_hefty1.cu heavy/cuda_hefty1.h \
 			  heavy/cuda_keccak512.cu heavy/cuda_keccak512.h \
 			  heavy/cuda_sha256.cu heavy/cuda_sha256.h \
-			  keccak/cuda_keccak256.cu keccak/keccak256.cu \
+			  fuguecoin.cpp Algo256/cuda_fugue256.cu sph/fugue.c uint256.h \
 			  fuguecoin.cpp cuda_fugue256.cu sph/fugue.c sph/sph_fugue.h uint256.h \
 			  groestlcoin.cpp cuda_groestlcoin.cu cuda_groestlcoin.h \
 			  myriadgroestl.cpp cuda_myriadgroestl.cu \
 			  lyra2/Lyra2.c lyra2/Sponge.c \
 			  lyra2/lyra2RE.cu lyra2/cuda_lyra2.cu \
 			  Algo256/cuda_blake256.cu Algo256/cuda_groestl256.cu Algo256/cuda_keccak256.cu Algo256/cuda_skein256.cu \
 			  Algo256/blake256.cu Algo256/keccak256.cu \
 			  JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \
 			  JHA/cuda_jha_compactionTest.cu cuda_checkhash.cu \
 			  quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \
 			  quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu \
 			  quark/quarkcoin.cu quark/animecoin.cu \
 			  quark/cuda_quark_compactionTest.cu  \
-			  cuda_nist5.cu blake32.cu pentablake.cu \
+			  cuda_nist5.cu pentablake.cu \
 			  sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \
 			  sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
 			  sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \
--- a/README.txt
+++ b/README.txt
@ -1,5 +1,5 @@
-ccMiner release 1.5.0-tpruvot (27 Nov 2014) - "Extra nonce"
+ccMiner release 1.5.1-tpruvot (16 Dec 2014) - "Vertcoin Lyra2"
 ---------------------------------------------------------------
 ***************************************************************
@ -38,6 +38,7 @@ Keccak (Maxcoin)
 Deep, Doom and Qubit
 Pentablake (Blake 512 x5)
 S3 (OneCoin)
 Lyra2RE (new VertCoin algo)
 where some of these coins have a VERY NOTABLE nVidia advantage
 over competing AMD (OpenCL Only) implementations.
@ -68,6 +69,7 @@ its command line interface and options.
                          jackpot     use to mine Jackpotcoin
                          keccak      use to mine Maxcoin
                          luffa       use to mine Doomcoin
                          lyra2       use to mine Vertcoin
                          mjollnir    use to mine Mjollnircoin
                          myr-gr      use to mine Myriad-Groest
                          nist5       use to mine TalkCoin
@ -169,6 +171,12 @@ features.
 >>> RELEASE HISTORY <<<
  Dec. 2014       v1.5.1 (not released yet!)
                  Add lyra2 algo for Vertcoin (Release is 16 Dec 2014)
                  Multiple shares support (2 for the moment)
                  X11 optimisations (From klaust and sp-hash)
                  HTML5 WebSocket api compatibility (see api/websocket.htm)
  Nov. 27th 2014  v1.5.0
                  Upgrade compat jansson to 2.6 (for windows)
                  Add pool mining.set_extranonce support
--- a/ccminer.cpp
+++ b/ccminer.cpp
@ -138,6 +138,7 @@ enum sha_algos {
 	ALGO_KECCAK,
 	ALGO_JACKPOT,
 	ALGO_LUFFA_DOOM,
 	ALGO_LYRA,
 	ALGO_MJOLLNIR,		/* Hefty hash */
 	ALGO_MYR_GR,
 	ALGO_NIST5,
@ -167,6 +168,7 @@ static const char *algo_names[] = {
 	"keccak",
 	"jackpot",
 	"luffa",
 	"lyra2",
 	"mjollnir",
 	"myr-gr",
 	"nist5",
@ -272,6 +274,7 @@ Options:\n\
 			jackpot     Jackpot\n\
 			keccak      Keccak-256 (Maxcoin)\n\
 			luffa       Doomcoin\n\
 			lyra2       VertCoin\n\
 			mjollnir    Mjollnircoin\n\
 			myr-gr      Myriad-Groestl\n\
 			nist5       NIST5 (TalkCoin)\n\
@ -1255,6 +1258,11 @@ static void *miner_thread(void *userdata)
 			                      max_nonce, &hashes_done);
 			break;
 		case ALGO_LYRA:
 			rc = scanhash_lyra(thr_id, work.data, work.target,
 			                      max_nonce, &hashes_done);
 			break;
 		case ALGO_NIST5:
 			rc = scanhash_nist5(thr_id, work.data, work.target,
 			                      max_nonce, &hashes_done);
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -105,7 +105,7 @@
      <MaxRegCount>80</MaxRegCount>
      <PtxAsOptionV>true</PtxAsOptionV>
      <Keep>false</Keep>
-      <CodeGeneration>compute_50,sm_50</CodeGeneration>
+      <CodeGeneration>compute_30,sm_30;compute_50,sm_50</CodeGeneration>
    </CudaCompile>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
@ -173,7 +173,7 @@
      <MaxRegCount>80</MaxRegCount>
      <PtxAsOptionV>true</PtxAsOptionV>
      <Keep>false</Keep>
-      <CodeGeneration>compute_30,sm_30;compute_50,sm_50;</CodeGeneration>
+      <CodeGeneration>compute_50,sm_50;</CodeGeneration>
      <AdditionalOptions>--ptxas-options="-O2" %(AdditionalOptions)</AdditionalOptions>
      <Defines>
      </Defines>
@ -257,6 +257,8 @@
      <Optimization Condition="'$(Configuration)'=='Release'">Full</Optimization>
      <AdditionalOptions>/Tp %(AdditionalOptions)</AdditionalOptions>
    </ClCompile>
    <ClCompile Include="lyra2\Lyra2.c" />
    <ClCompile Include="lyra2\Sponge.c" />
    <ClCompile Include="sph\aes_helper.c" />
    <ClCompile Include="sph\blake.c" />
    <ClCompile Include="sph\bmw.c" />
@ -330,13 +332,15 @@
    <ClInclude Include="sph\sph_whirlpool.h" />
    <ClInclude Include="uint256.h" />
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="lyra2\Lyra2.h" />
    <ClInclude Include="lyra2\Sponge.h" />
  </ItemGroup>
  <ItemGroup>
    <CudaCompile Include="cuda.cpp" />
    <CudaCompile Include="bitslice_transformations_quad.cu">
      <ExcludedFromBuild>true</ExcludedFromBuild>
    </CudaCompile>
    <CudaCompile Include="cuda_fugue256.cu">
    </CudaCompile>
    <CudaCompile Include="cuda_groestlcoin.cu">
    </CudaCompile>
    <CudaCompile Include="cuda_myriadgroestl.cu">
@ -369,15 +373,19 @@
    </CudaCompile>
    <CudaCompile Include="JHA\jackpotcoin.cu">
    </CudaCompile>
-    <CudaCompile Include="blake32.cu">
+    <CudaCompile Include="Algo256\blake256.cu">
      <MaxRegCount>64</MaxRegCount>
      <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options="-dlcm=cg" %(AdditionalOptions)</AdditionalOptions>
      <FastMath>true</FastMath>
    </CudaCompile>
-    <CudaCompile Include="keccak\cuda_keccak256.cu">
+    <CudaCompile Include="Algo256\keccak256.cu" />
    <CudaCompile Include="Algo256\cuda_blake256.cu" />
    <CudaCompile Include="Algo256\cuda_fugue256.cu" />
    <CudaCompile Include="Algo256\cuda_groestl256.cu" />
    <CudaCompile Include="Algo256\cuda_keccak256.cu">
      <MaxRegCount>92</MaxRegCount>
    </CudaCompile>
-    <CudaCompile Include="keccak\keccak256.cu" />
+    <CudaCompile Include="Algo256\cuda_skein256.cu" />
    <CudaCompile Include="pentablake.cu">
      <MaxRegCount>80</MaxRegCount>
      <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options="-dlcm=cg" %(AdditionalOptions)</AdditionalOptions>
@ -418,6 +426,8 @@
    </CudaCompile>
    <CudaCompile Include="qubit\qubit_luffa512.cu">
    </CudaCompile>
    <CudaCompile Include="lyra2\lyra2RE.cu" />
    <CudaCompile Include="lyra2\cuda_lyra2.cu" />
    <CudaCompile Include="x11\cuda_x11_aes.cu">
      <ExcludedFromBuild>true</ExcludedFromBuild>
    </CudaCompile>
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@ -61,12 +61,15 @@
    <Filter Include="Source Files\jansson">
      <UniqueIdentifier>{17b56151-79ec-4a32-bac3-9d94ae7f68fe}</UniqueIdentifier>
    </Filter>
    <Filter Include="Source Files\CUDA\keccak">
      <UniqueIdentifier>{9762c92c-9677-4044-8292-ff6ba4bfdd89}</UniqueIdentifier>
    </Filter>
    <Filter Include="Header Files\compat\nvapi">
      <UniqueIdentifier>{ef6f9983-bda5-4fb2-adfa-ac4f29b74f25}</UniqueIdentifier>
    </Filter>
    <Filter Include="Source Files\CUDA\Algo256">
      <UniqueIdentifier>{9762c92c-9677-4044-8292-ff6ba4bfdd89}</UniqueIdentifier>
    </Filter>
    <Filter Include="Header Files\lyra2">
      <UniqueIdentifier>{2ff6e4ce-7c92-4cb2-a3ad-c331e94fd81d}</UniqueIdentifier>
    </Filter>
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="compat\jansson\dump.c">
@ -213,6 +216,12 @@
    <ClCompile Include="compat\jansson\error.c">
      <Filter>Source Files\jansson</Filter>
    </ClCompile>
    <ClCompile Include="lyra2\Lyra2.c">
      <Filter>Source Files\sph</Filter>
    </ClCompile>
    <ClCompile Include="lyra2\Sponge.c">
      <Filter>Source Files\sph</Filter>
    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="compat.h">
@ -347,14 +356,17 @@
    <ClInclude Include="compat\jansson\jansson_config.h">
      <Filter>Header Files\compat</Filter>
    </ClInclude>
    <ClInclude Include="lyra2\Lyra2.h">
      <Filter>Header Files\lyra2</Filter>
    </ClInclude>
    <ClInclude Include="lyra2\Sponge.h">
      <Filter>Header Files\lyra2</Filter>
    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <CudaCompile Include="cuda.cpp">
      <Filter>Source Files\CUDA</Filter>
    </CudaCompile>
    <CudaCompile Include="cuda_fugue256.cu">
      <Filter>Source Files\CUDA</Filter>
    </CudaCompile>
    <CudaCompile Include="cuda_groestlcoin.cu">
      <Filter>Source Files\CUDA</Filter>
    </CudaCompile>
@ -505,20 +517,38 @@
    <CudaCompile Include="x17\x17.cu">
      <Filter>Source Files\CUDA\x17</Filter>
    </CudaCompile>
-    <CudaCompile Include="blake32.cu">
+    <CudaCompile Include="pentablake.cu">
      <Filter>Source Files\CUDA</Filter>
    </CudaCompile>
    <CudaCompile Include="x11\s3.cu">
      <Filter>Source Files\CUDA\x11</Filter>
    </CudaCompile>
    <CudaCompile Include="Algo256\blake256.cu">
      <Filter>Source Files\CUDA</Filter>
    </CudaCompile>
-    <CudaCompile Include="pentablake.cu">
+    <CudaCompile Include="Algo256\keccak256.cu">
      <Filter>Source Files\CUDA</Filter>
    </CudaCompile>
-    <CudaCompile Include="keccak\cuda_keccak256.cu">
+    <CudaCompile Include="Algo256\cuda_blake256.cu">
-      <Filter>Source Files\CUDA\keccak</Filter>
+      <Filter>Source Files\CUDA\Algo256</Filter>
    </CudaCompile>
-    <CudaCompile Include="keccak\keccak256.cu">
+    <CudaCompile Include="Algo256\cuda_fugue256.cu">
-      <Filter>Source Files\CUDA\keccak</Filter>
+      <Filter>Source Files\CUDA\Algo256</Filter>
    </CudaCompile>
-    <CudaCompile Include="x11\s3.cu">
+    <CudaCompile Include="Algo256\cuda_groestl256.cu">
-      <Filter>Source Files\CUDA\x11</Filter>
+      <Filter>Source Files\CUDA\Algo256</Filter>
    </CudaCompile>
    <CudaCompile Include="Algo256\cuda_keccak256.cu">
      <Filter>Source Files\CUDA\Algo256</Filter>
    </CudaCompile>
    <CudaCompile Include="Algo256\cuda_skein256.cu">
      <Filter>Source Files\CUDA\Algo256</Filter>
    </CudaCompile>
    <CudaCompile Include="lyra2\cuda_lyra2.cu">
      <Filter>Source Files\CUDA</Filter>
    </CudaCompile>
    <CudaCompile Include="lyra2\lyra2RE.cu">
      <Filter>Source Files\CUDA</Filter>
    </CudaCompile>
  </ItemGroup>
 </Project>
--- a/cuda_helper.h
+++ b/cuda_helper.h
@ -355,7 +355,7 @@ uint64_t ROTL64(const uint64_t x, const int offset)
 		"setp.lt.u32 p, %2, 32;\n\t"
 		"@!p mov.b64 %0, {vl,vh};\n\t"
 		"@p  mov.b64 %0, {vh,vl};\n\t"
-		"}"
+	"}"
 		: "=l"(res) : "l"(x) , "r"(offset)
 	);
 	return res;
@ -378,4 +378,99 @@ uint64_t SWAPDWORDS(uint64_t value)
 #endif
 }
 /* lyra2 - int2 operators */
 __device__ __forceinline__
 void LOHI(uint32_t &lo, uint32_t &hi, uint64_t x) {
 	asm("mov.b64 {%0,%1},%2; \n\t"
 		: "=r"(lo), "=r"(hi) : "l"(x));
 }
 static __device__ __forceinline__ uint64_t devectorize(uint2 v) { return MAKE_ULONGLONG(v.x, v.y); }
 static __device__ __forceinline__ uint2 vectorize(uint64_t v) {
 	uint2 result;
 	LOHI(result.x, result.y, v);
 	return result;
 }
 static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) { return make_uint2(a.x ^ b.x, a.y ^ b.y); }
 static __device__ __forceinline__ uint2 operator& (uint2 a, uint2 b) { return make_uint2(a.x & b.x, a.y & b.y); }
 static __device__ __forceinline__ uint2 operator| (uint2 a, uint2 b) { return make_uint2(a.x | b.x, a.y | b.y); }
 static __device__ __forceinline__ uint2 operator~ (uint2 a) { return make_uint2(~a.x, ~a.y); }
 static __device__ __forceinline__ void operator^= (uint2 &a, uint2 b) { a = a ^ b; }
 static __device__ __forceinline__ uint2 operator+ (uint2 a, uint2 b)
 {
 	uint2 result;
 	asm("{\n\t"
 		"add.cc.u32 %0,%2,%4; \n\t"
 		"addc.u32 %1,%3,%5;   \n\t"
 	"}\n\t"
 		: "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
 	return result;
 }
 static __device__ __forceinline__ void operator+= (uint2 &a, uint2 b) { a = a + b; }
 /**
 * basic multiplication between 64bit no carry outside that range (ie mul.lo.b64(a*b))
 * (what does uint64 "*" operator)
 */
 static __device__ __forceinline__ uint2 operator* (uint2 a, uint2 b)
 {
 	uint2 result;
 	asm("{\n\t"
 		"mul.lo.u32        %0,%2,%4;  \n\t"
 		"mul.hi.u32        %1,%2,%4;  \n\t"
 		"mad.lo.cc.u32    %1,%3,%4,%1; \n\t"
 		"madc.lo.u32      %1,%3,%5,%1; \n\t"
 	"}\n\t"
 		: "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
 	return result;
 }
 // uint2 method
 #if  __CUDA_ARCH__ >= 350
 __device__ __inline__ uint2 ROR2(const uint2 a, const int offset) {
 	uint2 result;
 	if (offset < 32) {
 		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
 		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
 	}
 	else {
 		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
 		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
 	}
 	return result;
 }
 #else
 __device__ __inline__ uint2 ROR2(const uint2 v, const int n) {
 	uint2 result;
 	result.x = (((v.x) >> (n)) | ((v.x) << (64 - (n))));
 	result.y = (((v.y) >> (n)) | ((v.y) << (64 - (n))));
 	return result;
 }
 #endif
 #if  __CUDA_ARCH__ >= 350
 __inline__ __device__ uint2 ROL2(const uint2 a, const int offset) {
 	uint2 result;
 	if (offset >= 32) {
 		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
 		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
 	}
 	else {
 		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
 		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
 	}
 	return result;
 }
 #else
 __inline__ __device__ uint2 ROL2(const uint2 v, const int n) {
 	uint2 result;
 	result.x = (((v.x) << (n)) | ((v.x) >> (64 - (n))));
 	result.y = (((v.y) << (n)) | ((v.y) >> (64 - (n))));
 	return result;
 }
 #endif
 #endif // #ifndef CUDA_HELPER_H
--- a/lyra2/Lyra2.c
+++ b/lyra2/Lyra2.c
@ -0,0 +1,211 @@
 /**
 * Implementation of the Lyra2 Password Hashing Scheme (PHS).
 *
 * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
 *
 * This software is hereby placed in the public domain.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <time.h>
 #include "Lyra2.h"
 #include "Sponge.h"
 /**
 * Executes Lyra2 based on the G function from Blake2b. This version supports salts and passwords
 * whose combined length is smaller than the size of the memory matrix, (i.e., (nRows x nCols x b) bits,
 * where "b" is the underlying sponge's bitrate). In this implementation, the "basil" is composed by all
 * integer parameters (treated as type "unsigned int") in the order they are provided, plus the value
 * of nCols, (i.e., basil = kLen || pwdlen || saltlen || timeCost || nRows || nCols).
 *
 * @param K The derived key to be output by the algorithm
 * @param kLen Desired key length
 * @param pwd User password
 * @param pwdlen Password length
 * @param salt Salt
 * @param saltlen Salt length
 * @param timeCost Parameter to determine the processing time (T)
 * @param nRows Number or rows of the memory matrix (R)
 * @param nCols Number of columns of the memory matrix (C)
 *
 * @return 0 if the key is generated correctly; -1 if there is an error (usually due to lack of memory for allocation)
 */
 int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols)
 {
 	//============================= Basic variables ============================//
 	int64_t row = 2; //index of row to be processed
 	int64_t prev = 1; //index of prev (last row ever computed/modified)
 	int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
 	int64_t tau; //Time Loop iterator
 	int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
 	int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
 	int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
 	int64_t i; //auxiliary iteration counter
 	//==========================================================================/
 	//========== Initializing the Memory Matrix and pointers to it =============//
 	//Tries to allocate enough space for the whole memory matrix
 	i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES);
 	uint64_t *wholeMatrix = (uint64_t*) malloc((size_t) i);
 	if (wholeMatrix == NULL) {
 		return -1;
 	}
 	memset(wholeMatrix, 0, (size_t) i);
 	//Allocates pointers to each row of the matrix
 	uint64_t **memMatrix = malloc((size_t) nRows * sizeof(uint64_t*));
 	if (memMatrix == NULL) {
 		return -1;
 	}
 	//Places the pointers in the correct positions
 	uint64_t *ptrWord = wholeMatrix;
 	for (i = 0; i < (int64_t) nRows; i++) {
 		memMatrix[i] = ptrWord;
 		ptrWord += ROW_LEN_INT64;
 	}
 	//==========================================================================/
 	//============= Getting the password + salt + basil padded with 10*1 ===============//
 	//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
 	//but this ensures that the password copied locally will be overwritten as soon as possible
 	//First, we clean enough blocks for the password, salt, basil and padding
 	uint64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof (uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
 	byte *ptrByte = (byte*) wholeMatrix;
 	memset(ptrByte, 0, (size_t) nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES);
 	//Prepends the password
 	memcpy(ptrByte, pwd, (size_t) pwdlen);
 	ptrByte += pwdlen;
 	//Concatenates the salt
 	memcpy(ptrByte, salt, (size_t) saltlen);
 	ptrByte += saltlen;
 	//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
 	memcpy(ptrByte, &kLen, sizeof (uint64_t));
 	ptrByte += sizeof (uint64_t);
 	memcpy(ptrByte, &pwdlen, sizeof (uint64_t));
 	ptrByte += sizeof (uint64_t);
 	memcpy(ptrByte, &saltlen, sizeof (uint64_t));
 	ptrByte += sizeof (uint64_t);
 	memcpy(ptrByte, &timeCost, sizeof (uint64_t));
 	ptrByte += sizeof (uint64_t);
 	memcpy(ptrByte, &nRows, sizeof (uint64_t));
 	ptrByte += sizeof (uint64_t);
 	memcpy(ptrByte, &nCols, sizeof (uint64_t));
 	ptrByte += sizeof (uint64_t);
 	//Now comes the padding
 	*ptrByte = 0x80; //first byte of padding: right after the password
 	ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
 	ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
 	*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
 	//==========================================================================/
 	//======================= Initializing the Sponge State ====================//
 	//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
 	uint64_t *state = malloc(16 * sizeof (uint64_t));
 	if (state == NULL) {
 		return -1;
 	}
 	initState(state);
 	//==========================================================================/
 	//================================ Setup Phase =============================//
 	//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
 	ptrWord = wholeMatrix;
 	for (i = 0; i < (int64_t) nBlocksInput; i++) {
 		absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil)
 		ptrWord += BLOCK_LEN_BLAKE2_SAFE_BYTES; //goes to next block of pad(pwd || salt || basil)
 	}
 	//Initializes M[0] and M[1]
 	reducedSqueezeRow0(state, memMatrix[0]); //The locally copied password is most likely overwritten here
 	reducedDuplexRow1(state, memMatrix[0], memMatrix[1]);
 	do {
 		//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
 		reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]);
 		//updates the value of row* (deterministically picked during Setup))
 		rowa = (rowa + step) & (window - 1);
 		//update prev: it now points to the last row ever computed
 		prev = row;
 		//updates row: goes to the next row to be computed
 		row++;
 		//Checks if all rows in the window where visited.
 		if (rowa == 0) {
 		step = window + gap; //changes the step: approximately doubles its value
 		window *= 2; //doubles the size of the re-visitation window
 		gap = -gap; //inverts the modifier to the step
 	}
 	} while (row < (int64_t) nRows);
 	//==========================================================================/
 	//============================ Wandering Phase =============================//
 	row = 0; //Resets the visitation to the first row of the memory matrix
 	for (tau = 1; tau <= (int64_t) timeCost; tau++) {
 		//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
 		step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
 		do {
 			//Selects a pseudorandom index row*
 			//------------------------------------------------------------------------------------------
 			//rowa = ((unsigned int)state[0]) & (nRows-1);  //(USE THIS IF nRows IS A POWER OF 2)
 			rowa = ((uint64_t) (state[0])) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
 			//------------------------------------------------------------------------------------------
 			//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
 			reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]);
 			//update prev: it now points to the last row ever computed
 			prev = row;
 			//updates row: goes to the next row to be computed
 			//------------------------------------------------------------------------------------------
 			//row = (row + step) & (nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
 			row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
 			//------------------------------------------------------------------------------------------
 		} while (row != 0);
 	}
 	//==========================================================================/
 	//============================ Wrap-up Phase ===============================//
 	//Absorbs the last block of the memory matrix
 	absorbBlock(state, memMatrix[rowa]);
 	//Squeezes the key
 	squeeze(state, K, (size_t) kLen);
 	//==========================================================================/
 	//========================= Freeing the memory =============================//
 	free(memMatrix);
 	free(wholeMatrix);
 	//Wiping out the sponge's internal state before freeing it
 	memset(state, 0, 16 * sizeof (uint64_t));
 	free(state);
 	//==========================================================================/
 	return 0;
 }
--- a/lyra2/Lyra2.h
+++ b/lyra2/Lyra2.h
@ -0,0 +1,50 @@
 /**
 * Header file for the Lyra2 Password Hashing Scheme (PHS).
 *
 * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
 *
 * This software is hereby placed in the public domain.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef LYRA2_H_
 #define LYRA2_H_
 #include <stdint.h>
 typedef unsigned char byte;
 //Block length required so Blake2's Initialization Vector (IV) is not overwritten (THIS SHOULD NOT BE MODIFIED)
 #define BLOCK_LEN_BLAKE2_SAFE_INT64 8                                   //512 bits (=64 bytes, =8 uint64_t)
 #define BLOCK_LEN_BLAKE2_SAFE_BYTES (BLOCK_LEN_BLAKE2_SAFE_INT64 * 8)   //same as above, in bytes
 #ifdef BLOCK_LEN_BITS
        #define BLOCK_LEN_INT64 (BLOCK_LEN_BITS/64)      //Block length: 768 bits (=96 bytes, =12 uint64_t)
        #define BLOCK_LEN_BYTES (BLOCK_LEN_BITS/8)       //Block length, in bytes
 #else   //default block lenght: 768 bits
        #define BLOCK_LEN_INT64 12                       //Block length: 768 bits (=96 bytes, =12 uint64_t)
        #define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8)    //Block length, in bytes
 #endif
 #ifndef N_COLS
        #define N_COLS 8                                //Number of columns in the memory matrix: fixed to 64 by default
 #endif
 #define ROW_LEN_INT64 (BLOCK_LEN_INT64 * N_COLS) //Total length of a row: N_COLS blocks
 #define ROW_LEN_BYTES (ROW_LEN_INT64 * 8)        //Number of bytes per row
 int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols);
 #endif /* LYRA2_H_ */
--- a/lyra2/Sponge.c
+++ b/lyra2/Sponge.c
@ -0,0 +1,755 @@
 /**
 * A simple implementation of Blake2b's internal permutation
 * in the form of a sponge.
 *
 * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
 *
 * This software is hereby placed in the public domain.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #include <string.h>
 #include <stdio.h>
 #include <time.h>
 #include "Sponge.h"
 #include "Lyra2.h"
 /**
 * Initializes the Sponge State. The first 512 bits are set to zeros and the remainder
 * receive Blake2b's IV as per Blake2b's specification. <b>Note:</b> Even though sponges
 * typically have their internal state initialized with zeros, Blake2b's G function
 * has a fixed point: if the internal state and message are both filled with zeros. the
 * resulting permutation will always be a block filled with zeros; this happens because
 * Blake2b does not use the constants originally employed in Blake2 inside its G function,
 * relying on the IV for avoiding possible fixed points.
 *
 * @param state         The 1024-bit array to be initialized
 */
 void initState(uint64_t state[/*16*/]) {
    //First 512 bis are zeros
    memset(state, 0, 64);
    //Remainder BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved to the IV
    state[8] = blake2b_IV[0];
    state[9] = blake2b_IV[1];
    state[10] = blake2b_IV[2];
    state[11] = blake2b_IV[3];
    state[12] = blake2b_IV[4];
    state[13] = blake2b_IV[5];
    state[14] = blake2b_IV[6];
    state[15] = blake2b_IV[7];
 }
 /**
 * Execute Blake2b's G function, with all 12 rounds.
 *
 * @param v     A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function
 */
 __inline static void blake2bLyra(uint64_t *v) {
    ROUND_LYRA(0);
    ROUND_LYRA(1);
    ROUND_LYRA(2);
    ROUND_LYRA(3);
    ROUND_LYRA(4);
    ROUND_LYRA(5);
    ROUND_LYRA(6);
    ROUND_LYRA(7);
    ROUND_LYRA(8);
    ROUND_LYRA(9);
    ROUND_LYRA(10);
    ROUND_LYRA(11);
 }
 /**
 * Executes a reduced version of Blake2b's G function with only one round
 * @param v     A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function
 */
 __inline static void reducedBlake2bLyra(uint64_t *v) {
    ROUND_LYRA(0);
 }
 /**
 * Performs a squeeze operation, using Blake2b's G function as the
 * internal permutation
 *
 * @param state      The current state of the sponge
 * @param out        Array that will receive the data squeezed
 * @param len        The number of bytes to be squeezed into the "out" array
 */
 void squeeze(uint64_t *state, byte *out, unsigned int len) {
    int fullBlocks = len / BLOCK_LEN_BYTES;
    byte *ptr = out;
    int i;
    //Squeezes full blocks
    for (i = 0; i < fullBlocks; i++) {
        memcpy(ptr, state, BLOCK_LEN_BYTES);
        blake2bLyra(state);
        ptr += BLOCK_LEN_BYTES;
    }
    //Squeezes remaining bytes
    memcpy(ptr, state, (len % BLOCK_LEN_BYTES));
 }
 /**
 * Performs an absorb operation for a single block (BLOCK_LEN_INT64 words
 * of type uint64_t), using Blake2b's G function as the internal permutation
 *
 * @param state The current state of the sponge
 * @param in    The block to be absorbed (BLOCK_LEN_INT64 words)
 */
 void absorbBlock(uint64_t *state, const uint64_t *in) {
    //XORs the first BLOCK_LEN_INT64 words of "in" with the current state
    state[0] ^= in[0];
    state[1] ^= in[1];
    state[2] ^= in[2];
    state[3] ^= in[3];
    state[4] ^= in[4];
    state[5] ^= in[5];
    state[6] ^= in[6];
    state[7] ^= in[7];
    state[8] ^= in[8];
    state[9] ^= in[9];
    state[10] ^= in[10];
    state[11] ^= in[11];
    //Applies the transformation f to the sponge's state
    blake2bLyra(state);
 }
 /**
 * Performs an absorb operation for a single block (BLOCK_LEN_BLAKE2_SAFE_INT64
 * words of type uint64_t), using Blake2b's G function as the internal permutation
 *
 * @param state The current state of the sponge
 * @param in    The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words)
 */
 void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) {
    //XORs the first BLOCK_LEN_BLAKE2_SAFE_INT64 words of "in" with the current state
    state[0] ^= in[0];
    state[1] ^= in[1];
    state[2] ^= in[2];
    state[3] ^= in[3];
    state[4] ^= in[4];
    state[5] ^= in[5];
    state[6] ^= in[6];
    state[7] ^= in[7];
    //Applies the transformation f to the sponge's state
    blake2bLyra(state);
 /*
    for(int i = 0; i<16; i++) {
        printf(" final state %d %08x %08x in %08x %08x\n", i, (uint32_t)(state[i] & 0xFFFFFFFFULL), (uint32_t)(state[i] >> 32),
        (uint32_t)(in[i] & 0xFFFFFFFFULL), (uint32_t)(in[i] >> 32));
    }
 */
 }
 /**
 * Performs a reduced squeeze operation for a single row, from the highest to
 * the lowest index, using the reduced-round Blake2b's G function as the
 * internal permutation
 *
 * @param state     The current state of the sponge
 * @param rowOut    Row to receive the data squeezed
 */
 void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut) {
    uint64_t* ptrWord = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to M[0][C-1]
    int i;
    //M[row][C-1-col] = H.reduced_squeeze()
    for (i = 0; i < N_COLS; i++) {
 	ptrWord[0] = state[0];
 	ptrWord[1] = state[1];
 	ptrWord[2] = state[2];
 	ptrWord[3] = state[3];
 	ptrWord[4] = state[4];
 	ptrWord[5] = state[5];
 	ptrWord[6] = state[6];
 	ptrWord[7] = state[7];
 	ptrWord[8] = state[8];
 	ptrWord[9] = state[9];
 	ptrWord[10] = state[10];
 	ptrWord[11] = state[11];
 	/*
 for (int i = 0; i<12; i++) {
 		printf(" after reducedSqueezeRow0 %d %08x %08x in %08x %08x\n", i, (uint32_t)(ptrWord[i] & 0xFFFFFFFFULL), (uint32_t)(ptrWord[i] >> 32),
 			(uint32_t)(state[i] & 0xFFFFFFFFULL), (uint32_t)(state[i] >> 32));
 	}
 */
 	//Goes to next block (column) that will receive the squeezed data
 	ptrWord -= BLOCK_LEN_INT64;
 	//Applies the reduced-round transformation f to the sponge's state
 	reducedBlake2bLyra(state);
    }
 }
 /**
 * Performs a reduced duplex operation for a single row, from the highest to
 * the lowest index, using the reduced-round Blake2b's G function as the
 * internal permutation
 *
 * @param state		The current state of the sponge
 * @param rowIn		Row to feed the sponge
 * @param rowOut	Row to receive the sponge's output
 */
 void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut) {
    uint64_t* ptrWordIn = rowIn;				//In Lyra2: pointer to prev
    uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
    int i;
    for (i = 0; i < N_COLS; i++) {
 	//Absorbing "M[prev][col]"
 	state[0]  ^= (ptrWordIn[0]);
 	state[1]  ^= (ptrWordIn[1]);
 	state[2]  ^= (ptrWordIn[2]);
 	state[3]  ^= (ptrWordIn[3]);
 	state[4]  ^= (ptrWordIn[4]);
 	state[5]  ^= (ptrWordIn[5]);
 	state[6]  ^= (ptrWordIn[6]);
 	state[7]  ^= (ptrWordIn[7]);
 	state[8]  ^= (ptrWordIn[8]);
 	state[9]  ^= (ptrWordIn[9]);
 	state[10] ^= (ptrWordIn[10]);
 	state[11] ^= (ptrWordIn[11]);
 	//Applies the reduced-round transformation f to the sponge's state
 	reducedBlake2bLyra(state);
 	//M[row][C-1-col] = M[prev][col] XOR rand
 	ptrWordOut[0] = ptrWordIn[0]  ^ state[0];
 	ptrWordOut[1] = ptrWordIn[1]  ^ state[1];
 	ptrWordOut[2] = ptrWordIn[2]  ^ state[2];
 	ptrWordOut[3] = ptrWordIn[3]  ^ state[3];
 	ptrWordOut[4] = ptrWordIn[4]  ^ state[4];
 	ptrWordOut[5] = ptrWordIn[5]  ^ state[5];
 	ptrWordOut[6] = ptrWordIn[6]  ^ state[6];
 	ptrWordOut[7] = ptrWordIn[7]  ^ state[7];
 	ptrWordOut[8] = ptrWordIn[8]  ^ state[8];
 	ptrWordOut[9] = ptrWordIn[9]  ^ state[9];
 	ptrWordOut[10] = ptrWordIn[10] ^ state[10];
 	ptrWordOut[11] = ptrWordIn[11] ^ state[11];
 	//Input: next column (i.e., next block in sequence)
 	ptrWordIn += BLOCK_LEN_INT64;
 	//Output: goes to previous column
 	ptrWordOut -= BLOCK_LEN_INT64;
    }
 }
 /**
 * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e.,
 * the wordwise addition of two columns, ignoring carries between words). The
 * output of this operation, "rand", is then used to make
 * "M[rowOut][(N_COLS-1)-col] = M[rowIn][col] XOR rand" and
 * "M[rowInOut][col] =  M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit
 * rotation to the left and N_COLS is a system parameter.
 *
 * @param state          The current state of the sponge
 * @param rowIn          Row used only as input
 * @param rowInOut       Row used as input and to receive output after rotation
 * @param rowOut         Row receiving the output
 *
 */
 void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) {
    uint64_t* ptrWordIn = rowIn;				//In Lyra2: pointer to prev
    uint64_t* ptrWordInOut = rowInOut;				//In Lyra2: pointer to row*
    uint64_t* ptrWordOut = rowOut + (N_COLS-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
    int i;
    for (i = 0; i < N_COLS; i++) {
 	//Absorbing "M[prev] [+] M[row*]"
 	state[0]  ^= (ptrWordIn[0]  + ptrWordInOut[0]);
 	state[1]  ^= (ptrWordIn[1]  + ptrWordInOut[1]);
 	state[2]  ^= (ptrWordIn[2]  + ptrWordInOut[2]);
 	state[3]  ^= (ptrWordIn[3]  + ptrWordInOut[3]);
 	state[4]  ^= (ptrWordIn[4]  + ptrWordInOut[4]);
 	state[5]  ^= (ptrWordIn[5]  + ptrWordInOut[5]);
 	state[6]  ^= (ptrWordIn[6]  + ptrWordInOut[6]);
 	state[7]  ^= (ptrWordIn[7]  + ptrWordInOut[7]);
 	state[8]  ^= (ptrWordIn[8]  + ptrWordInOut[8]);
 	state[9]  ^= (ptrWordIn[9]  + ptrWordInOut[9]);
 	state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]);
 	state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]);
 	//Applies the reduced-round transformation f to the sponge's state
 	reducedBlake2bLyra(state);
 	//M[row][col] = M[prev][col] XOR rand
 	ptrWordOut[0] = ptrWordIn[0]  ^ state[0];
 	ptrWordOut[1] = ptrWordIn[1]  ^ state[1];
 	ptrWordOut[2] = ptrWordIn[2]  ^ state[2];
 	ptrWordOut[3] = ptrWordIn[3]  ^ state[3];
 	ptrWordOut[4] = ptrWordIn[4]  ^ state[4];
 	ptrWordOut[5] = ptrWordIn[5]  ^ state[5];
 	ptrWordOut[6] = ptrWordIn[6]  ^ state[6];
 	ptrWordOut[7] = ptrWordIn[7]  ^ state[7];
 	ptrWordOut[8] = ptrWordIn[8]  ^ state[8];
 	ptrWordOut[9] = ptrWordIn[9]  ^ state[9];
 	ptrWordOut[10] = ptrWordIn[10] ^ state[10];
 	ptrWordOut[11] = ptrWordIn[11] ^ state[11];
 	//M[row*][col] = M[row*][col] XOR rotW(rand)
 	ptrWordInOut[0]  ^= state[11];
 	ptrWordInOut[1]  ^= state[0];
 	ptrWordInOut[2]  ^= state[1];
 	ptrWordInOut[3]  ^= state[2];
 	ptrWordInOut[4]  ^= state[3];
 	ptrWordInOut[5]  ^= state[4];
 	ptrWordInOut[6]  ^= state[5];
 	ptrWordInOut[7]  ^= state[6];
 	ptrWordInOut[8]  ^= state[7];
 	ptrWordInOut[9]  ^= state[8];
 	ptrWordInOut[10] ^= state[9];
 	ptrWordInOut[11] ^= state[10];
 	//Inputs: next column (i.e., next block in sequence)
 	ptrWordInOut += BLOCK_LEN_INT64;
 	ptrWordIn += BLOCK_LEN_INT64;
 	//Output: goes to previous column
 	ptrWordOut -= BLOCK_LEN_INT64;
    }
 }
 /**
 * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e.,
 * the wordwise addition of two columns, ignoring carries between words). The
 * output of this operation, "rand", is then used to make
 * "M[rowOut][col] = M[rowOut][col] XOR rand" and
 * "M[rowInOut][col] =  M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit
 * rotation to the left.
 *
 * @param state          The current state of the sponge
 * @param rowIn          Row used only as input
 * @param rowInOut       Row used as input and to receive output after rotation
 * @param rowOut         Row receiving the output
 *
 */
 void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) {
    uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
    uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
    uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row
    int i;
    for (i = 0; i < N_COLS; i++) {
 	//Absorbing "M[prev] [+] M[row*]"
 	state[0]  ^= (ptrWordIn[0]  + ptrWordInOut[0]);
 	state[1]  ^= (ptrWordIn[1]  + ptrWordInOut[1]);
 	state[2]  ^= (ptrWordIn[2]  + ptrWordInOut[2]);
 	state[3]  ^= (ptrWordIn[3]  + ptrWordInOut[3]);
 	state[4]  ^= (ptrWordIn[4]  + ptrWordInOut[4]);
 	state[5]  ^= (ptrWordIn[5]  + ptrWordInOut[5]);
 	state[6]  ^= (ptrWordIn[6]  + ptrWordInOut[6]);
 	state[7]  ^= (ptrWordIn[7]  + ptrWordInOut[7]);
 	state[8]  ^= (ptrWordIn[8]  + ptrWordInOut[8]);
 	state[9]  ^= (ptrWordIn[9]  + ptrWordInOut[9]);
 	state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]);
 	state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]);
 	//Applies the reduced-round transformation f to the sponge's state
 	reducedBlake2bLyra(state);
 	//M[rowOut][col] = M[rowOut][col] XOR rand
 	ptrWordOut[0] ^= state[0];
 	ptrWordOut[1] ^= state[1];
 	ptrWordOut[2] ^= state[2];
 	ptrWordOut[3] ^= state[3];
 	ptrWordOut[4] ^= state[4];
 	ptrWordOut[5] ^= state[5];
 	ptrWordOut[6] ^= state[6];
 	ptrWordOut[7] ^= state[7];
 	ptrWordOut[8] ^= state[8];
 	ptrWordOut[9] ^= state[9];
 	ptrWordOut[10] ^= state[10];
 	ptrWordOut[11] ^= state[11];
 	//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
 	ptrWordInOut[0] ^= state[11];
 	ptrWordInOut[1] ^= state[0];
 	ptrWordInOut[2] ^= state[1];
 	ptrWordInOut[3] ^= state[2];
 	ptrWordInOut[4] ^= state[3];
 	ptrWordInOut[5] ^= state[4];
 	ptrWordInOut[6] ^= state[5];
 	ptrWordInOut[7] ^= state[6];
 	ptrWordInOut[8] ^= state[7];
 	ptrWordInOut[9] ^= state[8];
 	ptrWordInOut[10] ^= state[9];
 	ptrWordInOut[11] ^= state[10];
 	//Goes to next block
 	ptrWordOut += BLOCK_LEN_INT64;
 	ptrWordInOut += BLOCK_LEN_INT64;
 	ptrWordIn += BLOCK_LEN_INT64;
    }
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
 /**
 * Performs a duplex operation over "M[rowInOut] [+] M[rowIn]", writing the output "rand"
 * on M[rowOut] and making "M[rowInOut] =  M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit
 * rotation to the left.
 *
 * @param state          The current state of the sponge
 * @param rowIn          Row used only as input
 * @param rowInOut       Row used as input and to receive output after rotation
 * @param rowOut         Row receiving the output
 *
 */
 /*
 inline void reducedDuplexRowSetupOLD(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) {
    uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
    uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
    uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row
    int i;
    for (i = 0; i < N_COLS; i++) {
 	//Absorbing "M[rowInOut] XOR M[rowIn]"
 	state[0] ^= ptrWordInOut[0] ^ ptrWordIn[0];
 	state[1] ^= ptrWordInOut[1] ^ ptrWordIn[1];
 	state[2] ^= ptrWordInOut[2] ^ ptrWordIn[2];
 	state[3] ^= ptrWordInOut[3] ^ ptrWordIn[3];
 	state[4] ^= ptrWordInOut[4] ^ ptrWordIn[4];
 	state[5] ^= ptrWordInOut[5] ^ ptrWordIn[5];
 	state[6] ^= ptrWordInOut[6] ^ ptrWordIn[6];
 	state[7] ^= ptrWordInOut[7] ^ ptrWordIn[7];
 	state[8] ^= ptrWordInOut[8] ^ ptrWordIn[8];
 	state[9] ^= ptrWordInOut[9] ^ ptrWordIn[9];
 	state[10] ^= ptrWordInOut[10] ^ ptrWordIn[10];
 	state[11] ^= ptrWordInOut[11] ^ ptrWordIn[11];
 	//Applies the reduced-round transformation f to the sponge's state
 	reducedBlake2bLyra(state);
 	//M[row][col] = rand
 	ptrWordOut[0] = state[0];
 	ptrWordOut[1] = state[1];
 	ptrWordOut[2] = state[2];
 	ptrWordOut[3] = state[3];
 	ptrWordOut[4] = state[4];
 	ptrWordOut[5] = state[5];
 	ptrWordOut[6] = state[6];
 	ptrWordOut[7] = state[7];
 	ptrWordOut[8] = state[8];
 	ptrWordOut[9] = state[9];
 	ptrWordOut[10] = state[10];
 	ptrWordOut[11] = state[11];
 	//M[row*][col] = M[row*][col] XOR rotW(rand)
 	ptrWordInOut[0] ^= state[10];
 	ptrWordInOut[1] ^= state[11];
 	ptrWordInOut[2] ^= state[0];
 	ptrWordInOut[3] ^= state[1];
 	ptrWordInOut[4] ^= state[2];
 	ptrWordInOut[5] ^= state[3];
 	ptrWordInOut[6] ^= state[4];
 	ptrWordInOut[7] ^= state[5];
 	ptrWordInOut[8] ^= state[6];
 	ptrWordInOut[9] ^= state[7];
 	ptrWordInOut[10] ^= state[8];
 	ptrWordInOut[11] ^= state[9];
 	//Goes to next column (i.e., next block in sequence)
 	ptrWordInOut += BLOCK_LEN_INT64;
 	ptrWordIn += BLOCK_LEN_INT64;
 	ptrWordOut += BLOCK_LEN_INT64;
    }
 }
 */
 /**
 * Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", writing the output "rand"
 * on M[rowOut] and making "M[rowInOut] =  M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit
 * rotation to the left.
 *
 * @param state          The current state of the sponge
 * @param rowIn          Row used only as input
 * @param rowInOut       Row used as input and to receive output after rotation
 * @param rowOut         Row receiving the output
 *
 */
 /*
 inline void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) {
    uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
    uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
    uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row
    int i;
    for (i = 0; i < N_COLS; i++) {
 	//Absorbing "M[rowInOut] XOR M[rowIn]"
 	state[0] ^= ptrWordInOut[0] + ptrWordIn[0];
 	state[1] ^= ptrWordInOut[1] + ptrWordIn[1];
 	state[2] ^= ptrWordInOut[2] + ptrWordIn[2];
 	state[3] ^= ptrWordInOut[3] + ptrWordIn[3];
 	state[4] ^= ptrWordInOut[4] + ptrWordIn[4];
 	state[5] ^= ptrWordInOut[5] + ptrWordIn[5];
 	state[6] ^= ptrWordInOut[6] + ptrWordIn[6];
 	state[7] ^= ptrWordInOut[7] + ptrWordIn[7];
 	state[8] ^= ptrWordInOut[8] + ptrWordIn[8];
 	state[9] ^= ptrWordInOut[9] + ptrWordIn[9];
 	state[10] ^= ptrWordInOut[10] + ptrWordIn[10];
 	state[11] ^= ptrWordInOut[11] + ptrWordIn[11];
 	//Applies the reduced-round transformation f to the sponge's state
 	reducedBlake2bLyra(state);
 	//M[row*][col] = M[row*][col] XOR rotW(rand)
 	ptrWordInOut[0] ^= state[10];
 	ptrWordInOut[1] ^= state[11];
 	ptrWordInOut[2] ^= state[0];
 	ptrWordInOut[3] ^= state[1];
 	ptrWordInOut[4] ^= state[2];
 	ptrWordInOut[5] ^= state[3];
 	ptrWordInOut[6] ^= state[4];
 	ptrWordInOut[7] ^= state[5];
 	ptrWordInOut[8] ^= state[6];
 	ptrWordInOut[9] ^= state[7];
 	ptrWordInOut[10] ^= state[8];
 	ptrWordInOut[11] ^= state[9];
 	//M[row][col] = rand
 	ptrWordOut[0] = state[0] ^ ptrWordIn[0];
 	ptrWordOut[1] = state[1] ^ ptrWordIn[1];
 	ptrWordOut[2] = state[2] ^ ptrWordIn[2];
 	ptrWordOut[3] = state[3] ^ ptrWordIn[3];
 	ptrWordOut[4] = state[4] ^ ptrWordIn[4];
 	ptrWordOut[5] = state[5] ^ ptrWordIn[5];
 	ptrWordOut[6] = state[6] ^ ptrWordIn[6];
 	ptrWordOut[7] = state[7] ^ ptrWordIn[7];
 	ptrWordOut[8] = state[8] ^ ptrWordIn[8];
 	ptrWordOut[9] = state[9] ^ ptrWordIn[9];
 	ptrWordOut[10] = state[10] ^ ptrWordIn[10];
 	ptrWordOut[11] = state[11] ^ ptrWordIn[11];
 	//Goes to next column (i.e., next block in sequence)
 	ptrWordInOut += BLOCK_LEN_INT64;
 	ptrWordIn += BLOCK_LEN_INT64;
 	ptrWordOut += BLOCK_LEN_INT64;
    }
 }
 */
 /**
 * Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", writing the output "rand"
 * on M[rowOut] and making "M[rowInOut] =  M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit
 * rotation to the left.
 *
 * @param state          The current state of the sponge
 * @param rowIn          Row used only as input
 * @param rowInOut       Row used as input and to receive output after rotation
 * @param rowOut         Row receiving the output
 *
 */
 /*
 inline void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) {
    uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
    uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
    uint64_t* ptrWordOut = rowOut;
    int i;
    for (i = 0; i < N_COLS / 2; i++) {
 	//Absorbing "M[rowInOut] XOR M[rowIn]"
 	state[0] ^= ptrWordInOut[0] + ptrWordIn[0];
 	state[1] ^= ptrWordInOut[1] + ptrWordIn[1];
 	state[2] ^= ptrWordInOut[2] + ptrWordIn[2];
 	state[3] ^= ptrWordInOut[3] + ptrWordIn[3];
 	state[4] ^= ptrWordInOut[4] + ptrWordIn[4];
 	state[5] ^= ptrWordInOut[5] + ptrWordIn[5];
 	state[6] ^= ptrWordInOut[6] + ptrWordIn[6];
 	state[7] ^= ptrWordInOut[7] + ptrWordIn[7];
 	state[8] ^= ptrWordInOut[8] + ptrWordIn[8];
 	state[9] ^= ptrWordInOut[9] + ptrWordIn[9];
 	state[10] ^= ptrWordInOut[10] + ptrWordIn[10];
 	state[11] ^= ptrWordInOut[11] + ptrWordIn[11];
 	//Applies the reduced-round transformation f to the sponge's state
 	reducedBlake2bLyra(state);
 	//M[row*][col] = M[row*][col] XOR rotW(rand)
 	ptrWordInOut[0] ^= state[10];
 	ptrWordInOut[1] ^= state[11];
 	ptrWordInOut[2] ^= state[0];
 	ptrWordInOut[3] ^= state[1];
 	ptrWordInOut[4] ^= state[2];
 	ptrWordInOut[5] ^= state[3];
 	ptrWordInOut[6] ^= state[4];
 	ptrWordInOut[7] ^= state[5];
 	ptrWordInOut[8] ^= state[6];
 	ptrWordInOut[9] ^= state[7];
 	ptrWordInOut[10] ^= state[8];
 	ptrWordInOut[11] ^= state[9];
 	//M[row][col] = rand
 	ptrWordOut[0] = state[0] ^ ptrWordIn[0];
 	ptrWordOut[1] = state[1] ^ ptrWordIn[1];
 	ptrWordOut[2] = state[2] ^ ptrWordIn[2];
 	ptrWordOut[3] = state[3] ^ ptrWordIn[3];
 	ptrWordOut[4] = state[4] ^ ptrWordIn[4];
 	ptrWordOut[5] = state[5] ^ ptrWordIn[5];
 	ptrWordOut[6] = state[6] ^ ptrWordIn[6];
 	ptrWordOut[7] = state[7] ^ ptrWordIn[7];
 	ptrWordOut[8] = state[8] ^ ptrWordIn[8];
 	ptrWordOut[9] = state[9] ^ ptrWordIn[9];
 	ptrWordOut[10] = state[10] ^ ptrWordIn[10];
 	ptrWordOut[11] = state[11] ^ ptrWordIn[11];
 	//Goes to next column (i.e., next block in sequence)
 	ptrWordInOut += BLOCK_LEN_INT64;
 	ptrWordIn += BLOCK_LEN_INT64;
 	ptrWordOut += 2 * BLOCK_LEN_INT64;
    }
    ptrWordOut =  rowOut + BLOCK_LEN_INT64;
    for (i = 0; i < N_COLS / 2; i++) {
 	//Absorbing "M[rowInOut] XOR M[rowIn]"
 	state[0] ^= ptrWordInOut[0] + ptrWordIn[0];
 	state[1] ^= ptrWordInOut[1] + ptrWordIn[1];
 	state[2] ^= ptrWordInOut[2] + ptrWordIn[2];
 	state[3] ^= ptrWordInOut[3] + ptrWordIn[3];
 	state[4] ^= ptrWordInOut[4] + ptrWordIn[4];
 	state[5] ^= ptrWordInOut[5] + ptrWordIn[5];
 	state[6] ^= ptrWordInOut[6] + ptrWordIn[6];
 	state[7] ^= ptrWordInOut[7] + ptrWordIn[7];
 	state[8] ^= ptrWordInOut[8] + ptrWordIn[8];
 	state[9] ^= ptrWordInOut[9] + ptrWordIn[9];
 	state[10] ^= ptrWordInOut[10] + ptrWordIn[10];
 	state[11] ^= ptrWordInOut[11] + ptrWordIn[11];
 	//Applies the reduced-round transformation f to the sponge's state
 	reducedBlake2bLyra(state);
 	//M[row*][col] = M[row*][col] XOR rotW(rand)
 	ptrWordInOut[0] ^= state[10];
 	ptrWordInOut[1] ^= state[11];
 	ptrWordInOut[2] ^= state[0];
 	ptrWordInOut[3] ^= state[1];
 	ptrWordInOut[4] ^= state[2];
 	ptrWordInOut[5] ^= state[3];
 	ptrWordInOut[6] ^= state[4];
 	ptrWordInOut[7] ^= state[5];
 	ptrWordInOut[8] ^= state[6];
 	ptrWordInOut[9] ^= state[7];
 	ptrWordInOut[10] ^= state[8];
 	ptrWordInOut[11] ^= state[9];
 	//M[row][col] = rand
 	ptrWordOut[0] = state[0] ^ ptrWordIn[0];
 	ptrWordOut[1] = state[1] ^ ptrWordIn[1];
 	ptrWordOut[2] = state[2] ^ ptrWordIn[2];
 	ptrWordOut[3] = state[3] ^ ptrWordIn[3];
 	ptrWordOut[4] = state[4] ^ ptrWordIn[4];
 	ptrWordOut[5] = state[5] ^ ptrWordIn[5];
 	ptrWordOut[6] = state[6] ^ ptrWordIn[6];
 	ptrWordOut[7] = state[7] ^ ptrWordIn[7];
 	ptrWordOut[8] = state[8] ^ ptrWordIn[8];
 	ptrWordOut[9] = state[9] ^ ptrWordIn[9];
 	ptrWordOut[10] = state[10] ^ ptrWordIn[10];
 	ptrWordOut[11] = state[11] ^ ptrWordIn[11];
 	//Goes to next column (i.e., next block in sequence)
 	ptrWordInOut += BLOCK_LEN_INT64;
 	ptrWordIn += BLOCK_LEN_INT64;
 	ptrWordOut += 2 * BLOCK_LEN_INT64;
    }
 }
 */
 /**
 * Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", using the output "rand"
 * to make "M[rowOut][col] = M[rowOut][col] XOR rand" and "M[rowInOut] = M[rowInOut] XOR rotW(rand)",
 * where rotW is a 64-bit rotation to the left.
 *
 * @param state          The current state of the sponge
 * @param rowIn          Row used only as input
 * @param rowInOut       Row used as input and to receive output after rotation
 * @param rowOut         Row receiving the output
 *
 */
 /*
 inline void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) {
    uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
    uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
    uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row
    int i;
    for (i = 0; i < N_COLS; i++) {
 	//Absorbing "M[rowInOut] XOR M[rowIn]"
 	state[0] ^= ptrWordInOut[0] + ptrWordIn[0];
 	state[1] ^= ptrWordInOut[1] + ptrWordIn[1];
 	state[2] ^= ptrWordInOut[2] + ptrWordIn[2];
 	state[3] ^= ptrWordInOut[3] + ptrWordIn[3];
 	state[4] ^= ptrWordInOut[4] + ptrWordIn[4];
 	state[5] ^= ptrWordInOut[5] + ptrWordIn[5];
 	state[6] ^= ptrWordInOut[6] + ptrWordIn[6];
 	state[7] ^= ptrWordInOut[7] + ptrWordIn[7];
 	state[8] ^= ptrWordInOut[8] + ptrWordIn[8];
 	state[9] ^= ptrWordInOut[9] + ptrWordIn[9];
 	state[10] ^= ptrWordInOut[10] + ptrWordIn[10];
 	state[11] ^= ptrWordInOut[11] + ptrWordIn[11];
 	//Applies the reduced-round transformation f to the sponge's state
 	reducedBlake2bLyra(state);
 	//M[rowOut][col] = M[rowOut][col] XOR rand
 	ptrWordOut[0] ^= state[0];
 	ptrWordOut[1] ^= state[1];
 	ptrWordOut[2] ^= state[2];
 	ptrWordOut[3] ^= state[3];
 	ptrWordOut[4] ^= state[4];
 	ptrWordOut[5] ^= state[5];
 	ptrWordOut[6] ^= state[6];
 	ptrWordOut[7] ^= state[7];
 	ptrWordOut[8] ^= state[8];
 	ptrWordOut[9] ^= state[9];
 	ptrWordOut[10] ^= state[10];
 	ptrWordOut[11] ^= state[11];
 	//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
 	//Goes to next block
 	ptrWordOut += BLOCK_LEN_INT64;
 	ptrWordInOut += BLOCK_LEN_INT64;
 	ptrWordIn += BLOCK_LEN_INT64;
    }
 }
 */
 /**
 Prints an array of unsigned chars
 */
 void printArray(unsigned char *array, unsigned int size, char *name) {
 	int i;
 	printf("%s: ", name);
 	for (i = 0; i < size; i++) {
 		printf("%2x|", array[i]);
 	}
 	printf("\n");
 }
 ////////////////////////////////////////////////////////////////////////////////////////////////
--- a/lyra2/Sponge.h
+++ b/lyra2/Sponge.h
@ -0,0 +1,108 @@
 /**
 * Header file for Blake2b's internal permutation in the form of a sponge.
 * This code is based on the original Blake2b's implementation provided by
 * Samuel Neves (https://blake2.net/)
 *
 * Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
 *
 * This software is hereby placed in the public domain.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
 * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
 * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #ifndef SPONGE_H_
 #define SPONGE_H_
 #include <stdint.h>
 #if defined(__GNUC__)
 #define ALIGN __attribute__ ((aligned(32)))
 #elif defined(_MSC_VER)
 #define ALIGN __declspec(align(32))
 #else
 #define ALIGN
 #endif
 /*Blake2b IV Array*/
 static const uint64_t blake2b_IV[8] =
 {
  0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
  0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
  0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
  0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
 };
 /*Blake2b's rotation*/
 static __inline uint64_t rotr64( const uint64_t w, const unsigned c ){
    return ( w >> c ) | ( w << ( 64 - c ) );
 }
 /*Blake2b's G function*/
 #define G(r,i,a,b,c,d) \
  do { \
    a = a + b; \
    d = rotr64(d ^ a, 32); \
    c = c + d; \
    b = rotr64(b ^ c, 24); \
    a = a + b; \
    d = rotr64(d ^ a, 16); \
    c = c + d; \
    b = rotr64(b ^ c, 63); \
  } while(0)
 /*One Round of the Blake2b's compression function*/
 #define ROUND_LYRA(r)  \
    G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
    G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
    G(r,2,v[ 2],v[ 6],v[10],v[14]); \
    G(r,3,v[ 3],v[ 7],v[11],v[15]); \
    G(r,4,v[ 0],v[ 5],v[10],v[15]); \
    G(r,5,v[ 1],v[ 6],v[11],v[12]); \
    G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
    G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
 //---- Housekeeping
 void initState(uint64_t state[/*16*/]);
 //---- Squeezes
 void squeeze(uint64_t *state, unsigned char *out, unsigned int len);
 void reducedSqueezeRow0(uint64_t* state, uint64_t* row);
 //---- Absorbs
 void absorbBlock(uint64_t *state, const uint64_t *in);
 void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in);
 //---- Duplexes
 void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut);
 void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
 void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
 //---- Misc
 void printArray(unsigned char *array, unsigned int size, char *name);
 ////////////////////////////////////////////////////////////////////////////////////////////////
 ////TESTS////
 //void reducedDuplexRowc(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
 //void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
 //void reducedDuplexRowSetupv4(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn2, uint64_t *rowOut1, uint64_t *rowOut2);
 //void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
 //void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
 //void reducedDuplexRowSetupv5d(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
 /////////////
 #endif /* SPONGE_H_ */
--- a/lyra2/cuda_lyra2.cu
+++ b/lyra2/cuda_lyra2.cu
@ -0,0 +1,536 @@
 #include <memory.h>
 #include "cuda_helper.h"
 static __constant__ uint2 blake2b_IV[8] = {
 	{ 0xf3bcc908, 0x6a09e667 },
 	{ 0x84caa73b, 0xbb67ae85 },
 	{ 0xfe94f82b, 0x3c6ef372 },
 	{ 0x5f1d36f1, 0xa54ff53a },
 	{ 0xade682d1, 0x510e527f },
 	{ 0x2b3e6c1f, 0x9b05688c },
 	{ 0xfb41bd6b, 0x1f83d9ab },
 	{ 0x137e2179, 0x5be0cd19 }
 };
 // data: 0-4 outputhash 4-8 outputhash 8-16 basil
 #define reduceDuplexRowSetup(rowIn, rowInOut, rowOut) { \
 	for (int i = 0; i < 8; i++) { \
 		for (int j = 0; j < 12; j++) \
 			state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \
 		round_lyra_v35(state); \
 		for (int j = 0; j < 12; j++) \
 			Matrix[j + 84 - 12 * i][rowOut] = Matrix[12 * i + j][rowIn] ^ state[j]; \
 		Matrix[0 + 12 * i][rowInOut] ^= state[11]; \
 		Matrix[1 + 12 * i][rowInOut] ^= state[0]; \
 		Matrix[2 + 12 * i][rowInOut] ^= state[1]; \
 		Matrix[3 + 12 * i][rowInOut] ^= state[2]; \
 		Matrix[4 + 12 * i][rowInOut] ^= state[3]; \
 		Matrix[5 + 12 * i][rowInOut] ^= state[4]; \
 		Matrix[6 + 12 * i][rowInOut] ^= state[5]; \
 		Matrix[7 + 12 * i][rowInOut] ^= state[6]; \
 		Matrix[8 + 12 * i][rowInOut] ^= state[7]; \
 		Matrix[9 + 12 * i][rowInOut] ^= state[8]; \
 		Matrix[10+ 12 * i][rowInOut] ^= state[9]; \
 		Matrix[11+ 12 * i][rowInOut] ^= state[10]; \
 	} \
  }
 #define reduceDuplexRow(rowIn, rowInOut, rowOut) { \
 	for (int i = 0; i < 8; i++) { \
 		for (int j = 0; j < 12; j++) \
 			state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \
 		round_lyra_v35(state); \
 		for (int j = 0; j < 12; j++) \
 			Matrix[j + 12 * i][rowOut] ^= state[j]; \
 		Matrix[0 + 12 * i][rowInOut] ^= state[11]; \
 		Matrix[1 + 12 * i][rowInOut] ^= state[0]; \
 		Matrix[2 + 12 * i][rowInOut] ^= state[1]; \
 		Matrix[3 + 12 * i][rowInOut] ^= state[2]; \
 		Matrix[4 + 12 * i][rowInOut] ^= state[3]; \
 		Matrix[5 + 12 * i][rowInOut] ^= state[4]; \
 		Matrix[6 + 12 * i][rowInOut] ^= state[5]; \
 		Matrix[7 + 12 * i][rowInOut] ^= state[6]; \
 		Matrix[8 + 12 * i][rowInOut] ^= state[7]; \
 		Matrix[9 + 12 * i][rowInOut] ^= state[8]; \
 		Matrix[10+ 12 * i][rowInOut] ^= state[9]; \
 		Matrix[11+ 12 * i][rowInOut] ^= state[10]; \
 	} \
  }
 #define absorbblock(in)  { \
 	state[0] ^= Matrix[0][in]; \
 	state[1] ^= Matrix[1][in]; \
 	state[2] ^= Matrix[2][in]; \
 	state[3] ^= Matrix[3][in]; \
 	state[4] ^= Matrix[4][in]; \
 	state[5] ^= Matrix[5][in]; \
 	state[6] ^= Matrix[6][in]; \
 	state[7] ^= Matrix[7][in]; \
 	state[8] ^= Matrix[8][in]; \
 	state[9] ^= Matrix[9][in]; \
 	state[10] ^= Matrix[10][in]; \
 	state[11] ^= Matrix[11][in]; \
 	round_lyra_v35(state); \
 	round_lyra_v35(state); \
 	round_lyra_v35(state); \
 	round_lyra_v35(state); \
 	round_lyra_v35(state); \
 	round_lyra_v35(state); \
 	round_lyra_v35(state); \
 	round_lyra_v35(state); \
 	round_lyra_v35(state); \
 	round_lyra_v35(state); \
 	round_lyra_v35(state); \
 	round_lyra_v35(state); \
  }
 //// test version
 #define reduceDuplexRowSetup_test(rowIn, rowInOut, rowOut) { \
 	for (int i = 0; i < 8; i++) { \
 		for (int j = 0; j < 12; j++) \
 			state[j] ^= Matrix[j][i][rowIn] + Matrix[j][i][rowInOut]; \
 		round_lyra_v35(state); \
 		for (int j = 0; j < 12; j++) \
 			Matrix[j][7-i][rowOut] = Matrix[j][i][rowIn] ^ state[j]; \
 		Matrix[0][i][rowInOut] ^= state[11]; \
 		Matrix[1][i][rowInOut] ^= state[0]; \
 		Matrix[2][i][rowInOut] ^= state[1]; \
 		Matrix[3][i][rowInOut] ^= state[2]; \
 		Matrix[4][i][rowInOut] ^= state[3]; \
 		Matrix[5][i][rowInOut] ^= state[4]; \
 		Matrix[6][i][rowInOut] ^= state[5]; \
 		Matrix[7][i][rowInOut] ^= state[6]; \
 		Matrix[8][i][rowInOut] ^= state[7]; \
 		Matrix[9][i][rowInOut] ^= state[8]; \
 		Matrix[10][i][rowInOut] ^= state[9]; \
 		Matrix[11][i][rowInOut] ^= state[10]; \
 	} \
  }
 #define reduceDuplexRow_test(rowIn, rowInOut, rowOut) { \
 	for (int i = 0; i < 8; i++) { \
 		for (int j = 0; j < 12; j++) \
 			state[j] ^= Matrix[j][i][rowIn] + Matrix[j][i][rowInOut]; \
 		round_lyra_v35(state); \
 		for (int j = 0; j < 12; j++) \
 			Matrix[j][i][rowOut] ^= state[j]; \
 		Matrix[0][i][rowInOut] ^= state[11]; \
 		Matrix[1][i][rowInOut] ^= state[0]; \
 		Matrix[2][i][rowInOut] ^= state[1]; \
 		Matrix[3][i][rowInOut] ^= state[2]; \
 		Matrix[4][i][rowInOut] ^= state[3]; \
 		Matrix[5][i][rowInOut] ^= state[4]; \
 		Matrix[6][i][rowInOut] ^= state[5]; \
 		Matrix[7][i][rowInOut] ^= state[6]; \
 		Matrix[8][i][rowInOut] ^= state[7]; \
 		Matrix[9][i][rowInOut] ^= state[8]; \
 		Matrix[10][i][rowInOut] ^= state[9]; \
 		Matrix[11][i][rowInOut] ^= state[10]; \
 	} \
  }
 #define absorbblock_test(in) { \
 	state[0] ^= Matrix[0][0][ in]; \
 	state[1] ^= Matrix[1][0][in]; \
 	state[2] ^= Matrix[2][0][in]; \
 	state[3] ^= Matrix[3][0][in]; \
 	state[4] ^= Matrix[4][0][in]; \
 	state[5] ^= Matrix[5][0][in]; \
 	state[6] ^= Matrix[6][0][in]; \
 	state[7] ^= Matrix[7][0][in]; \
 	state[8] ^= Matrix[8][0][in]; \
 	state[9] ^= Matrix[9][0][in]; \
 	state[10] ^= Matrix[10][0][in]; \
 	state[11] ^= Matrix[11][0][in]; \
 	round_lyra_v35(state); \
 	round_lyra_v35(state); \
 	round_lyra_v35(state); \
 	round_lyra_v35(state); \
 	round_lyra_v35(state); \
 	round_lyra_v35(state); \
 	round_lyra_v35(state); \
 	round_lyra_v35(state); \
 	round_lyra_v35(state); \
 	round_lyra_v35(state); \
 	round_lyra_v35(state); \
 	round_lyra_v35(state); \
  }
 //// compute 30 version
 #define reduceDuplexRowSetup_v30(rowIn, rowInOut, rowOut) { \
 	for (int i = 0; i < 8; i++) { \
 		for (int j = 0; j < 12; j++) \
 			state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \
 		round_lyra_v30(state); \
 		for (int j = 0; j < 12; j++) \
 			Matrix[j + 84 - 12 * i][rowOut] = Matrix[12 * i + j][rowIn] ^ state[j]; \
 		Matrix[0 + 12 * i][rowInOut] ^= state[11]; \
 		Matrix[1 + 12 * i][rowInOut] ^= state[0]; \
 		Matrix[2 + 12 * i][rowInOut] ^= state[1]; \
 		Matrix[3 + 12 * i][rowInOut] ^= state[2]; \
 		Matrix[4 + 12 * i][rowInOut] ^= state[3]; \
 		Matrix[5 + 12 * i][rowInOut] ^= state[4]; \
 		Matrix[6 + 12 * i][rowInOut] ^= state[5]; \
 		Matrix[7 + 12 * i][rowInOut] ^= state[6]; \
 		Matrix[8 + 12 * i][rowInOut] ^= state[7]; \
 		Matrix[9 + 12 * i][rowInOut] ^= state[8]; \
 		Matrix[10 + 12 * i][rowInOut] ^= state[9]; \
 		Matrix[11 + 12 * i][rowInOut] ^= state[10]; \
 	} \
  }
 #define reduceDuplexRow_v30(rowIn, rowInOut, rowOut) { \
 	for (int i = 0; i < 8; i++) { \
 		for (int j = 0; j < 12; j++) \
 			state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \
 		round_lyra_v30(state); \
 		for (int j = 0; j < 12; j++) \
 		 	Matrix[j + 12 * i][rowOut] ^= state[j]; \
 		Matrix[0 + 12 * i][rowInOut] ^= state[11]; \
 		Matrix[1 + 12 * i][rowInOut] ^= state[0]; \
 		Matrix[2 + 12 * i][rowInOut] ^= state[1]; \
 		Matrix[3 + 12 * i][rowInOut] ^= state[2]; \
 		Matrix[4 + 12 * i][rowInOut] ^= state[3]; \
 		Matrix[5 + 12 * i][rowInOut] ^= state[4]; \
 		Matrix[6 + 12 * i][rowInOut] ^= state[5]; \
 		Matrix[7 + 12 * i][rowInOut] ^= state[6]; \
 		Matrix[8 + 12 * i][rowInOut] ^= state[7]; \
 		Matrix[9 + 12 * i][rowInOut] ^= state[8]; \
 		Matrix[10 + 12 * i][rowInOut] ^= state[9]; \
 		Matrix[11 + 12 * i][rowInOut] ^= state[10]; \
 	} \
  }
 #define absorbblock_v30(in) { \
 	state[0] ^= Matrix[0][in]; \
 	state[1] ^= Matrix[1][in]; \
 	state[2] ^= Matrix[2][in]; \
 	state[3] ^= Matrix[3][in]; \
 	state[4] ^= Matrix[4][in]; \
 	state[5] ^= Matrix[5][in]; \
 	state[6] ^= Matrix[6][in]; \
 	state[7] ^= Matrix[7][in]; \
 	state[8] ^= Matrix[8][in]; \
 	state[9] ^= Matrix[9][in]; \
 	state[10] ^= Matrix[10][in]; \
 	state[11] ^= Matrix[11][in]; \
 	round_lyra_v30(state); \
 	round_lyra_v30(state); \
 	round_lyra_v30(state); \
 	round_lyra_v30(state); \
 	round_lyra_v30(state); \
 	round_lyra_v30(state); \
 	round_lyra_v30(state); \
 	round_lyra_v30(state); \
 	round_lyra_v30(state); \
 	round_lyra_v30(state); \
 	round_lyra_v30(state); \
 	round_lyra_v30(state); \
  }
 static __device__ __forceinline__
 void Gfunc_v35(uint2 & a, uint2 &b, uint2 &c, uint2 &d)
 {
 	a += b; d ^= a; d = ROR2(d, 32);
 	c += d; b ^= c; b = ROR2(b, 24);
 	a += b; d ^= a; d = ROR2(d, 16);
 	c += d; b ^= c; b = ROR2(b, 63);
 }
 static __device__ __forceinline__
 void Gfunc_v30(uint64_t & a, uint64_t &b, uint64_t &c, uint64_t &d)
 {
 	a += b; d ^= a; d = ROTR64(d, 32);
 	c += d; b ^= c; b = ROTR64(b, 24);
 	a += b; d ^= a; d = ROTR64(d, 16);
 	c += d; b ^= c; b = ROTR64(b, 63);
 }
 #define round_lyra_v35_new(state) { \
 	Gfunc_v35(state[0], state[4], state[8], state[12]); \
 	Gfunc_v35(state[1], state[5], state[9], state[13]); \
 	Gfunc_v35(state[2], state[6], state[10], state[14]); \
 	Gfunc_v35(state[3], state[7], state[11], state[15]); \
 	Gfunc_v35(state[0], state[5], state[10], state[15]); \
 	Gfunc_v35(state[1], state[6], state[11], state[12]); \
 	Gfunc_v35(state[2], state[7], state[8], state[13]); \
 	Gfunc_v35(state[3], state[4], state[9], state[14]); \
 }
 static __device__ __forceinline__ void round_lyra_v35(uint2 *s)
 {
 	Gfunc_v35(s[0], s[4], s[8],  s[12]);
 	Gfunc_v35(s[1], s[5], s[9],  s[13]);
 	Gfunc_v35(s[2], s[6], s[10], s[14]);
 	Gfunc_v35(s[3], s[7], s[11], s[15]);
 	Gfunc_v35(s[0], s[5], s[10], s[15]);
 	Gfunc_v35(s[1], s[6], s[11], s[12]);
 	Gfunc_v35(s[2], s[7], s[8],  s[13]);
 	Gfunc_v35(s[3], s[4], s[9],  s[14]);
 }
 static __device__ __forceinline__ void round_lyra_v30(uint64_t *s)
 {
 	Gfunc_v30(s[0], s[4], s[8], s[12]);
 	Gfunc_v30(s[1], s[5], s[9], s[13]);
 	Gfunc_v30(s[2], s[6], s[10], s[14]);
 	Gfunc_v30(s[3], s[7], s[11], s[15]);
 	Gfunc_v30(s[0], s[5], s[10], s[15]);
 	Gfunc_v30(s[1], s[6], s[11], s[12]);
 	Gfunc_v30(s[2], s[7], s[8], s[13]);
 	Gfunc_v30(s[3], s[4], s[9], s[14]);
 }
 __global__ __launch_bounds__(256, 1)
 void lyra2_gpu_hash_32_v30(int threads, uint32_t startNounce, uint64_t *outputHash)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint64_t state[16];
 		#pragma unroll
 		for (int i = 0; i<4; i++) { state[i] = outputHash[threads*i + thread]; } //password
 		#pragma unroll
 		for (int i = 0; i<4; i++) { state[i + 4] = state[i]; } //salt
 		#pragma unroll
 		for (int i = 0; i<8; i++) { state[i + 8] = devectorize(blake2b_IV[i]); }
 		// blake2blyra x2
 		#pragma unroll 24
 		for (int i = 0; i<24; i++) { round_lyra_v30(state); } //because 12 is not enough
 		uint64_t Matrix[96][8]; // not cool
 		// reducedSqueezeRow0
 		#pragma unroll 8
 		for (int i = 0; i < 8; i++) {
 			int idx = 84-12*i;
 			#pragma unroll 12
 			for (int j = 0; j<12; j++) { Matrix[j + idx][0] = state[j]; }
 			round_lyra_v30(state);
 		}
 		// reducedSqueezeRow1
 		#pragma unroll 8
 		for (int i = 0; i < 8; i++)
 		{
 			int idx0= 12*i;
 			int idx1= 84-idx0;
 			#pragma unroll 12
 			for (int j = 0; j<12; j++) { state[j] ^= Matrix[j + idx0][0]; }
 			round_lyra_v30(state);
 			#pragma unroll 12
 			for (int j = 0; j<12; j++) { Matrix[j + idx1][1] = Matrix[j + idx0][0] ^ state[j]; }
 		}
 		reduceDuplexRowSetup_v30(1, 0, 2);
 		reduceDuplexRowSetup_v30(2, 1, 3);
 		reduceDuplexRowSetup_v30(3, 0, 4);
 		reduceDuplexRowSetup_v30(4, 3, 5);
 		reduceDuplexRowSetup_v30(5, 2, 6);
 		reduceDuplexRowSetup_v30(6, 1, 7);
 		uint64_t rowa;
 		rowa = state[0] & 7;
 		reduceDuplexRow_v30(7, rowa, 0);
 		rowa = state[0] & 7;
 		reduceDuplexRow_v30(0, rowa, 3);
 		rowa = state[0] & 7;
 		reduceDuplexRow_v30(3, rowa, 6);
 		rowa = state[0] & 7;
 		reduceDuplexRow_v30(6, rowa, 1);
 		rowa = state[0] & 7;
 		reduceDuplexRow_v30(1, rowa, 4);
 		rowa = state[0] & 7;
 		reduceDuplexRow_v30(4, rowa, 7);
 		rowa = state[0] & 7;
 		reduceDuplexRow_v30(7, rowa, 2);
 		rowa = state[0] & 7;
 		reduceDuplexRow_v30(2, rowa, 5);
 		absorbblock_v30(rowa);
 		#pragma unroll
 		for (int i = 0; i<4; i++) {
 			outputHash[threads*i + thread] = state[i];
 		} //password
 	} //thread
 }
 __global__ __launch_bounds__(256, 1)
 void lyra2_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHash)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint2 state[16];
 		#pragma unroll
 		for (int i = 0; i<4; i++) { LOHI(state[i].x, state[i].y, outputHash[threads*i + thread]); } //password
 		#pragma unroll
 		for (int i = 0; i<4; i++) { state[i + 4] = state[i]; } //salt
 		#pragma unroll
 		for (int i = 0; i<8; i++) { state[i + 8] = blake2b_IV[i]; }
 		// blake2blyra x2
 		#pragma unroll 24
 		for (int i = 0; i<24; i++) { round_lyra_v35(state); } //because 12 is not enough
 		uint2 Matrix[96][8]; // not cool
 		// reducedSqueezeRow0
 		#pragma unroll 8
 		for (int i = 0; i < 8; i++)
 		{
 			#pragma unroll 12
 			for (int j = 0; j<12; j++) { Matrix[j + 84 - 12 * i][0] = state[j]; }
 			round_lyra_v35(state);
 		}
 		// reducedSqueezeRow1
 		#pragma unroll 8
 		for (int i = 0; i < 8; i++)
 		{
 			#pragma unroll 12
 			for (int j = 0; j<12; j++) { state[j] ^= Matrix[j + 12 * i][0]; }
 			round_lyra_v35(state);
 			#pragma unroll 12
 			for (int j = 0; j<12; j++) { Matrix[j + 84 - 12 * i][1] = Matrix[j + 12 * i][0] ^ state[j]; }
 		}
 		reduceDuplexRowSetup(1, 0, 2);
 		reduceDuplexRowSetup(2, 1, 3);
 		reduceDuplexRowSetup(3, 0, 4);
 		reduceDuplexRowSetup(4, 3, 5);
 		reduceDuplexRowSetup(5, 2, 6);
 		reduceDuplexRowSetup(6, 1, 7);
 		uint32_t rowa;
 		rowa = state[0].x & 7;
 		reduceDuplexRow(7, rowa, 0);
 		rowa = state[0].x & 7;
 		reduceDuplexRow(0, rowa, 3);
 		rowa = state[0].x & 7;
 		reduceDuplexRow(3, rowa, 6);
 		rowa = state[0].x & 7;
 		reduceDuplexRow(6, rowa, 1);
 		rowa = state[0].x & 7;
 		reduceDuplexRow(1, rowa, 4);
 		rowa = state[0].x & 7;
 		reduceDuplexRow(4, rowa, 7);
 		rowa = state[0].x & 7;
 		reduceDuplexRow(7, rowa, 2);
 		rowa = state[0].x & 7;
 		reduceDuplexRow(2, rowa, 5);
 		absorbblock(rowa);
 		#pragma unroll
 		for (int i = 0; i<4; i++) {
 			outputHash[threads*i + thread] = devectorize(state[i]);
 		} //password
 	} //thread
 }
 __global__
 void __launch_bounds__(256, 1) lyra2_gpu_hash_32_test(int threads, uint32_t startNounce, uint64_t *outputHash)
 {
 	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint2 state[16];
 		#pragma unroll
 		for (int i = 0; i<4; i++) { LOHI(state[i].x, state[i].y, outputHash[threads*i + thread]); } //password
 		#pragma unroll
 		for (int i = 0; i<4; i++) { state[i + 4] = state[i]; } //salt
 		#pragma unroll
 		for (int i = 0; i<8; i++) { state[i + 8] = blake2b_IV[i]; }
 		// blake2blyra x2
 		#pragma unroll 24
 		for (int i = 0; i<24; i++) { round_lyra_v35(state); } //because 12 is not enough
 		uint2 Matrix[12][8][8]; // not cool
 		// reducedSqueezeRow0
 		#pragma unroll 8
 		for (int i = 0; i < 8; i++) {
 			#pragma unroll 12
 			for (int j = 0; j<12; j++) { Matrix[j][7-i][0] = state[j]; }
 			round_lyra_v35(state);
 		}
 		// reducedSqueezeRow1
 		#pragma unroll 8
 		for (int i = 0; i < 8; i++)
 		{
 			#pragma unroll 12
 			for (int j = 0; j<12; j++) { state[j] ^= Matrix[j][i][0]; }
 			round_lyra_v35(state);
 			#pragma unroll 12
 			for (int j = 0; j<12; j++) { Matrix[j][7-i][1] = Matrix[j][i][0] ^ state[j]; }
 		}
 		reduceDuplexRowSetup_test(1, 0, 2);
 		reduceDuplexRowSetup_test(2, 1, 3);
 		reduceDuplexRowSetup_test(3, 0, 4);
 		reduceDuplexRowSetup_test(4, 3, 5);
 		reduceDuplexRowSetup_test(5, 2, 6);
 		reduceDuplexRowSetup_test(6, 1, 7);
 		uint64_t rowa;
 		rowa = devectorize(state[0]) & 7;
 		reduceDuplexRow_test(7, rowa, 0);
 		rowa = devectorize(state[0]) & 7;
 		reduceDuplexRow_test(0, rowa, 3);
 		rowa = devectorize(state[0]) & 7;
 		reduceDuplexRow_test(3, rowa, 6);
 		rowa = devectorize(state[0]) & 7;
 		reduceDuplexRow_test(6, rowa, 1);
 		rowa = devectorize(state[0]) & 7;
 		reduceDuplexRow_test(1, rowa, 4);
 		rowa = devectorize(state[0]) & 7;
 		reduceDuplexRow_test(4, rowa, 7);
 		rowa = devectorize(state[0]) & 7;
 		reduceDuplexRow_test(7, rowa, 2);
 		rowa = devectorize(state[0]) & 7;
 		reduceDuplexRow_test(2, rowa, 5);
 		absorbblock_test(rowa);
 		#pragma unroll
 		for (int i = 0; i<4; i++) {
 			outputHash[threads*i + thread] = devectorize(state[i]);
 		} //password
 	} //thread
 }
 __host__
 void lyra2_cpu_init(int thr_id, int threads)
 {
 	//not used
 }
 __host__
 void lyra2_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
 {
 	const int threadsperblock = 256;
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 	if (device_sm[device_map[thr_id]] >= 350) {
 		lyra2_gpu_hash_32 <<<grid, block>>> (threads, startNounce, d_outputHash);
 	} else {
 		// kernel for compute30 card
 		lyra2_gpu_hash_32_v30 <<<grid, block >>> (threads, startNounce, d_outputHash);
 	}
 	cudaDeviceSynchronize();
 	//MyStreamSynchronize(NULL, order, thr_id);
 }
--- a/lyra2/lyra2RE.cu
+++ b/lyra2/lyra2RE.cu
@ -0,0 +1,133 @@
 extern "C" {
 #include "sph/sph_blake.h"
 #include "sph/sph_groestl.h"
 #include "sph/sph_skein.h"
 #include "sph/sph_keccak.h"
 #include "lyra2/Lyra2.h"
 }
 #include "miner.h"
 #include "cuda_helper.h"
 static _ALIGN(64) uint64_t *d_hash[8];
 extern void quark_check_cpu_init(int thr_id, int threads);
 extern void quark_check_cpu_setTarget(const void *ptarget);
 extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
 extern uint32_t quark_check_cpu_hash_64_2(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint64_t *d_inputHash, int order);
 extern void blake256_cpu_init(int thr_id, int threads);
 extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
 extern void blake256_cpu_setBlock_80(uint32_t *pdata);
 extern void keccak256_cpu_hash_32(int thr_id, int threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
 extern void keccak256_cpu_init(int thr_id, int threads);
 extern void skein256_cpu_hash_32(int thr_id, int threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
 extern void skein256_cpu_init(int thr_id, int threads);
 extern void lyra2_cpu_hash_32(int thr_id, int threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
 extern void lyra2_cpu_init(int thr_id, int threads);
 extern void groestl256_setTarget(const void *ptarget);
 extern uint32_t groestl256_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order);
 extern void groestl256_cpu_init(int thr_id, int threads);
 extern "C" void lyra_hash(void *state, const void *input)
 {
 	sph_blake256_context     ctx_blake;
 	sph_keccak256_context    ctx_keccak;
 	sph_skein256_context     ctx_skein;
 	sph_groestl256_context   ctx_groestl;
 	uint32_t hashA[8], hashB[8], hash[8];
 	sph_blake256_init(&ctx_blake);
 	sph_blake256(&ctx_blake, input, 80);
 	sph_blake256_close(&ctx_blake, hashA);
 	sph_keccak256_init(&ctx_keccak);
 	sph_keccak256(&ctx_keccak, hashA, 32);
 	sph_keccak256_close(&ctx_keccak, hashB);
 	LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);
 	sph_skein256_init(&ctx_skein);
 	sph_skein256(&ctx_skein, hashA, 32);
 	sph_skein256_close(&ctx_skein, hashB);
 	sph_groestl256_init(&ctx_groestl);
 	sph_groestl256(&ctx_groestl, hashB, 32);
 	sph_groestl256_close(&ctx_groestl, hash);
 	// seems wrong : hash or hashB ?
 	memcpy(state, hashB, 32);
 }
 static bool init[8] = { 0 };
 extern "C" int scanhash_lyra(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];
 	int intensity = (device_sm[device_map[thr_id]] >= 500) ? 19 : 18;
 	int throughput = opt_work_size ? opt_work_size : (1 << intensity); // 18=256*256*4;
 	throughput = min(throughput, (int)(max_nonce - first_nonce));
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
 		blake256_cpu_init(thr_id, throughput);
 		keccak256_cpu_init(thr_id,throughput);
 		skein256_cpu_init(thr_id, throughput);
 		groestl256_cpu_init(thr_id, throughput);
 		lyra2_cpu_init(thr_id, throughput);
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput));
 		init[thr_id] = true;
 	}
 	uint32_t endiandata[20];
 	for (int k=0; k < 20; k++)
 		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
 	blake256_cpu_setBlock_80(pdata);
 	groestl256_setTarget(ptarget);
 	do {
 		int order = 0;
 		uint32_t foundNonce;
 		blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		foundNonce = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		if  (foundNonce != 0xffffffff)
 		{
 			// const uint32_t Htarg = ptarget[6];
 			uint32_t vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
 			lyra_hash(vhash64, endiandata);
 //			if (vhash64[7]<=Htarg) { // && fulltest(vhash64, ptarget)) {
 				*hashes_done = pdata[19] - first_nonce + throughput;
 				pdata[19] = foundNonce;
 				return 1;
 //			} else {
 //				applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce);
 //			}
 		}
 		pdata[19] += throughput;
 	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
--- a/miner.h
+++ b/miner.h
@ -328,6 +328,10 @@ extern int scanhash_fresh(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
 extern int scanhash_lyra(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
 extern int scanhash_nist5(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
@ -645,6 +649,7 @@ void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
 void keccak256_hash(void *state, const void *input);
 unsigned int jackpothash(void *state, const void *input);
 void groestlhash(void *state, const void *input);
 void lyra_hash(void *state, const void *input);
 void myriadhash(void *state, const void *input);
 void nist5hash(void *state, const void *input);
 void pentablakehash(void *output, const void *input);
--- a/util.cpp
+++ b/util.cpp
@ -1633,18 +1633,22 @@ void print_hash_tests(void)
 	heavycoin_hash(&hash[0], &buf[0], 32);
 	printpfx("heavy", hash);
 	memset(hash, 0, sizeof hash);
 	keccak256_hash(&hash[0], &buf[0]);
 	printpfx("keccak", hash);
 	memset(hash, 0, sizeof hash);
 	jackpothash(&hash[0], &buf[0]);
 	printpfx("jackpot", hash);
 	memset(hash, 0, sizeof hash);
 	keccak256_hash(&hash[0], &buf[0]);
 	printpfx("keccak", hash);
 	memset(hash, 0, sizeof hash);
 	doomhash(&hash[0], &buf[0]);
 	printpfx("luffa", hash);
-
+/* to double check with a lyra2 cpu miner
 	memset(hash, 0, sizeof hash);
 	lyra_hash(&hash[0], &buf[0]);
 	printpfx("lyra2", hash);
 */
 	memset(hash, 0, sizeof hash);
 	myriadhash(&hash[0], &buf[0]);
 	printpfx("myriad", hash);