Browse Source

Add Lyra2 algo, based on Vertcoin published code

Seems to be djm34's work, I recognize the code style ;)

Code was cleaned/indented and adapted to my fork...

Only usable on the test pool until 16 December 2014!
master
Tanguy Pruvot 10 years ago
parent
commit
c5b349e079
  1. 0
      Algo256/blake256.cu
  2. 250
      Algo256/cuda_blake256.cu
  3. 6
      Algo256/cuda_fugue256.cu
  4. 309
      Algo256/cuda_groestl256.cu
  5. 174
      Algo256/cuda_keccak256.cu
  6. 196
      Algo256/cuda_skein256.cu
  7. 0
      Algo256/keccak256.cu
  8. 15
      Makefile.am
  9. 10
      README.txt
  10. 8
      ccminer.cpp
  11. 24
      ccminer.vcxproj
  12. 58
      ccminer.vcxproj.filters
  13. 97
      cuda_helper.h
  14. 211
      lyra2/Lyra2.c
  15. 50
      lyra2/Lyra2.h
  16. 755
      lyra2/Sponge.c
  17. 108
      lyra2/Sponge.h
  18. 536
      lyra2/cuda_lyra2.cu
  19. 133
      lyra2/lyra2RE.cu
  20. 5
      miner.h
  21. 14
      util.cpp

0
blake32.cu → Algo256/blake256.cu

250
Algo256/cuda_blake256.cu

@ -0,0 +1,250 @@ @@ -0,0 +1,250 @@
/**
* Blake-256 Cuda Kernel (Tested on SM 5.0)
*
* Tanguy Pruvot - Nov. 2014
*/
extern "C" {
#include "sph/sph_blake.h"
}
#include "cuda_helper.h"
#include <memory.h>
/* Byte-swap each 32-bit half of a 64-bit word independently (not a full
 * 64-bit bswap): the low and high 32-bit lanes keep their positions but
 * have their bytes reversed. Helpers come from cuda_helper.h. */
static __device__ uint64_t cuda_swab32ll(uint64_t x) {
	return MAKE_ULONGLONG(cuda_swab32(_LOWORD(x)), cuda_swab32(_HIWORD(x)));
}
__constant__ static uint32_t c_data[20];
__constant__ static uint32_t sigma[16][16];
static uint32_t c_sigma[16][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 },
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 },
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 },
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 },
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 },
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 },
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 },
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 },
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }
};
static const uint32_t c_IV256[8] = {
0x6A09E667, 0xBB67AE85,
0x3C6EF372, 0xA54FF53A,
0x510E527F, 0x9B05688C,
0x1F83D9AB, 0x5BE0CD19
};
__device__ __constant__ static uint32_t cpu_h[8];
__device__ __constant__ static uint32_t u256[16];
static const uint32_t c_u256[16] = {
0x243F6A88, 0x85A308D3,
0x13198A2E, 0x03707344,
0xA4093822, 0x299F31D0,
0x082EFA98, 0xEC4E6C89,
0x452821E6, 0x38D01377,
0xBE5466CF, 0x34E90C6C,
0xC0AC29B7, 0xC97C50DD,
0x3F84D5B5, 0xB5470917
};
#define GS2(a,b,c,d,x) { \
const uint32_t idx1 = sigma[r][x]; \
const uint32_t idx2 = sigma[r][x+1]; \
v[a] += (m[idx1] ^ u256[idx2]) + v[b]; \
v[d] = SPH_ROTL32(v[d] ^ v[a], 16); \
v[c] += v[d]; \
v[b] = SPH_ROTR32(v[b] ^ v[c], 12); \
\
v[a] += (m[idx2] ^ u256[idx1]) + v[b]; \
v[d] = SPH_ROTR32(v[d] ^ v[a], 8); \
v[c] += v[d]; \
v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \
}
//#define ROTL32(x, n) ((x) << (n)) | ((x) >> (32 - (n)))
#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
#define hostGS(a,b,c,d,x) { \
const uint32_t idx1 = c_sigma[r][x]; \
const uint32_t idx2 = c_sigma[r][x+1]; \
v[a] += (m[idx1] ^ c_u256[idx2]) + v[b]; \
v[d] = ROTR32(v[d] ^ v[a], 16); \
v[c] += v[d]; \
v[b] = ROTR32(v[b] ^ v[c], 12); \
\
v[a] += (m[idx2] ^ c_u256[idx1]) + v[b]; \
v[d] = ROTR32(v[d] ^ v[a], 8); \
v[c] += v[d]; \
v[b] = ROTR32(v[b] ^ v[c], 7); \
}
/* Second part (64-80) msg never change, store it */
__device__ __constant__ static const uint32_t c_Padding[16] = {
0, 0, 0, 0,
0x80000000, 0, 0, 0,
0, 0, 0, 0,
0, 1, 0, 640,
};
/* Host-side BLAKE-256 compression of one full 64-byte message block.
 * h     : 8-word chaining value, updated in place.
 * block : 16 input words (already in the byte order the caller prepared).
 * T0    : low word of the bit counter injected into v[12]/v[13]
 *         (the high counter word is treated as 0 here).
 * Used once at setBlock time to precompute the midstate of the first
 * 64 bytes of the 80-byte header; the hostGS macro reads r, m and v
 * from this scope. */
__host__ __forceinline__
static void blake256_compress1st(uint32_t *h, const uint32_t *block, const uint32_t T0)
{
	uint32_t m[16];
	uint32_t v[16];
	for (int i = 0; i < 16; i++) {
		m[i] = block[i];
	}
	/* initialize the 16-word working state: chaining value ... */
	for (int i = 0; i < 8; i++)
		v[i] = h[i];
	/* ... plus constants, with the bit counter folded into v[12..13] */
	v[8] = c_u256[0];
	v[9] = c_u256[1];
	v[10] = c_u256[2];
	v[11] = c_u256[3];
	v[12] = c_u256[4] ^ T0;
	v[13] = c_u256[5] ^ T0;
	v[14] = c_u256[6];
	v[15] = c_u256[7];
	/* 14 rounds of the G function (BLAKE-256 standard round count) */
	for (int r = 0; r < 14; r++) {
		/* column step */
		hostGS(0, 4, 0x8, 0xC, 0x0);
		hostGS(1, 5, 0x9, 0xD, 0x2);
		hostGS(2, 6, 0xA, 0xE, 0x4);
		hostGS(3, 7, 0xB, 0xF, 0x6);
		/* diagonal step */
		hostGS(0, 5, 0xA, 0xF, 0x8);
		hostGS(1, 6, 0xB, 0xC, 0xA);
		hostGS(2, 7, 0x8, 0xD, 0xC);
		hostGS(3, 4, 0x9, 0xE, 0xE);
	}
	/* finalization: fold both halves of v back into the chaining value */
	for (int i = 0; i < 16; i++) {
		int j = i & 7;
		h[j] ^= v[i];
	}
}
/* Device-side BLAKE-256 compression of the SECOND (final) block of an
 * 80-byte header: only the first 4 message words vary per nonce, the
 * remaining 12 are the fixed padding from c_Padding (length = 640 bits).
 * h     : 8-word midstate (from the first block), updated in place.
 * block : the 4 variable words (header words 16..18 + nonce).
 * T0    : bit counter for this block (the caller passes 640). */
__device__ __forceinline__
static void blake256_compress2nd(uint32_t *h, const uint32_t *block, const uint32_t T0)
{
	uint32_t m[16];
	uint32_t v[16];
	m[0] = block[0];
	m[1] = block[1];
	m[2] = block[2];
	m[3] = block[3];
	/* constant padding of the last header block, precomputed in c_Padding */
	#pragma unroll
	for (int i = 4; i < 16; i++) {
		m[i] = c_Padding[i];
	}
	#pragma unroll 8
	for (int i = 0; i < 8; i++)
		v[i] = h[i];
	/* round constants from __constant__ memory, counter mixed into v[12..13] */
	v[8] = u256[0];
	v[9] = u256[1];
	v[10] = u256[2];
	v[11] = u256[3];
	v[12] = u256[4] ^ T0;
	v[13] = u256[5] ^ T0;
	v[14] = u256[6];
	v[15] = u256[7];
	#pragma unroll 14
	for (int r = 0; r < 14; r++) {
		/* column step */
		GS2(0, 4, 0x8, 0xC, 0x0);
		GS2(1, 5, 0x9, 0xD, 0x2);
		GS2(2, 6, 0xA, 0xE, 0x4);
		GS2(3, 7, 0xB, 0xF, 0x6);
		/* diagonal step */
		GS2(0, 5, 0xA, 0xF, 0x8);
		GS2(1, 6, 0xB, 0xC, 0xA);
		GS2(2, 7, 0x8, 0xD, 0xC);
		GS2(3, 4, 0x9, 0xE, 0xE);
	}
	/* fold working state back into the chaining value */
	#pragma unroll 16
	for (int i = 0; i < 16; i++) {
		int j = i & 7;
		h[j] ^= v[i];
	}
}
/* One thread per nonce: finish the BLAKE-256 of an 80-byte header whose
 * first-64-byte midstate was precomputed on the host (cpu_h) and whose
 * last 3 fixed words live in c_data[16..18]. Results are stored in
 * column-major layout: Hash[i*threads + thread], 4 x 64-bit per hash,
 * with each 32-bit half byte-swapped via cuda_swab32ll. */
__global__ __launch_bounds__(256,3)
void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t * Hash)
{
	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		const uint32_t nonce = startNonce + thread;
		uint32_t h[8];
		uint32_t input[4];
		/* start from the host-precomputed midstate */
		#pragma unroll 8
		for (int i = 0; i<8; i++) { h[i] = cpu_h[i];}
		/* header words 16..18 are fixed, word 19 is this thread's nonce */
		#pragma unroll 3
		for (int i = 0; i < 3; ++i) input[i] = c_data[16 + i];
		input[3] = nonce;
		blake256_compress2nd(h, input, 640);
		#pragma unroll
		for (int i = 0; i<4; i++) {
			Hash[i*threads + thread] = cuda_swab32ll(MAKE_ULONGLONG(h[2 * i], h[2*i+1]));
		}
	}
}
/* Host wrapper: launch blake256_gpu_hash_80 over `threads` nonces
 * starting at startNonce, then wait on the per-thread stream. */
__host__
void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order)
{
	const uint32_t TPB = 256;
	const dim3 block(TPB);
	const dim3 grid((threads + TPB - 1) / TPB);   /* ceil(threads / TPB) */

	blake256_gpu_hash_80 <<<grid, block>>> (threads, startNonce, Hash);
	MyStreamSynchronize(NULL, order, thr_id);
}
/* Precompute per-work data on the host:
 *  - run the first BLAKE-256 compression over the first 64 header bytes
 *    (bit counter 512) to get the midstate, uploaded to cpu_h;
 *  - upload the full 80-byte header to c_data (the kernel only reads
 *    words 16..18; word 19 is replaced by the nonce on the GPU).
 * NOTE(review): compress1st reads pdata directly while `data` holds an
 * identical copy — same bytes, so behavior is fine, but the local copy
 * only serves the c_data upload. */
__host__
void blake256_cpu_setBlock_80(uint32_t *pdata)
{
	uint32_t h[8];
	uint32_t data[20];
	memcpy(data, pdata, 80);
	/* standard BLAKE-256 IV */
	for (int i = 0; i<8; i++) {
		h[i] = c_IV256[i];
	}
	blake256_compress1st(h, pdata, 512);
	cudaMemcpyToSymbol(cpu_h, h, sizeof(h), 0, cudaMemcpyHostToDevice);
	cudaMemcpyToSymbol(c_data, data, sizeof(data), 0, cudaMemcpyHostToDevice);
}
/* One-time init: upload the BLAKE round constants and the sigma
 * permutation table to __constant__ memory. thr_id/threads are unused
 * here (kept for API symmetry with the other *_cpu_init functions). */
__host__
void blake256_cpu_init(int thr_id, int threads)
{
	cudaMemcpyToSymbol(u256, c_u256, sizeof(c_u256), 0, cudaMemcpyHostToDevice);
	cudaMemcpyToSymbol(sigma, c_sigma, sizeof(c_sigma), 0, cudaMemcpyHostToDevice);
}

6
cuda_fugue256.cu → Algo256/cuda_fugue256.cu

@ -571,7 +571,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas @@ -571,7 +571,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas
for(int i=0;i<30;i++)
sc[i] = GPUstate[i];
uint32_t nounce = startNounce + thread; // muss noch ermittelt werden
uint32_t nounce = startNounce + thread; // muss noch ermittelt werden
uint32_t q;
@ -687,7 +687,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas @@ -687,7 +687,7 @@ fugue256_gpu_hash(int thr_id, int threads, uint32_t startNounce, void *outputHas
int i;
bool rc = true;
for (i = 7; i >= 0; i--) {
if (hash[i] > pTarget[i]) {
rc = false;
@ -730,7 +730,7 @@ void fugue256_cpu_init(int thr_id, int threads) @@ -730,7 +730,7 @@ void fugue256_cpu_init(int thr_id, int threads)
// Speicher für alle Ergebnisse belegen
cudaMalloc(&d_fugue256_hashoutput[thr_id], 8 * sizeof(uint32_t) * threads);
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
}
__host__ void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn)

309
Algo256/cuda_groestl256.cu

@ -0,0 +1,309 @@ @@ -0,0 +1,309 @@
#include <memory.h>
#include "cuda_helper.h"
uint32_t *d_gnounce[8];
uint32_t *d_GNonce[8];
__constant__ uint32_t pTarget[8];
#define SPH_C32(x) ((uint32_t)(x ## U))
#define SPH_T32(x) ((x) & SPH_C32(0xFFFFFFFF))
#define C32e(x) \
((SPH_C32(x) >> 24) \
| ((SPH_C32(x) >> 8) & SPH_C32(0x0000FF00)) \
| ((SPH_C32(x) << 8) & SPH_C32(0x00FF0000)) \
| ((SPH_C32(x) << 24) & SPH_C32(0xFF000000)))
#define PC32up(j, r) ((uint32_t)((j) + (r)))
#define PC32dn(j, r) 0
#define QC32up(j, r) 0xFFFFFFFF
#define QC32dn(j, r) (((uint32_t)(r) << 24) ^ SPH_T32(~((uint32_t)(j) << 24)))
#define B32_0(x) __byte_perm(x, 0, 0x4440)
//((x) & 0xFF)
#define B32_1(x) __byte_perm(x, 0, 0x4441)
//(((x) >> 8) & 0xFF)
#define B32_2(x) __byte_perm(x, 0, 0x4442)
//(((x) >> 16) & 0xFF)
#define B32_3(x) __byte_perm(x, 0, 0x4443)
//((x) >> 24)
#define MAXWELL_OR_FERMI 1
#if MAXWELL_OR_FERMI
#define USE_SHARED 1
// Maxwell and Fermi cards get the best speed with SHARED access it seems.
#if USE_SHARED
#define T0up(x) (*((uint32_t*)mixtabs + ( (x))))
#define T0dn(x) (*((uint32_t*)mixtabs + (256+(x))))
#define T1up(x) (*((uint32_t*)mixtabs + (512+(x))))
#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
#define T2up(x) (*((uint32_t*)mixtabs + (1024+(x))))
#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
#define T3dn(x) (*((uint32_t*)mixtabs + (1792+(x))))
#else
#define T0up(x) tex1Dfetch(t0up2, x)
#define T0dn(x) tex1Dfetch(t0dn2, x)
#define T1up(x) tex1Dfetch(t1up2, x)
#define T1dn(x) tex1Dfetch(t1dn2, x)
#define T2up(x) tex1Dfetch(t2up2, x)
#define T2dn(x) tex1Dfetch(t2dn2, x)
#define T3up(x) tex1Dfetch(t3up2, x)
#define T3dn(x) tex1Dfetch(t3dn2, x)
#endif
#else
#define USE_SHARED 1
// a healthy mix between shared and textured access provides the highest speed on Compute 3.0 and 3.5!
#define T0up(x) (*((uint32_t*)mixtabs + ( (x))))
#define T0dn(x) tex1Dfetch(t0dn2, x)
#define T1up(x) tex1Dfetch(t1up2, x)
#define T1dn(x) (*((uint32_t*)mixtabs + (768+(x))))
#define T2up(x) tex1Dfetch(t2up2, x)
#define T2dn(x) (*((uint32_t*)mixtabs + (1280+(x))))
#define T3up(x) (*((uint32_t*)mixtabs + (1536+(x))))
#define T3dn(x) tex1Dfetch(t3dn2, x)
#endif
texture<unsigned int, 1, cudaReadModeElementType> t0up2;
texture<unsigned int, 1, cudaReadModeElementType> t0dn2;
texture<unsigned int, 1, cudaReadModeElementType> t1up2;
texture<unsigned int, 1, cudaReadModeElementType> t1dn2;
texture<unsigned int, 1, cudaReadModeElementType> t2up2;
texture<unsigned int, 1, cudaReadModeElementType> t2dn2;
texture<unsigned int, 1, cudaReadModeElementType> t3up2;
texture<unsigned int, 1, cudaReadModeElementType> t3dn2;
#define RSTT(d0, d1, a, b0, b1, b2, b3, b4, b5, b6, b7) do { \
t[d0] = T0up(B32_0(a[b0])) \
^ T1up(B32_1(a[b1])) \
^ T2up(B32_2(a[b2])) \
^ T3up(B32_3(a[b3])) \
^ T0dn(B32_0(a[b4])) \
^ T1dn(B32_1(a[b5])) \
^ T2dn(B32_2(a[b6])) \
^ T3dn(B32_3(a[b7])); \
t[d1] = T0dn(B32_0(a[b0])) \
^ T1dn(B32_1(a[b1])) \
^ T2dn(B32_2(a[b2])) \
^ T3dn(B32_3(a[b3])) \
^ T0up(B32_0(a[b4])) \
^ T1up(B32_1(a[b5])) \
^ T2up(B32_2(a[b6])) \
^ T3up(B32_3(a[b7])); \
} while (0)
extern uint32_t T0up_cpu[];
extern uint32_t T0dn_cpu[];
extern uint32_t T1up_cpu[];
extern uint32_t T1dn_cpu[];
extern uint32_t T2up_cpu[];
extern uint32_t T2dn_cpu[];
extern uint32_t T3up_cpu[];
extern uint32_t T3dn_cpu[];
/* Groestl-256 P permutation: 10 rounds of AddRoundConstant (PC32up on the
 * even words; PC32dn is 0 so odd words are untouched) followed by the
 * combined SubBytes/ShiftBytes/MixBytes via the RSTT table-lookup macro.
 * a       : 16 x 32-bit state, modified in place.
 * mixtabs : shared-memory copy of the T tables; consumed implicitly by
 *           the T*up/T*dn macros inside RSTT when USE_SHARED is set.
 * thread  : unused here — presumably kept for debugging/API symmetry. */
__device__ __forceinline__
void groestl256_perm_P(int thread,uint32_t *a, char *mixtabs)
{
#pragma unroll 10
	for (int r = 0; r<10; r++)
	{
		uint32_t t[16];
		/* AddRoundConstant (P-side: column index + round on high bytes) */
		a[0x0] ^= PC32up(0x00, r);
		a[0x2] ^= PC32up(0x10, r);
		a[0x4] ^= PC32up(0x20, r);
		a[0x6] ^= PC32up(0x30, r);
		a[0x8] ^= PC32up(0x40, r);
		a[0xA] ^= PC32up(0x50, r);
		a[0xC] ^= PC32up(0x60, r);
		a[0xE] ^= PC32up(0x70, r);
		/* SubBytes + ShiftBytes + MixBytes via 8 table lookups per word pair */
		RSTT(0x0, 0x1, a, 0x0, 0x2, 0x4, 0x6, 0x9, 0xB, 0xD, 0xF);
		RSTT(0x2, 0x3, a, 0x2, 0x4, 0x6, 0x8, 0xB, 0xD, 0xF, 0x1);
		RSTT(0x4, 0x5, a, 0x4, 0x6, 0x8, 0xA, 0xD, 0xF, 0x1, 0x3);
		RSTT(0x6, 0x7, a, 0x6, 0x8, 0xA, 0xC, 0xF, 0x1, 0x3, 0x5);
		RSTT(0x8, 0x9, a, 0x8, 0xA, 0xC, 0xE, 0x1, 0x3, 0x5, 0x7);
		RSTT(0xA, 0xB, a, 0xA, 0xC, 0xE, 0x0, 0x3, 0x5, 0x7, 0x9);
		RSTT(0xC, 0xD, a, 0xC, 0xE, 0x0, 0x2, 0x5, 0x7, 0x9, 0xB);
		RSTT(0xE, 0xF, a, 0xE, 0x0, 0x2, 0x4, 0x7, 0x9, 0xB, 0xD);
#pragma unroll 16
		for (int k = 0; k<16; k++)
			a[k] = t[k];
	}
}
/* Groestl-256 Q permutation: like P but with the Q round constants
 * (QC32up = 0xFFFFFFFF on even words, QC32dn mixes round/column into the
 * odd words) and Q's different ShiftBytes offsets in the RSTT calls.
 * a       : 16 x 32-bit state, modified in place.
 * mixtabs : shared-memory T tables (used via the T*up/T*dn macros).
 * thread  : unused here — presumably kept for debugging/API symmetry. */
__device__ __forceinline__
void groestl256_perm_Q(int thread, uint32_t *a, char *mixtabs)
{
#pragma unroll
	for (int r = 0; r<10; r++)
	{
		uint32_t t[16];
		/* AddRoundConstant (Q-side: every word gets a constant) */
		a[0x0] ^= QC32up(0x00, r);
		a[0x1] ^= QC32dn(0x00, r);
		a[0x2] ^= QC32up(0x10, r);
		a[0x3] ^= QC32dn(0x10, r);
		a[0x4] ^= QC32up(0x20, r);
		a[0x5] ^= QC32dn(0x20, r);
		a[0x6] ^= QC32up(0x30, r);
		a[0x7] ^= QC32dn(0x30, r);
		a[0x8] ^= QC32up(0x40, r);
		a[0x9] ^= QC32dn(0x40, r);
		a[0xA] ^= QC32up(0x50, r);
		a[0xB] ^= QC32dn(0x50, r);
		a[0xC] ^= QC32up(0x60, r);
		a[0xD] ^= QC32dn(0x60, r);
		a[0xE] ^= QC32up(0x70, r);
		a[0xF] ^= QC32dn(0x70, r);
		/* SubBytes + ShiftBytes (Q offsets) + MixBytes */
		RSTT(0x0, 0x1, a, 0x2, 0x6, 0xA, 0xE, 0x1, 0x5, 0x9, 0xD);
		RSTT(0x2, 0x3, a, 0x4, 0x8, 0xC, 0x0, 0x3, 0x7, 0xB, 0xF);
		RSTT(0x4, 0x5, a, 0x6, 0xA, 0xE, 0x2, 0x5, 0x9, 0xD, 0x1);
		RSTT(0x6, 0x7, a, 0x8, 0xC, 0x0, 0x4, 0x7, 0xB, 0xF, 0x3);
		RSTT(0x8, 0x9, a, 0xA, 0xE, 0x2, 0x6, 0x9, 0xD, 0x1, 0x5);
		RSTT(0xA, 0xB, a, 0xC, 0x0, 0x4, 0x8, 0xB, 0xF, 0x3, 0x7);
		RSTT(0xC, 0xD, a, 0xE, 0x2, 0x6, 0xA, 0xD, 0x1, 0x5, 0x9);
		RSTT(0xE, 0xF, a, 0x0, 0x4, 0x8, 0xC, 0xF, 0x3, 0x7, 0xB);
#pragma unroll
		for (int k = 0; k<16; k++)
			a[k] = t[k];
	}
}
/* Final Groestl-256 over a 32-byte input (one thread per nonce).
 * Reads 4 x 64-bit words from outputHash in column-major layout
 * (outputHash[k*threads + thread]), pads to one 64-byte block, runs the
 * compression h' = P(h ^ m) ^ Q(m) ^ h with h = IV (only word 15 set to
 * 0x10000), then the output transform P(x) ^ x — but only the last two
 * state words are folded since only state[15] is compared against the
 * target. Writes the winning nonce (if any) into nonceVector[0].
 * BUGFIX: the non-shared (#else) path previously ran perm_P on the
 * message where the shared path runs perm_Q — it must be the Q
 * permutation in both builds. */
__global__ __launch_bounds__(256,1)
void groestl256_gpu_hash32(int threads, uint32_t startNounce, uint64_t *outputHash, uint32_t *nonceVector)
{
#if USE_SHARED
	/* stage all 8 T tables (8 * 256 words) into shared memory */
	extern __shared__ char mixtabs[];
	if (threadIdx.x < 256) {
		*((uint32_t*)mixtabs + (threadIdx.x)) = tex1Dfetch(t0up2, threadIdx.x);
		*((uint32_t*)mixtabs + (256 + threadIdx.x)) = tex1Dfetch(t0dn2, threadIdx.x);
		*((uint32_t*)mixtabs + (512 + threadIdx.x)) = tex1Dfetch(t1up2, threadIdx.x);
		*((uint32_t*)mixtabs + (768 + threadIdx.x)) = tex1Dfetch(t1dn2, threadIdx.x);
		*((uint32_t*)mixtabs + (1024 + threadIdx.x)) = tex1Dfetch(t2up2, threadIdx.x);
		*((uint32_t*)mixtabs + (1280 + threadIdx.x)) = tex1Dfetch(t2dn2, threadIdx.x);
		*((uint32_t*)mixtabs + (1536 + threadIdx.x)) = tex1Dfetch(t3up2, threadIdx.x);
		*((uint32_t*)mixtabs + (1792 + threadIdx.x)) = tex1Dfetch(t3dn2, threadIdx.x);
	}
	__syncthreads();
#endif
	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		// GROESTL
		uint32_t message[16];
		uint32_t state[16];
		/* load the 32-byte input (column-major) ... */
		#pragma unroll
		for (int k = 0; k<4; k++)
			LOHI(message[2*k], message[2*k+1], outputHash[k*threads+thread]);
		/* ... and pad: 0x80 terminator, zero fill, block count 1 */
		#pragma unroll
		for (int k = 9; k<15; k++)
			message[k] = 0;
		message[8] = 0x80;
		message[15] = 0x01000000;
		#pragma unroll 16
		for (int u = 0; u<16; u++)
			state[u] = message[u];
		state[15] ^= 0x10000;	/* state = h ^ m (IV only touches word 15) */
		// Perm: P(h ^ m), fold h back in, then Q(m)
#if USE_SHARED
		groestl256_perm_P(thread, state, mixtabs);
		state[15] ^= 0x10000;
		groestl256_perm_Q(thread, message, mixtabs);
#else
		groestl256_perm_P(thread, state, NULL);
		state[15] ^= 0x10000;
		groestl256_perm_Q(thread, message, NULL);	/* was perm_P: wrong permutation */
#endif
		#pragma unroll 16
		for (int u = 0; u<16; u++) state[u] ^= message[u];
		#pragma unroll 16
		for (int u = 0; u<16; u++) message[u] = state[u];
		/* output transform P(x) ^ x — only the tail words are needed */
#if USE_SHARED
		groestl256_perm_P(thread, message, mixtabs);
#else
		groestl256_perm_P(thread, message, NULL);
#endif
		state[14] ^= message[14];
		state[15] ^= message[15];
		uint32_t nonce = startNounce + thread;
		if (state[15] <= pTarget[7]) {
			nonceVector[0] = nonce;
		}
	}
}
#define texDef(texname, texmem, texsource, texsize) \
unsigned int *texmem; \
cudaMalloc(&texmem, texsize); \
cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
texname.normalized = 0; \
texname.filterMode = cudaFilterModePoint; \
texname.addressMode[0] = cudaAddressModeClamp; \
{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); } \
/* Per-GPU init: allocate + upload the 8 Groestl T tables and bind them
 * to texture references (texDef macro), then allocate the device nonce
 * slot and a pinned host mirror for the result readback.
 * NOTE(review): the texDef device buffers (d_T0up...) are local to this
 * call and never freed/unbound — leaked if init is called repeatedly. */
__host__
void groestl256_cpu_init(int thr_id, int threads)
{
	// Texturen mit obigem Makro initialisieren
	// (initialize the textures with the macro above)
	texDef(t0up2, d_T0up, T0up_cpu, sizeof(uint32_t) * 256);
	texDef(t0dn2, d_T0dn, T0dn_cpu, sizeof(uint32_t) * 256);
	texDef(t1up2, d_T1up, T1up_cpu, sizeof(uint32_t) * 256);
	texDef(t1dn2, d_T1dn, T1dn_cpu, sizeof(uint32_t) * 256);
	texDef(t2up2, d_T2up, T2up_cpu, sizeof(uint32_t) * 256);
	texDef(t2dn2, d_T2dn, T2dn_cpu, sizeof(uint32_t) * 256);
	texDef(t3up2, d_T3up, T3up_cpu, sizeof(uint32_t) * 256);
	texDef(t3dn2, d_T3dn, T3dn_cpu, sizeof(uint32_t) * 256);
	cudaMalloc(&d_GNonce[thr_id], sizeof(uint32_t));
	cudaMallocHost(&d_gnounce[thr_id], 1*sizeof(uint32_t));
}
/* Host wrapper: reset the device nonce slot to 0xffffffff, launch the
 * final-groestl kernel over `threads` nonces and read the found nonce
 * back through pinned memory. Returns 0xffffffff when no nonce matched.
 * Fix: use cudaDeviceSynchronize() — cudaThreadSynchronize() has been
 * deprecated in favor of it (identical behavior). */
__host__
uint32_t groestl256_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
{
	uint32_t result = 0xffffffff;
	cudaMemset(d_GNonce[thr_id], 0xff, sizeof(uint32_t));
	const int threadsperblock = 256;
	// berechne wie viele Thread Blocks wir brauchen
	// (compute how many thread blocks we need)
	dim3 grid((threads + threadsperblock-1)/threadsperblock);
	dim3 block(threadsperblock);
#if USE_SHARED
	/* 8 tables x 256 entries staged into shared memory by the kernel */
	size_t shared_size = 8 * 256 * sizeof(uint32_t);
#else
	size_t shared_size = 0;
#endif
	groestl256_gpu_hash32<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash, d_GNonce[thr_id]);
	MyStreamSynchronize(NULL, order, thr_id);
	cudaMemcpy(d_gnounce[thr_id], d_GNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
	cudaDeviceSynchronize();
	result = *d_gnounce[thr_id];
	return result;
}
/* Upload the 8-word (256-bit) share target into __constant__ memory;
 * the kernel only compares against pTarget[7] (the most significant word). */
__host__
void groestl256_setTarget(const void *pTargetIn)
{
	cudaMemcpyToSymbol(pTarget, pTargetIn, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
}

174
keccak/cuda_keccak256.cu → Algo256/cuda_keccak256.cu

@ -27,11 +27,81 @@ uint32_t *d_KNonce[8]; @@ -27,11 +27,81 @@ uint32_t *d_KNonce[8];
__constant__ uint32_t pTarget[8];
__constant__ uint64_t keccak_round_constants[24];
__constant__ uint64_t c_PaddedMessage80[10]; // padded message (80 bytes + padding)
__constant__ uint64_t c_PaddedMessage80[10]; // padded message (80 bytes + padding?)
#if __CUDA_ARCH__ >= 350
__device__ __forceinline__
static void keccak_blockv35(uint2 *s, const uint64_t *keccak_round_constants)
{
size_t i;
uint2 t[5], u[5], v, w;
#pragma unroll
for (i = 0; i < 24; i++) {
/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
u[0] = t[4] ^ ROL2(t[1], 1);
u[1] = t[0] ^ ROL2(t[2], 1);
u[2] = t[1] ^ ROL2(t[3], 1);
u[3] = t[2] ^ ROL2(t[4], 1);
u[4] = t[3] ^ ROL2(t[0], 1);
/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0];
s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1];
s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2];
s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3];
s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4];
/* rho pi: b[..] = rotl(a[..], ..) */
v = s[1];
s[1] = ROL2(s[6], 44);
s[6] = ROL2(s[9], 20);
s[9] = ROL2(s[22], 61);
s[22] = ROL2(s[14], 39);
s[14] = ROL2(s[20], 18);
s[20] = ROL2(s[2], 62);
s[2] = ROL2(s[12], 43);
s[12] = ROL2(s[13], 25);
s[13] = ROL2(s[19], 8);
s[19] = ROL2(s[23], 56);
s[23] = ROL2(s[15], 41);
s[15] = ROL2(s[4], 27);
s[4] = ROL2(s[24], 14);
s[24] = ROL2(s[21], 2);
s[21] = ROL2(s[8], 55);
s[8] = ROL2(s[16], 45);
s[16] = ROL2(s[5], 36);
s[5] = ROL2(s[3], 28);
s[3] = ROL2(s[18], 21);
s[18] = ROL2(s[17], 15);
s[17] = ROL2(s[11], 10);
s[11] = ROL2(s[7], 6);
s[7] = ROL2(s[10], 3);
s[10] = ROL2(v, 1);
/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
v = s[0]; w = s[1]; s[0] ^= (~w) & s[2]; s[1] ^= (~s[2]) & s[3]; s[2] ^= (~s[3]) & s[4]; s[3] ^= (~s[4]) & v; s[4] ^= (~v) & w;
v = s[5]; w = s[6]; s[5] ^= (~w) & s[7]; s[6] ^= (~s[7]) & s[8]; s[7] ^= (~s[8]) & s[9]; s[8] ^= (~s[9]) & v; s[9] ^= (~v) & w;
v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w;
v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w;
v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w;
static __device__ __forceinline__
void keccak_block(uint64_t *s, const uint64_t *keccak_round_constants) {
/* iota: a[0,0] ^= round constant */
s[0] ^= vectorize(keccak_round_constants[i]);
}
}
#endif
__device__ __forceinline__
static void keccak_blockv30(uint64_t *s, const uint64_t *keccak_round_constants)
{
size_t i;
uint64_t t[5], u[5], v, w;
@ -109,14 +179,16 @@ void keccak256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash, @@ -109,14 +179,16 @@ void keccak256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash,
//#pragma unroll 25
for (int i=0; i<25; i++) {
if(i<9) {keccak_gpu_state[i] = c_PaddedMessage80[i];}
else {keccak_gpu_state[i] = 0;}
if (i < 9)
keccak_gpu_state[i] = c_PaddedMessage80[i];
else
keccak_gpu_state[i] = 0;
}
keccak_gpu_state[9]=REPLACE_HIWORD(c_PaddedMessage80[9],cuda_swab32(nounce));
keccak_gpu_state[10]=0x0000000000000001;
keccak_gpu_state[16]=0x8000000000000000;
keccak_gpu_state[9] = REPLACE_HIWORD(c_PaddedMessage80[9], cuda_swab32(nounce));
keccak_gpu_state[10] = 0x0000000000000001;
keccak_gpu_state[16] = 0x8000000000000000;
keccak_block(keccak_gpu_state,keccak_round_constants);
keccak_blockv30(keccak_gpu_state, keccak_round_constants);
bool rc = false;
if (keccak_gpu_state[3] <= ((uint64_t*)pTarget)[3]) {rc = true;}
@ -125,18 +197,7 @@ void keccak256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash, @@ -125,18 +197,7 @@ void keccak256_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash,
if(resNounce[0] > nounce)
resNounce[0] = nounce;
}
} //thread
}
void keccak256_cpu_init(int thr_id, int threads)
{
CUDA_SAFE_CALL(cudaMemcpyToSymbol(keccak_round_constants,
host_keccak_round_constants,
sizeof(host_keccak_round_constants),
0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMalloc(&d_KNonce[thr_id], sizeof(uint32_t)));
CUDA_SAFE_CALL(cudaMallocHost(&d_nounce[thr_id], 1*sizeof(uint32_t)));
}
}
__host__
@ -161,6 +222,66 @@ uint32_t keccak256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, ui @@ -161,6 +222,66 @@ uint32_t keccak256_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, ui
return result;
}
#ifdef _MSC_VER
#define UINT2(a, b) { a, b }
#else
#define UINT2(a, b) (uint2) { a, b }
#endif
__global__ __launch_bounds__(256,3)
void keccak256_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHash)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
#if __CUDA_ARCH__ >= 350 /* tpr: to double check if faster on SM5+ */
uint2 keccak_gpu_state[25];
#pragma unroll 25
for (int i = 0; i<25; i++) {
if (i < 4)
keccak_gpu_state[i] = vectorize(outputHash[i*threads+thread]);
else
keccak_gpu_state[i] = UINT2(0, 0);
}
keccak_gpu_state[4] = UINT2(1, 0);
keccak_gpu_state[16] = UINT2(0, 0x80000000);
keccak_blockv35(keccak_gpu_state, keccak_round_constants);
#pragma unroll 4
for (int i=0; i<4;i++)
outputHash[i*threads+thread]=devectorize(keccak_gpu_state[i]);
#else
uint64_t keccak_gpu_state[25];
#pragma unroll 25
for (int i = 0; i<25; i++) {
if (i<4)
keccak_gpu_state[i] = outputHash[i*threads+thread];
else
keccak_gpu_state[i] = 0;
}
keccak_gpu_state[4] = 0x0000000000000001;
keccak_gpu_state[16] = 0x8000000000000000;
keccak_blockv30(keccak_gpu_state, keccak_round_constants);
#pragma unroll 4
for (int i = 0; i<4; i++)
outputHash[i*threads + thread] = keccak_gpu_state[i];
#endif
}
}
__host__
void keccak256_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
{
const int threadsperblock = 256;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
keccak256_gpu_hash_32 <<<grid, block>>> (threads, startNounce, d_outputHash);
MyStreamSynchronize(NULL, order, thr_id);
}
__host__
void keccak256_setBlock_80(void *pdata,const void *pTargetIn)
{
@ -168,4 +289,13 @@ void keccak256_setBlock_80(void *pdata,const void *pTargetIn) @@ -168,4 +289,13 @@ void keccak256_setBlock_80(void *pdata,const void *pTargetIn)
memcpy(PaddedMessage, pdata, 80);
CUDA_SAFE_CALL(cudaMemcpyToSymbol(pTarget, pTargetIn, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 10*sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
}
}
__host__
void keccak256_cpu_init(int thr_id, int threads)
{
CUDA_SAFE_CALL(cudaMemcpyToSymbol(keccak_round_constants, host_keccak_round_constants,
sizeof(host_keccak_round_constants), 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMalloc(&d_KNonce[thr_id], sizeof(uint32_t)));
CUDA_SAFE_CALL(cudaMallocHost(&d_nounce[thr_id], 1*sizeof(uint32_t)));
}

196
Algo256/cuda_skein256.cu

@ -0,0 +1,196 @@ @@ -0,0 +1,196 @@
#include <memory.h>
#include "cuda_helper.h"
#if 0
static __constant__ uint64_t SKEIN_IV512_256[8] = {
0xCCD044A12FDB3E13, 0xE83590301A79A9EB,
0x55AEA0614F816E6F, 0x2A2767A4AE9B94DB,
0xEC06025E74DD7683, 0xE7A436CDC4746251,
0xC36FBAF9393AD185, 0x3EEDBA1833EDFC13
};
#endif
static __constant__ uint2 vSKEIN_IV512_256[8] = {
{ 0x2FDB3E13, 0xCCD044A1 },
{ 0x1A79A9EB, 0xE8359030 },
{ 0x4F816E6F, 0x55AEA061 },
{ 0xAE9B94DB, 0x2A2767A4 },
{ 0x74DD7683, 0xEC06025E },
{ 0xC4746251, 0xE7A436CD },
{ 0x393AD185, 0xC36FBAF9 },
{ 0x33EDFC13, 0x3EEDBA18 }
};
static __constant__ int ROT256[8][4] =
{
46,36, 19, 37,
33,27, 14, 42,
17,49, 36, 39,
44, 9, 54, 56,
39,30, 34, 24,
13,50, 10, 17,
25,29, 39, 43,
8, 35, 56, 22,
};
static __constant__ uint2 skein_ks_parity = { 0xA9FC1A22,0x1BD11BDA};
static __constant__ uint2 t12[6] = {
{ 0x20, 0 },
{ 0, 0xf0000000 },
{ 0x20, 0xf0000000 },
{ 0x08, 0 },
{ 0, 0xff000000 },
{ 0x08, 0xff000000 }
};
#if 0
static __constant__ uint64_t t12_30[6] = {
0x20,
0xf000000000000000,
0xf000000000000020,
0x08,
0xff00000000000000,
0xff00000000000008
};
#endif
/* One Skein-512 MIX layer on 4 word pairs: add, rotate by the ROT256
 * table entry for this layer, xor. ROT selects the rotation row (0..7). */
static __forceinline__ __device__
void Round512v35(uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, int ROT)
{
	p0 += p1; p1 = ROL2(p1, ROT256[ROT][0]); p1 ^= p0;
	p2 += p3; p3 = ROL2(p3, ROT256[ROT][1]); p3 ^= p2;
	p4 += p5; p5 = ROL2(p5, ROT256[ROT][2]); p5 ^= p4;
	p6 += p7; p7 = ROL2(p7, ROT256[ROT][3]); p7 ^= p6;
}
/* Eight Skein-512 rounds (two key-injection groups of four MIX layers,
 * with the permutation baked into the argument order) plus the two
 * subkey injections. ks is the 9-word extended key (rotated mod 9),
 * ts the 3-word extended tweak (rotated mod 3), R the subkey counter
 * added into the last word of each subkey per the Skein schedule. */
static __forceinline__ __device__
void Round_8_512v35(uint2 *ks, uint2 *ts,
	uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3,
	uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, int R)
{
	Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 0);
	Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 1);
	Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 2);
	Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 3);
	p0 += ks[((R)+0) % 9]; /* inject the key schedule value */
	p1 += ks[((R)+1) % 9];
	p2 += ks[((R)+2) % 9];
	p3 += ks[((R)+3) % 9];
	p4 += ks[((R)+4) % 9];
	p5 += ks[((R)+5) % 9] + ts[((R)+0) % 3];
	p6 += ks[((R)+6) % 9] + ts[((R)+1) % 3];
	p7 += ks[((R)+7) % 9] + make_uint2((R),0);
	Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 4);
	Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 5);
	Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 6);
	Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 7);
	p0 += ks[((R)+1) % 9]; /* inject the key schedule value */
	p1 += ks[((R)+2) % 9];
	p2 += ks[((R)+3) % 9];
	p3 += ks[((R)+4) % 9];
	p4 += ks[((R)+5) % 9];
	p5 += ks[((R)+6) % 9] + ts[((R)+1) % 3];
	p6 += ks[((R)+7) % 9] + ts[((R)+2) % 3];
	p7 += ks[((R)+8) % 9] + make_uint2((R)+1, 0);
}
/* Skein-256 over a 32-byte input (one thread per nonce), computed with
 * the Skein-512 compression function and the IV for 256-bit output:
 * a first UBI block over the 32-byte message (tweak words t12[0..2]),
 * feedforward, then the output/finalization block (tweak t12[3..5]).
 * Input/output use column-major layout: outputHash[k*threads + thread],
 * 4 x 64-bit words per hash, overwritten in place.
 * startNounce is unused here (the hash does not depend on it directly). */
__global__ __launch_bounds__(256,3)
void skein256_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHash)
{
	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		uint2 h[9];
		uint2 t[3];
		uint2 dt0,dt1,dt2,dt3;
		uint2 p0, p1, p2, p3, p4, p5, p6, p7;
		/* extended key: IV words plus the parity word (xor of all + C240) */
		h[8] = skein_ks_parity;
		for (int i = 0; i<8; i++) {
			h[i] = vSKEIN_IV512_256[i];
			h[8] ^= h[i];
		}
		/* tweak for the message block (length 32, first+final flags) */
		t[0]=t12[0];
		t[1]=t12[1];
		t[2]=t12[2];
		/* load the 4 input words (column-major) */
		LOHI(dt0.x,dt0.y,outputHash[thread]);
		LOHI(dt1.x,dt1.y,outputHash[threads+thread]);
		LOHI(dt2.x,dt2.y,outputHash[2*threads+thread]);
		LOHI(dt3.x,dt3.y,outputHash[3*threads+thread]);
		/* first subkey injection folded into the initial state */
		p0 = h[0] + dt0;
		p1 = h[1] + dt1;
		p2 = h[2] + dt2;
		p3 = h[3] + dt3;
		p4 = h[4];
		p5 = h[5] + t[0];
		p6 = h[6] + t[1];
		p7 = h[7];
		/* 9 x 8 = 72 Threefish rounds (odd subkey indices 1,3,...,17) */
		#pragma unroll
		for (int i = 1; i<19; i+=2) {
			Round_8_512v35(h,t,p0,p1,p2,p3,p4,p5,p6,p7,i);
		}
		/* UBI feedforward: chain = E(msg) ^ msg */
		p0 ^= dt0;
		p1 ^= dt1;
		p2 ^= dt2;
		p3 ^= dt3;
		/* rebuild the extended key from the new chaining value */
		h[0] = p0;
		h[1] = p1;
		h[2] = p2;
		h[3] = p3;
		h[4] = p4;
		h[5] = p5;
		h[6] = p6;
		h[7] = p7;
		h[8] = skein_ks_parity;
		#pragma unroll 8
		for (int i = 0; i<8; i++) {
			h[8] ^= h[i];
		}
		/* tweak for the output block; message is all-zero so the initial
		 * injection only adjusts p5/p6 for the tweak change */
		t[0] = t12[3];
		t[1] = t12[4];
		t[2] = t12[5];
		p5 += t[0]; //p5 already equal h[5]
		p6 += t[1];
		#pragma unroll
		for (int i = 1; i<19; i+=2) {
			Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, i);
		}
		/* truncate to 256 bits and store back (column-major) */
		outputHash[thread] = devectorize(p0);
		outputHash[threads+thread] = devectorize(p1);
		outputHash[2*threads+thread] = devectorize(p2);
		outputHash[3*threads+thread] = devectorize(p3);
	}
}
/* No per-GPU state to set up for the skein-256 kernel; this hook exists
 * only for API symmetry with the other *_cpu_init functions. */
__host__
void skein256_cpu_init(int thr_id, int threads)
{
	(void)thr_id;
	(void)threads;
}
/* Host wrapper: run skein256_gpu_hash_32 over `threads` hashes stored in
 * d_outputHash (in place), then wait on the per-thread stream. */
__host__
void skein256_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
{
	const int TPB = 256;
	const dim3 block(TPB);
	const dim3 grid((threads + TPB - 1) / TPB);   /* ceil(threads / TPB) */

	skein256_gpu_hash_32<<<grid, block>>>(threads, startNounce, d_outputHash);
	MyStreamSynchronize(NULL, order, thr_id);
}

0
keccak/keccak256.cu → Algo256/keccak256.cu

15
Makefile.am

@ -10,11 +10,11 @@ EXTRA_DIST = autogen.sh README.txt LICENSE.txt \ @@ -10,11 +10,11 @@ EXTRA_DIST = autogen.sh README.txt LICENSE.txt \
cudaminer.sln cudaminer.vcxproj cudaminer.vcxproj.filters \
compat/gettimeofday.c compat/getopt/getopt_long.c cpuminer-config.h.in
SUBDIRS = compat
SUBDIRS = compat
bin_PROGRAMS = ccminer
bin_PROGRAMS = ccminer
ccminer_SOURCES = elist.h miner.h compat.h \
ccminer_SOURCES = elist.h miner.h compat.h \
compat/inttypes.h compat/stdbool.h compat/unistd.h \
compat/sys/time.h compat/getopt/getopt.h \
crc32.c hefty1.c scrypt.c \
@ -27,17 +27,20 @@ ccminer_SOURCES = elist.h miner.h compat.h \ @@ -27,17 +27,20 @@ ccminer_SOURCES = elist.h miner.h compat.h \
heavy/cuda_hefty1.cu heavy/cuda_hefty1.h \
heavy/cuda_keccak512.cu heavy/cuda_keccak512.h \
heavy/cuda_sha256.cu heavy/cuda_sha256.h \
keccak/cuda_keccak256.cu keccak/keccak256.cu \
fuguecoin.cpp cuda_fugue256.cu sph/fugue.c sph/sph_fugue.h uint256.h \
fuguecoin.cpp Algo256/cuda_fugue256.cu sph/fugue.c uint256.h \
groestlcoin.cpp cuda_groestlcoin.cu cuda_groestlcoin.h \
myriadgroestl.cpp cuda_myriadgroestl.cu \
lyra2/Lyra2.c lyra2/Sponge.c \
lyra2/lyra2RE.cu lyra2/cuda_lyra2.cu \
Algo256/cuda_blake256.cu Algo256/cuda_groestl256.cu Algo256/cuda_keccak256.cu Algo256/cuda_skein256.cu \
Algo256/blake256.cu Algo256/keccak256.cu \
JHA/jackpotcoin.cu JHA/cuda_jha_keccak512.cu \
JHA/cuda_jha_compactionTest.cu cuda_checkhash.cu \
quark/cuda_jh512.cu quark/cuda_quark_blake512.cu quark/cuda_quark_groestl512.cu quark/cuda_skein512.cu \
quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu \
quark/quarkcoin.cu quark/animecoin.cu \
quark/cuda_quark_compactionTest.cu \
cuda_nist5.cu blake32.cu pentablake.cu \
cuda_nist5.cu pentablake.cu \
sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \
sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \

10
README.txt

@ -1,5 +1,5 @@ @@ -1,5 +1,5 @@
ccMiner release 1.5.0-tpruvot (27 Nov 2014) - "Extra nonce"
ccMiner release 1.5.1-tpruvot (16 Dec 2014) - "Vertcoin Lyra2"
---------------------------------------------------------------
***************************************************************
@ -38,6 +38,7 @@ Keccak (Maxcoin) @@ -38,6 +38,7 @@ Keccak (Maxcoin)
Deep, Doom and Qubit
Pentablake (Blake 512 x5)
S3 (OneCoin)
Lyra2RE (new VertCoin algo)
where some of these coins have a VERY NOTABLE nVidia advantage
over competing AMD (OpenCL Only) implementations.
@ -68,6 +69,7 @@ its command line interface and options. @@ -68,6 +69,7 @@ its command line interface and options.
jackpot use to mine Jackpotcoin
keccak use to mine Maxcoin
luffa use to mine Doomcoin
lyra2 use to mine Vertcoin
mjollnir use to mine Mjollnircoin
myr-gr use to mine Myriad-Groestl
nist5 use to mine TalkCoin
@ -169,6 +171,12 @@ features. @@ -169,6 +171,12 @@ features.
>>> RELEASE HISTORY <<<
Dec. 2014 v1.5.1 (not released yet!)
Add lyra2 algo for Vertcoin (Release is 16 Dec 2014)
Multiple shares support (2 for the moment)
X11 optimisations (From klaust and sp-hash)
HTML5 WebSocket api compatibility (see api/websocket.htm)
Nov. 27th 2014 v1.5.0
Upgrade compat jansson to 2.6 (for windows)
Add pool mining.set_extranonce support

8
ccminer.cpp

@ -138,6 +138,7 @@ enum sha_algos { @@ -138,6 +138,7 @@ enum sha_algos {
ALGO_KECCAK,
ALGO_JACKPOT,
ALGO_LUFFA_DOOM,
ALGO_LYRA,
ALGO_MJOLLNIR, /* Hefty hash */
ALGO_MYR_GR,
ALGO_NIST5,
@ -167,6 +168,7 @@ static const char *algo_names[] = { @@ -167,6 +168,7 @@ static const char *algo_names[] = {
"keccak",
"jackpot",
"luffa",
"lyra2",
"mjollnir",
"myr-gr",
"nist5",
@ -272,6 +274,7 @@ Options:\n\ @@ -272,6 +274,7 @@ Options:\n\
jackpot Jackpot\n\
keccak Keccak-256 (Maxcoin)\n\
luffa Doomcoin\n\
lyra2 VertCoin\n\
mjollnir Mjollnircoin\n\
myr-gr Myriad-Groestl\n\
nist5 NIST5 (TalkCoin)\n\
@ -1255,6 +1258,11 @@ static void *miner_thread(void *userdata) @@ -1255,6 +1258,11 @@ static void *miner_thread(void *userdata)
max_nonce, &hashes_done);
break;
case ALGO_LYRA:
rc = scanhash_lyra(thr_id, work.data, work.target,
max_nonce, &hashes_done);
break;
case ALGO_NIST5:
rc = scanhash_nist5(thr_id, work.data, work.target,
max_nonce, &hashes_done);

24
ccminer.vcxproj

@ -105,7 +105,7 @@ @@ -105,7 +105,7 @@
<MaxRegCount>80</MaxRegCount>
<PtxAsOptionV>true</PtxAsOptionV>
<Keep>false</Keep>
<CodeGeneration>compute_50,sm_50</CodeGeneration>
<CodeGeneration>compute_30,sm_30;compute_50,sm_50</CodeGeneration>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
@ -173,7 +173,7 @@ @@ -173,7 +173,7 @@
<MaxRegCount>80</MaxRegCount>
<PtxAsOptionV>true</PtxAsOptionV>
<Keep>false</Keep>
<CodeGeneration>compute_30,sm_30;compute_50,sm_50;</CodeGeneration>
<CodeGeneration>compute_50,sm_50;</CodeGeneration>
<AdditionalOptions>--ptxas-options="-O2" %(AdditionalOptions)</AdditionalOptions>
<Defines>
</Defines>
@ -257,6 +257,8 @@ @@ -257,6 +257,8 @@
<Optimization Condition="'$(Configuration)'=='Release'">Full</Optimization>
<AdditionalOptions>/Tp %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<ClCompile Include="lyra2\Lyra2.c" />
<ClCompile Include="lyra2\Sponge.c" />
<ClCompile Include="sph\aes_helper.c" />
<ClCompile Include="sph\blake.c" />
<ClCompile Include="sph\bmw.c" />
@ -330,13 +332,15 @@ @@ -330,13 +332,15 @@
<ClInclude Include="sph\sph_whirlpool.h" />
<ClInclude Include="uint256.h" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="lyra2\Lyra2.h" />
<ClInclude Include="lyra2\Sponge.h" />
</ItemGroup>
<ItemGroup>
<CudaCompile Include="cuda.cpp" />
<CudaCompile Include="bitslice_transformations_quad.cu">
<ExcludedFromBuild>true</ExcludedFromBuild>
</CudaCompile>
<CudaCompile Include="cuda_fugue256.cu">
</CudaCompile>
<CudaCompile Include="cuda_groestlcoin.cu">
</CudaCompile>
<CudaCompile Include="cuda_myriadgroestl.cu">
@ -369,15 +373,19 @@ @@ -369,15 +373,19 @@
</CudaCompile>
<CudaCompile Include="JHA\jackpotcoin.cu">
</CudaCompile>
<CudaCompile Include="blake32.cu">
<CudaCompile Include="Algo256\blake256.cu">
<MaxRegCount>64</MaxRegCount>
<AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options="-dlcm=cg" %(AdditionalOptions)</AdditionalOptions>
<FastMath>true</FastMath>
</CudaCompile>
<CudaCompile Include="keccak\cuda_keccak256.cu">
<CudaCompile Include="Algo256\keccak256.cu" />
<CudaCompile Include="Algo256\cuda_blake256.cu" />
<CudaCompile Include="Algo256\cuda_fugue256.cu" />
<CudaCompile Include="Algo256\cuda_groestl256.cu" />
<CudaCompile Include="Algo256\cuda_keccak256.cu">
<MaxRegCount>92</MaxRegCount>
</CudaCompile>
<CudaCompile Include="keccak\keccak256.cu" />
<CudaCompile Include="Algo256\cuda_skein256.cu" />
<CudaCompile Include="pentablake.cu">
<MaxRegCount>80</MaxRegCount>
<AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options="-dlcm=cg" %(AdditionalOptions)</AdditionalOptions>
@ -418,6 +426,8 @@ @@ -418,6 +426,8 @@
</CudaCompile>
<CudaCompile Include="qubit\qubit_luffa512.cu">
</CudaCompile>
<CudaCompile Include="lyra2\lyra2RE.cu" />
<CudaCompile Include="lyra2\cuda_lyra2.cu" />
<CudaCompile Include="x11\cuda_x11_aes.cu">
<ExcludedFromBuild>true</ExcludedFromBuild>
</CudaCompile>

58
ccminer.vcxproj.filters

@ -61,12 +61,15 @@ @@ -61,12 +61,15 @@
<Filter Include="Source Files\jansson">
<UniqueIdentifier>{17b56151-79ec-4a32-bac3-9d94ae7f68fe}</UniqueIdentifier>
</Filter>
<Filter Include="Source Files\CUDA\keccak">
<UniqueIdentifier>{9762c92c-9677-4044-8292-ff6ba4bfdd89}</UniqueIdentifier>
</Filter>
<Filter Include="Header Files\compat\nvapi">
<UniqueIdentifier>{ef6f9983-bda5-4fb2-adfa-ac4f29b74f25}</UniqueIdentifier>
</Filter>
<Filter Include="Source Files\CUDA\Algo256">
<UniqueIdentifier>{9762c92c-9677-4044-8292-ff6ba4bfdd89}</UniqueIdentifier>
</Filter>
<Filter Include="Header Files\lyra2">
<UniqueIdentifier>{2ff6e4ce-7c92-4cb2-a3ad-c331e94fd81d}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="compat\jansson\dump.c">
@ -213,6 +216,12 @@ @@ -213,6 +216,12 @@
<ClCompile Include="compat\jansson\error.c">
<Filter>Source Files\jansson</Filter>
</ClCompile>
<ClCompile Include="lyra2\Lyra2.c">
<Filter>Source Files\sph</Filter>
</ClCompile>
<ClCompile Include="lyra2\Sponge.c">
<Filter>Source Files\sph</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="compat.h">
@ -347,14 +356,17 @@ @@ -347,14 +356,17 @@
<ClInclude Include="compat\jansson\jansson_config.h">
<Filter>Header Files\compat</Filter>
</ClInclude>
<ClInclude Include="lyra2\Lyra2.h">
<Filter>Header Files\lyra2</Filter>
</ClInclude>
<ClInclude Include="lyra2\Sponge.h">
<Filter>Header Files\lyra2</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<CudaCompile Include="cuda.cpp">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="cuda_fugue256.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="cuda_groestlcoin.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
@ -505,20 +517,38 @@ @@ -505,20 +517,38 @@
<CudaCompile Include="x17\x17.cu">
<Filter>Source Files\CUDA\x17</Filter>
</CudaCompile>
<CudaCompile Include="blake32.cu">
<CudaCompile Include="pentablake.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="x11\s3.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
<CudaCompile Include="Algo256\blake256.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="pentablake.cu">
<CudaCompile Include="Algo256\keccak256.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="keccak\cuda_keccak256.cu">
<Filter>Source Files\CUDA\keccak</Filter>
<CudaCompile Include="Algo256\cuda_blake256.cu">
<Filter>Source Files\CUDA\Algo256</Filter>
</CudaCompile>
<CudaCompile Include="keccak\keccak256.cu">
<Filter>Source Files\CUDA\keccak</Filter>
<CudaCompile Include="Algo256\cuda_fugue256.cu">
<Filter>Source Files\CUDA\Algo256</Filter>
</CudaCompile>
<CudaCompile Include="x11\s3.cu">
<Filter>Source Files\CUDA\x11</Filter>
<CudaCompile Include="Algo256\cuda_groestl256.cu">
<Filter>Source Files\CUDA\Algo256</Filter>
</CudaCompile>
<CudaCompile Include="Algo256\cuda_keccak256.cu">
<Filter>Source Files\CUDA\Algo256</Filter>
</CudaCompile>
<CudaCompile Include="Algo256\cuda_skein256.cu">
<Filter>Source Files\CUDA\Algo256</Filter>
</CudaCompile>
<CudaCompile Include="lyra2\cuda_lyra2.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="lyra2\lyra2RE.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
</ItemGroup>
</Project>

97
cuda_helper.h

@ -355,7 +355,7 @@ uint64_t ROTL64(const uint64_t x, const int offset) @@ -355,7 +355,7 @@ uint64_t ROTL64(const uint64_t x, const int offset)
"setp.lt.u32 p, %2, 32;\n\t"
"@!p mov.b64 %0, {vl,vh};\n\t"
"@p mov.b64 %0, {vh,vl};\n\t"
"}"
"}"
: "=l"(res) : "l"(x) , "r"(offset)
);
return res;
@ -378,4 +378,99 @@ uint64_t SWAPDWORDS(uint64_t value) @@ -378,4 +378,99 @@ uint64_t SWAPDWORDS(uint64_t value)
#endif
}
/* lyra2 - int2 operators */

/* Unpack a 64-bit value into its low (lo) and high (hi) 32-bit words
 * using a single PTX mov.b64 with a register-pair destination. */
__device__ __forceinline__
void LOHI(uint32_t &lo, uint32_t &hi, uint64_t x) {
	asm("mov.b64 {%0,%1},%2; \n\t"
		: "=r"(lo), "=r"(hi) : "l"(x));
}
/* Pack a uint2 (x = low word, y = high word) back into a uint64_t. */
static __device__ __forceinline__ uint64_t devectorize(uint2 v) { return MAKE_ULONGLONG(v.x, v.y); }

/* Split a uint64_t into a uint2 (result.x = low word, result.y = high word). */
static __device__ __forceinline__ uint2 vectorize(uint64_t v) {
	uint2 result;
	LOHI(result.x, result.y, v);
	return result;
}
/* Componentwise logical operators on uint2-represented 64-bit values
 * (bitwise ops act independently on each 32-bit half, so this is exact). */
static __device__ __forceinline__ uint2 operator^ (uint2 a, uint2 b) { return make_uint2(a.x ^ b.x, a.y ^ b.y); }
static __device__ __forceinline__ uint2 operator& (uint2 a, uint2 b) { return make_uint2(a.x & b.x, a.y & b.y); }
static __device__ __forceinline__ uint2 operator| (uint2 a, uint2 b) { return make_uint2(a.x | b.x, a.y | b.y); }
static __device__ __forceinline__ uint2 operator~ (uint2 a) { return make_uint2(~a.x, ~a.y); }
static __device__ __forceinline__ void operator^= (uint2 &a, uint2 b) { a = a ^ b; }
/* 64-bit addition of two uint2-represented values: add the low words with
 * carry-out (add.cc), then the high words with carry-in (addc). */
static __device__ __forceinline__ uint2 operator+ (uint2 a, uint2 b)
{
	uint2 result;
	asm("{\n\t"
		"add.cc.u32 %0,%2,%4; \n\t"
		"addc.u32 %1,%3,%5; \n\t"
		"}\n\t"
		: "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
	return result;
}
static __device__ __forceinline__ void operator+= (uint2 &a, uint2 b) { a = a + b; }
/**
 * 64-bit multiplication, low half only — equivalent to PTX mul.lo.b64,
 * i.e. what the uint64_t "*" operator does (bits above 63 discarded):
 *
 *   result.x = lo32(a.x * b.x)
 *   result.y = hi32(a.x * b.x) + lo32(a.y * b.x) + lo32(a.x * b.y)
 *
 * Fix: the previous version computed the last term as a.y * b.y and folded
 * in a carry bit; neither matches mul.lo.b64 semantics. Plain mad.lo.u32
 * (wrapping mod 2^32, no carry chain) gives the exact result.
 */
static __device__ __forceinline__ uint2 operator* (uint2 a, uint2 b)
{
	uint2 result;
	asm("{\n\t"
		"mul.lo.u32 %0,%2,%4; \n\t"
		"mul.hi.u32 %1,%2,%4; \n\t"
		"mad.lo.u32 %1,%3,%4,%1; \n\t"
		"mad.lo.u32 %1,%2,%5,%1; \n\t"
		"}\n\t"
		: "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y));
	return result;
}
// uint2 method: 64-bit rotate right of a uint2-represented value
#if __CUDA_ARCH__ >= 350
/* SM 3.5+: funnel shift (shf.r.wrap) — two instructions per rotation. */
__device__ __inline__ uint2 ROR2(const uint2 a, const int offset) {
	uint2 result;
	if (offset < 32) {
		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
	}
	else {
		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
		asm("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
	}
	return result;
}
#else
/* Software fallback for pre-SM3.5 GPUs.
 * Fix: the previous fallback rotated each 32-bit half independently by n,
 * which is not a 64-bit rotation and disagreed with the shf.r.wrap path
 * above (wrong hashes on older cards). Valid for 0 < n < 64. */
__device__ __inline__ uint2 ROR2(const uint2 v, const int n) {
	uint2 result;
	if (n == 32) {
		result.x = v.y;
		result.y = v.x;
	}
	else if (n < 32) {
		result.x = (v.x >> n) | (v.y << (32 - n));
		result.y = (v.y >> n) | (v.x << (32 - n));
	}
	else {
		result.x = (v.y >> (n - 32)) | (v.x << (64 - n));
		result.y = (v.x >> (n - 32)) | (v.y << (64 - n));
	}
	return result;
}
#endif
// 64-bit rotate left of a uint2-represented value
#if __CUDA_ARCH__ >= 350
/* SM 3.5+: funnel shift (shf.l.wrap) — two instructions per rotation. */
__inline__ __device__ uint2 ROL2(const uint2 a, const int offset) {
	uint2 result;
	if (offset >= 32) {
		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset));
		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset));
	}
	else {
		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset));
		asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset));
	}
	return result;
}
#else
/* Software fallback for pre-SM3.5 GPUs.
 * Fix: the previous fallback rotated each 32-bit half independently by n,
 * which is not a 64-bit rotation and disagreed with the shf.l.wrap path
 * above. Valid for 0 < n < 64. */
__inline__ __device__ uint2 ROL2(const uint2 v, const int n) {
	uint2 result;
	if (n == 32) {
		result.x = v.y;
		result.y = v.x;
	}
	else if (n < 32) {
		result.x = (v.x << n) | (v.y >> (32 - n));
		result.y = (v.y << n) | (v.x >> (32 - n));
	}
	else {
		result.x = (v.y << (n - 32)) | (v.x >> (64 - n));
		result.y = (v.x << (n - 32)) | (v.y >> (64 - n));
	}
	return result;
}
#endif
#endif // #ifndef CUDA_HELPER_H

211
lyra2/Lyra2.c

@ -0,0 +1,211 @@ @@ -0,0 +1,211 @@
/**
* Implementation of the Lyra2 Password Hashing Scheme (PHS).
*
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
*
* This software is hereby placed in the public domain.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include "Lyra2.h"
#include "Sponge.h"
/**
 * Executes Lyra2 based on the G function from Blake2b. This version supports salts
 * and passwords whose combined length is smaller than the size of the memory matrix
 * (i.e., (nRows x nCols x b) bits, where "b" is the underlying sponge's bitrate).
 * The "basil" is composed of all integer parameters in the order they are provided,
 * plus nCols: basil = kLen || pwdlen || saltlen || timeCost || nRows || nCols.
 *
 * @param K        The derived key to be output by the algorithm
 * @param kLen     Desired key length, in bytes
 * @param pwd      User password
 * @param pwdlen   Password length, in bytes
 * @param salt     Salt
 * @param saltlen  Salt length, in bytes
 * @param timeCost Parameter to determine the processing time (T)
 * @param nRows    Number of rows of the memory matrix (R)
 * @param nCols    Number of columns of the memory matrix (C)
 *
 * @return 0 if the key is generated correctly; -1 on allocation failure
 */
int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols)
{
	//============================= Basic variables ============================//
	int64_t row = 2;    //index of row to be processed
	int64_t prev = 1;   //index of prev (last row ever computed/modified)
	int64_t rowa = 0;   //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
	int64_t tau;        //Time Loop iterator
	int64_t step = 1;   //Visitation step (used during Setup and Wandering phases)
	int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
	int64_t gap = 1;    //Modifier to the step, assuming the values 1 or -1
	int64_t i;          //auxiliary iteration counter

	//========== Initializing the Memory Matrix and pointers to it =============//
	//Tries to allocate enough space for the whole memory matrix
	i = (int64_t) ((int64_t) nRows * (int64_t) ROW_LEN_BYTES);
	uint64_t *wholeMatrix = (uint64_t*) malloc((size_t) i);
	if (wholeMatrix == NULL) {
		return -1;
	}
	memset(wholeMatrix, 0, (size_t) i);

	//Allocates pointers to each row of the matrix
	uint64_t **memMatrix = (uint64_t**) malloc((size_t) nRows * sizeof(uint64_t*));
	if (memMatrix == NULL) {
		free(wholeMatrix); //fix: this error path previously leaked the matrix
		return -1;
	}
	//Places the pointers in the correct positions
	uint64_t *ptrWord = wholeMatrix;
	for (i = 0; i < (int64_t) nRows; i++) {
		memMatrix[i] = ptrWord;
		ptrWord += ROW_LEN_INT64;
	}

	//============= Getting the password + salt + basil padded with 10*1 ===============//
	//The memory matrix temporarily holds the password: not to save memory, but to
	//ensure the locally copied password is overwritten as soon as possible
	uint64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof (uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
	byte *ptrByte = (byte*) wholeMatrix;
	memset(ptrByte, 0, (size_t) nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES);

	//Prepends the password
	memcpy(ptrByte, pwd, (size_t) pwdlen);
	ptrByte += pwdlen;

	//Concatenates the salt
	memcpy(ptrByte, salt, (size_t) saltlen);
	ptrByte += saltlen;

	//Concatenates the basil: every integer parameter, in interface order
	memcpy(ptrByte, &kLen, sizeof (uint64_t));
	ptrByte += sizeof (uint64_t);
	memcpy(ptrByte, &pwdlen, sizeof (uint64_t));
	ptrByte += sizeof (uint64_t);
	memcpy(ptrByte, &saltlen, sizeof (uint64_t));
	ptrByte += sizeof (uint64_t);
	memcpy(ptrByte, &timeCost, sizeof (uint64_t));
	ptrByte += sizeof (uint64_t);
	memcpy(ptrByte, &nRows, sizeof (uint64_t));
	ptrByte += sizeof (uint64_t);
	memcpy(ptrByte, &nCols, sizeof (uint64_t));
	ptrByte += sizeof (uint64_t);

	//Now comes the padding
	*ptrByte = 0x80; //first byte of padding: right after the password
	ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
	ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
	*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block

	//======================= Initializing the Sponge State ====================//
	//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words for the bitrate (b), the rest for the capacity (c)
	uint64_t *state = (uint64_t*) malloc(16 * sizeof (uint64_t));
	if (state == NULL) {
		free(memMatrix);   //fix: this error path previously leaked both buffers
		free(wholeMatrix);
		return -1;
	}
	initState(state);

	//================================ Setup Phase =============================//
	//Absorbing salt, password and basil: the only place where the block length is hard-coded to 512 bits
	ptrWord = wholeMatrix;
	for (i = 0; i < (int64_t) nBlocksInput; i++) {
		absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil)
		//NOTE(review): ptrWord advances by BLOCK_LEN_BLAKE2_SAFE_BYTES *uint64_t
		//words* (not _INT64). This looks like a units mix-up, but it is kept
		//byte-for-byte from the published Lyra2RE reference code: "fixing" it
		//would change every hash (consensus-critical).
		ptrWord += BLOCK_LEN_BLAKE2_SAFE_BYTES;
	}

	//Initializes M[0] and M[1]
	reducedSqueezeRow0(state, memMatrix[0]); //The locally copied password is most likely overwritten here
	reducedDuplexRow1(state, memMatrix[0], memMatrix[1]);

	do {
		//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
		reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]);

		//updates the value of row* (deterministically picked during Setup)
		rowa = (rowa + step) & (window - 1);
		//update prev: it now points to the last row ever computed
		prev = row;
		//updates row: goes to the next row to be computed
		row++;

		//Checks if all rows in the window were visited.
		if (rowa == 0) {
			step = window + gap; //changes the step: approximately doubles its value
			window *= 2;         //doubles the size of the re-visitation window
			gap = -gap;          //inverts the modifier to the step
		}
	} while (row < (int64_t) nRows);

	//============================ Wandering Phase =============================//
	row = 0; //Resets the visitation to the first row of the memory matrix
	for (tau = 1; tau <= (int64_t) timeCost; tau++) {
		//Step is approximately half the number of rows for an odd tau; otherwise, it is -1
		step = (tau % 2 == 0) ? -1 : nRows / 2 - 1;
		do {
			//Selects a pseudorandom index row* ("generic" variant: nRows need not be a power of 2)
			rowa = ((uint64_t) (state[0])) % nRows;

			//Performs a reduced-round duplexing over M[row*] XOR M[prev], updating both M[row*] and M[row]
			reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row]);

			//update prev: it now points to the last row ever computed
			prev = row;

			//updates row: goes to the next row to be computed ("generic" variant)
			row = (row + step) % nRows;
		} while (row != 0);
	}

	//============================ Wrap-up Phase ===============================//
	//Absorbs the last block of the memory matrix
	absorbBlock(state, memMatrix[rowa]);
	//Squeezes the key
	squeeze(state, K, (size_t) kLen);

	//========================= Freeing the memory =============================//
	free(memMatrix);
	free(wholeMatrix);

	//Wiping out the sponge's internal state before freeing it
	memset(state, 0, 16 * sizeof (uint64_t));
	free(state);

	return 0;
}

50
lyra2/Lyra2.h

@ -0,0 +1,50 @@ @@ -0,0 +1,50 @@
/**
* Header file for the Lyra2 Password Hashing Scheme (PHS).
*
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
*
* This software is hereby placed in the public domain.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef LYRA2_H_
#define LYRA2_H_

#include <stdint.h>

typedef unsigned char byte;

//Block length required so Blake2's Initialization Vector (IV) is not overwritten (THIS SHOULD NOT BE MODIFIED)
#define BLOCK_LEN_BLAKE2_SAFE_INT64 8 //512 bits (=64 bytes, =8 uint64_t)
#define BLOCK_LEN_BLAKE2_SAFE_BYTES (BLOCK_LEN_BLAKE2_SAFE_INT64 * 8) //same as above, in bytes

#ifdef BLOCK_LEN_BITS
#define BLOCK_LEN_INT64 (BLOCK_LEN_BITS/64) //Block length, in uint64_t words, derived from BLOCK_LEN_BITS
#define BLOCK_LEN_BYTES (BLOCK_LEN_BITS/8) //Block length, in bytes
#else //default block length: 768 bits
#define BLOCK_LEN_INT64 12 //Block length: 768 bits (=96 bytes, =12 uint64_t)
#define BLOCK_LEN_BYTES (BLOCK_LEN_INT64 * 8) //Block length, in bytes
#endif

#ifndef N_COLS
#define N_COLS 8 //Number of columns in the memory matrix: fixed to 8 by default
#endif

#define ROW_LEN_INT64 (BLOCK_LEN_INT64 * N_COLS) //Total length of a row: N_COLS blocks
#define ROW_LEN_BYTES (ROW_LEN_INT64 * 8) //Number of bytes per row

//Derives kLen bytes into K from pwd/salt; returns 0 on success, -1 on allocation failure
int LYRA2(void *K, uint64_t kLen, const void *pwd, uint64_t pwdlen, const void *salt, uint64_t saltlen, uint64_t timeCost, uint64_t nRows, uint64_t nCols);

#endif /* LYRA2_H_ */

755
lyra2/Sponge.c

@ -0,0 +1,755 @@ @@ -0,0 +1,755 @@
/**
* A simple implementation of Blake2b's internal permutation
* in the form of a sponge.
*
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
*
* This software is hereby placed in the public domain.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <string.h>
#include <stdio.h>
#include <time.h>
#include "Sponge.h"
#include "Lyra2.h"
/**
 * Initializes the sponge state: the first 512 bits (the bitrate part) are
 * zeroed and the remaining 8 words (the capacity) receive Blake2b's IV, as
 * per Blake2b's specification. Note: starting from an all-zero state would
 * hit a fixed point of Blake2b's G function (it lacks Blake2's original
 * in-round constants), so the IV in the capacity avoids that degenerate case.
 *
 * @param state The 1024-bit (16 uint64_t) array to be initialized
 */
void initState(uint64_t state[/*16*/]) {
	int i;

	//Bitrate part: first 8 words (512 bits) start at zero
	for (i = 0; i < 8; i++)
		state[i] = 0;

	//Capacity part: remaining BLOCK_LEN_BLAKE2_SAFE_BYTES receive Blake2b's IV
	for (i = 0; i < 8; i++)
		state[i + 8] = blake2b_IV[i];
}
/**
 * Execute Blake2b's G function, with all 12 rounds.
 *
 * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function
 */
__inline static void blake2bLyra(uint64_t *v) {
	//One ROUND_LYRA expansion per round (macro from Sponge.h)
	ROUND_LYRA(0);
	ROUND_LYRA(1);
	ROUND_LYRA(2);
	ROUND_LYRA(3);
	ROUND_LYRA(4);
	ROUND_LYRA(5);
	ROUND_LYRA(6);
	ROUND_LYRA(7);
	ROUND_LYRA(8);
	ROUND_LYRA(9);
	ROUND_LYRA(10);
	ROUND_LYRA(11);
}
/**
 * Executes a reduced version of Blake2b's G function with only one round.
 * Used between columns during the row-filling/duplexing operations below.
 *
 * @param v A 1024-bit (16 uint64_t) array to be processed by Blake2b's G function
 */
__inline static void reducedBlake2bLyra(uint64_t *v) {
	ROUND_LYRA(0);
}
/**
 * Squeezes "len" bytes out of the sponge, applying the full Blake2b
 * permutation between consecutive full blocks.
 *
 * @param state The current state of the sponge
 * @param out Array that will receive the data squeezed
 * @param len The number of bytes to be squeezed into the "out" array
 */
void squeeze(uint64_t *state, byte *out, unsigned int len) {
	byte *dst = out;
	unsigned int remaining = len;

	//Emit one full block at a time, permuting the state in between
	while (remaining >= BLOCK_LEN_BYTES) {
		memcpy(dst, state, BLOCK_LEN_BYTES);
		blake2bLyra(state);
		dst += BLOCK_LEN_BYTES;
		remaining -= BLOCK_LEN_BYTES;
	}

	//Partial final block (possibly zero bytes)
	memcpy(dst, state, remaining);
}
/**
 * Absorbs a single block of BLOCK_LEN_INT64 (12) uint64_t words: XORs the
 * block into the bitrate part of the state, then applies the full 12-round
 * Blake2b permutation.
 *
 * @param state The current state of the sponge
 * @param in The block to be absorbed (BLOCK_LEN_INT64 words)
 */
void absorbBlock(uint64_t *state, const uint64_t *in) {
	int j;

	//XOR the first 12 words of "in" into the state
	for (j = 0; j < 12; j++)
		state[j] ^= in[j];

	//Applies the transformation f to the sponge's state
	blake2bLyra(state);
}
/**
 * Absorbs a single block of BLOCK_LEN_BLAKE2_SAFE_INT64 (8) uint64_t words:
 * XORs the block into the first 8 state words only — words 8..15 (holding
 * Blake2b's IV after initState) are left untouched — then applies the full
 * Blake2b permutation.
 *
 * @param state The current state of the sponge
 * @param in The block to be absorbed (BLOCK_LEN_BLAKE2_SAFE_INT64 words)
 */
void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in) {
	int j;

	//XOR the first 8 words of "in" into the state
	for (j = 0; j < 8; j++)
		state[j] ^= in[j];

	//Applies the transformation f to the sponge's state
	blake2bLyra(state);
}
/**
 * Reduced squeeze of a full row, written from the highest column index down
 * to the lowest, applying the one-round Blake2b permutation between columns.
 *
 * @param state The current state of the sponge
 * @param rowOut Row to receive the data squeezed
 */
void reducedSqueezeRow0(uint64_t* state, uint64_t* rowOut) {
	//Start at M[0][C-1] and walk backwards one block (column) at a time
	uint64_t *ptrWord = rowOut + (N_COLS - 1) * BLOCK_LEN_INT64;
	int col, j;

	for (col = 0; col < N_COLS; col++) {
		//M[row][C-1-col] = H.reduced_squeeze()
		for (j = 0; j < 12; j++)
			ptrWord[j] = state[j];

		//Previous column receives the next squeezed block
		ptrWord -= BLOCK_LEN_INT64;

		//One reduced round of the permutation between columns
		reducedBlake2bLyra(state);
	}
}
/**
 * Reduced duplexing of one row into another: each column of rowIn is
 * absorbed (XORed into the state), one reduced Blake2b round is applied,
 * and "column XOR fresh state" is written to rowOut in reverse column order.
 *
 * @param state The current state of the sponge
 * @param rowIn Row to feed the sponge
 * @param rowOut Row to receive the sponge's output
 */
void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut) {
	uint64_t *in = rowIn;                                    //In Lyra2: pointer to prev
	uint64_t *out = rowOut + (N_COLS - 1) * BLOCK_LEN_INT64; //In Lyra2: pointer to row
	int col, j;

	for (col = 0; col < N_COLS; col++) {
		//Absorbing "M[prev][col]"
		for (j = 0; j < 12; j++)
			state[j] ^= in[j];

		//Applies the reduced-round transformation f to the sponge's state
		reducedBlake2bLyra(state);

		//M[row][C-1-col] = M[prev][col] XOR rand
		for (j = 0; j < 12; j++)
			out[j] = in[j] ^ state[j];

		//Input: next column (next block in sequence); output: previous column
		in += BLOCK_LEN_INT64;
		out -= BLOCK_LEN_INT64;
	}
}
/**
 * Duplexing over "M[rowInOut][col] [+] M[rowIn][col]" (wordwise addition of
 * two columns, ignoring carries between words). The output "rand" is used to
 * make "M[rowOut][(N_COLS-1)-col] = M[rowIn][col] XOR rand" and
 * "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a
 * one-word (64-bit) rotation of the 12-word block to the left.
 *
 * @param state The current state of the sponge
 * @param rowIn Row used only as input
 * @param rowInOut Row used as input and to receive output after rotation
 * @param rowOut Row receiving the output
 */
void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) {
	uint64_t *in = rowIn;                                    //In Lyra2: pointer to prev
	uint64_t *inout = rowInOut;                              //In Lyra2: pointer to row*
	uint64_t *out = rowOut + (N_COLS - 1) * BLOCK_LEN_INT64; //In Lyra2: pointer to row
	int col, j;

	for (col = 0; col < N_COLS; col++) {
		//Absorbing "M[prev] [+] M[row*]"
		for (j = 0; j < 12; j++)
			state[j] ^= (in[j] + inout[j]);

		//Applies the reduced-round transformation f to the sponge's state
		reducedBlake2bLyra(state);

		//M[row][C-1-col] = M[prev][col] XOR rand
		for (j = 0; j < 12; j++)
			out[j] = in[j] ^ state[j];

		//M[row*][col] = M[row*][col] XOR rotW(rand): word j gets state[j-1], word 0 gets state[11]
		inout[0] ^= state[11];
		for (j = 1; j < 12; j++)
			inout[j] ^= state[j - 1];

		//Inputs: next column (next block in sequence); output: previous column
		inout += BLOCK_LEN_INT64;
		in += BLOCK_LEN_INT64;
		out -= BLOCK_LEN_INT64;
	}
}
/**
* Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e.,
* the wordwise addition of two columns, ignoring carries between words). The
* output of this operation, "rand", is then used to make
* "M[rowOut][col] = M[rowOut][col] XOR rand" and
* "M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit
* rotation to the left.
*
* @param state The current state of the sponge
* @param rowIn Row used only as input
* @param rowInOut Row used as input and to receive output after rotation
* @param rowOut Row receiving the output
*
*/
/**
 * Wandering-phase duplexing: for each column, absorbs the wordwise 64-bit sum
 * "M[prev][col] [+] M[row*][col]" into the sponge, applies the reduced-round
 * Blake2b transformation f, XOR-accumulates the result into rowOut and XORs
 * rotW(rand) — the state rotated one 64-bit word to the left — into rowInOut.
 * Unlike the setup variant, all three rows are walked front-to-back.
 *
 * @param state    The current state of the sponge (updated in place)
 * @param rowIn    Row used only as input (In Lyra2: prev)
 * @param rowInOut Row used as input and updated with the rotated output (row*)
 * @param rowOut   Row whose columns are XORed with the output (row)
 */
void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) {
	uint64_t *inout = rowInOut;  /* In Lyra2: row* */
	uint64_t *in = rowIn;        /* In Lyra2: prev */
	uint64_t *out = rowOut;      /* In Lyra2: row */
	int col, w;

	for (col = 0; col < N_COLS; col++) {
		/* Absorbing "M[prev] [+] M[row*]" (12 words per block) */
		for (w = 0; w < 12; w++)
			state[w] ^= in[w] + inout[w];

		/* Applies the reduced-round transformation f to the sponge's state */
		reducedBlake2bLyra(state);

		/* M[rowOut][col] ^= rand */
		for (w = 0; w < 12; w++)
			out[w] ^= state[w];

		/* M[rowInOut][col] ^= rotW(rand): word w picks up state[(w + 11) % 12] */
		for (w = 0; w < 12; w++)
			inout[w] ^= state[(w + 11) % 12];

		/* All three pointers move on to the next block/column */
		out += BLOCK_LEN_INT64;
		inout += BLOCK_LEN_INT64;
		in += BLOCK_LEN_INT64;
	}
}
////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
/**
* Performs a duplex operation over "M[rowInOut] [+] M[rowIn]", writing the output "rand"
* on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit
* rotation to the left.
*
* @param state The current state of the sponge
* @param rowIn Row used only as input
* @param rowInOut Row used as input and to receive output after rotation
* @param rowOut Row receiving the output
*
*/
/*
inline void reducedDuplexRowSetupOLD(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) {
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row
int i;
for (i = 0; i < N_COLS; i++) {
//Absorbing "M[rowInOut] XOR M[rowIn]"
state[0] ^= ptrWordInOut[0] ^ ptrWordIn[0];
state[1] ^= ptrWordInOut[1] ^ ptrWordIn[1];
state[2] ^= ptrWordInOut[2] ^ ptrWordIn[2];
state[3] ^= ptrWordInOut[3] ^ ptrWordIn[3];
state[4] ^= ptrWordInOut[4] ^ ptrWordIn[4];
state[5] ^= ptrWordInOut[5] ^ ptrWordIn[5];
state[6] ^= ptrWordInOut[6] ^ ptrWordIn[6];
state[7] ^= ptrWordInOut[7] ^ ptrWordIn[7];
state[8] ^= ptrWordInOut[8] ^ ptrWordIn[8];
state[9] ^= ptrWordInOut[9] ^ ptrWordIn[9];
state[10] ^= ptrWordInOut[10] ^ ptrWordIn[10];
state[11] ^= ptrWordInOut[11] ^ ptrWordIn[11];
//Applies the reduced-round transformation f to the sponge's state
reducedBlake2bLyra(state);
//M[row][col] = rand
ptrWordOut[0] = state[0];
ptrWordOut[1] = state[1];
ptrWordOut[2] = state[2];
ptrWordOut[3] = state[3];
ptrWordOut[4] = state[4];
ptrWordOut[5] = state[5];
ptrWordOut[6] = state[6];
ptrWordOut[7] = state[7];
ptrWordOut[8] = state[8];
ptrWordOut[9] = state[9];
ptrWordOut[10] = state[10];
ptrWordOut[11] = state[11];
//M[row*][col] = M[row*][col] XOR rotW(rand)
ptrWordInOut[0] ^= state[10];
ptrWordInOut[1] ^= state[11];
ptrWordInOut[2] ^= state[0];
ptrWordInOut[3] ^= state[1];
ptrWordInOut[4] ^= state[2];
ptrWordInOut[5] ^= state[3];
ptrWordInOut[6] ^= state[4];
ptrWordInOut[7] ^= state[5];
ptrWordInOut[8] ^= state[6];
ptrWordInOut[9] ^= state[7];
ptrWordInOut[10] ^= state[8];
ptrWordInOut[11] ^= state[9];
//Goes to next column (i.e., next block in sequence)
ptrWordInOut += BLOCK_LEN_INT64;
ptrWordIn += BLOCK_LEN_INT64;
ptrWordOut += BLOCK_LEN_INT64;
}
}
*/
/**
* Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", writing the output "rand"
* on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit
* rotation to the left.
*
* @param state The current state of the sponge
* @param rowIn Row used only as input
* @param rowInOut Row used as input and to receive output after rotation
* @param rowOut Row receiving the output
*
*/
/*
inline void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) {
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row
int i;
for (i = 0; i < N_COLS; i++) {
//Absorbing "M[rowInOut] XOR M[rowIn]"
state[0] ^= ptrWordInOut[0] + ptrWordIn[0];
state[1] ^= ptrWordInOut[1] + ptrWordIn[1];
state[2] ^= ptrWordInOut[2] + ptrWordIn[2];
state[3] ^= ptrWordInOut[3] + ptrWordIn[3];
state[4] ^= ptrWordInOut[4] + ptrWordIn[4];
state[5] ^= ptrWordInOut[5] + ptrWordIn[5];
state[6] ^= ptrWordInOut[6] + ptrWordIn[6];
state[7] ^= ptrWordInOut[7] + ptrWordIn[7];
state[8] ^= ptrWordInOut[8] + ptrWordIn[8];
state[9] ^= ptrWordInOut[9] + ptrWordIn[9];
state[10] ^= ptrWordInOut[10] + ptrWordIn[10];
state[11] ^= ptrWordInOut[11] + ptrWordIn[11];
//Applies the reduced-round transformation f to the sponge's state
reducedBlake2bLyra(state);
//M[row*][col] = M[row*][col] XOR rotW(rand)
ptrWordInOut[0] ^= state[10];
ptrWordInOut[1] ^= state[11];
ptrWordInOut[2] ^= state[0];
ptrWordInOut[3] ^= state[1];
ptrWordInOut[4] ^= state[2];
ptrWordInOut[5] ^= state[3];
ptrWordInOut[6] ^= state[4];
ptrWordInOut[7] ^= state[5];
ptrWordInOut[8] ^= state[6];
ptrWordInOut[9] ^= state[7];
ptrWordInOut[10] ^= state[8];
ptrWordInOut[11] ^= state[9];
//M[row][col] = rand
ptrWordOut[0] = state[0] ^ ptrWordIn[0];
ptrWordOut[1] = state[1] ^ ptrWordIn[1];
ptrWordOut[2] = state[2] ^ ptrWordIn[2];
ptrWordOut[3] = state[3] ^ ptrWordIn[3];
ptrWordOut[4] = state[4] ^ ptrWordIn[4];
ptrWordOut[5] = state[5] ^ ptrWordIn[5];
ptrWordOut[6] = state[6] ^ ptrWordIn[6];
ptrWordOut[7] = state[7] ^ ptrWordIn[7];
ptrWordOut[8] = state[8] ^ ptrWordIn[8];
ptrWordOut[9] = state[9] ^ ptrWordIn[9];
ptrWordOut[10] = state[10] ^ ptrWordIn[10];
ptrWordOut[11] = state[11] ^ ptrWordIn[11];
//Goes to next column (i.e., next block in sequence)
ptrWordInOut += BLOCK_LEN_INT64;
ptrWordIn += BLOCK_LEN_INT64;
ptrWordOut += BLOCK_LEN_INT64;
}
}
*/
/**
* Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", writing the output "rand"
* on M[rowOut] and making "M[rowInOut] = M[rowInOut] XOR rotW(rand)", where rotW is a 64-bit
* rotation to the left.
*
* @param state The current state of the sponge
* @param rowIn Row used only as input
* @param rowInOut Row used as input and to receive output after rotation
* @param rowOut Row receiving the output
*
*/
/*
inline void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) {
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t* ptrWordOut = rowOut;
int i;
for (i = 0; i < N_COLS / 2; i++) {
//Absorbing "M[rowInOut] XOR M[rowIn]"
state[0] ^= ptrWordInOut[0] + ptrWordIn[0];
state[1] ^= ptrWordInOut[1] + ptrWordIn[1];
state[2] ^= ptrWordInOut[2] + ptrWordIn[2];
state[3] ^= ptrWordInOut[3] + ptrWordIn[3];
state[4] ^= ptrWordInOut[4] + ptrWordIn[4];
state[5] ^= ptrWordInOut[5] + ptrWordIn[5];
state[6] ^= ptrWordInOut[6] + ptrWordIn[6];
state[7] ^= ptrWordInOut[7] + ptrWordIn[7];
state[8] ^= ptrWordInOut[8] + ptrWordIn[8];
state[9] ^= ptrWordInOut[9] + ptrWordIn[9];
state[10] ^= ptrWordInOut[10] + ptrWordIn[10];
state[11] ^= ptrWordInOut[11] + ptrWordIn[11];
//Applies the reduced-round transformation f to the sponge's state
reducedBlake2bLyra(state);
//M[row*][col] = M[row*][col] XOR rotW(rand)
ptrWordInOut[0] ^= state[10];
ptrWordInOut[1] ^= state[11];
ptrWordInOut[2] ^= state[0];
ptrWordInOut[3] ^= state[1];
ptrWordInOut[4] ^= state[2];
ptrWordInOut[5] ^= state[3];
ptrWordInOut[6] ^= state[4];
ptrWordInOut[7] ^= state[5];
ptrWordInOut[8] ^= state[6];
ptrWordInOut[9] ^= state[7];
ptrWordInOut[10] ^= state[8];
ptrWordInOut[11] ^= state[9];
//M[row][col] = rand
ptrWordOut[0] = state[0] ^ ptrWordIn[0];
ptrWordOut[1] = state[1] ^ ptrWordIn[1];
ptrWordOut[2] = state[2] ^ ptrWordIn[2];
ptrWordOut[3] = state[3] ^ ptrWordIn[3];
ptrWordOut[4] = state[4] ^ ptrWordIn[4];
ptrWordOut[5] = state[5] ^ ptrWordIn[5];
ptrWordOut[6] = state[6] ^ ptrWordIn[6];
ptrWordOut[7] = state[7] ^ ptrWordIn[7];
ptrWordOut[8] = state[8] ^ ptrWordIn[8];
ptrWordOut[9] = state[9] ^ ptrWordIn[9];
ptrWordOut[10] = state[10] ^ ptrWordIn[10];
ptrWordOut[11] = state[11] ^ ptrWordIn[11];
//Goes to next column (i.e., next block in sequence)
ptrWordInOut += BLOCK_LEN_INT64;
ptrWordIn += BLOCK_LEN_INT64;
ptrWordOut += 2 * BLOCK_LEN_INT64;
}
ptrWordOut = rowOut + BLOCK_LEN_INT64;
for (i = 0; i < N_COLS / 2; i++) {
//Absorbing "M[rowInOut] XOR M[rowIn]"
state[0] ^= ptrWordInOut[0] + ptrWordIn[0];
state[1] ^= ptrWordInOut[1] + ptrWordIn[1];
state[2] ^= ptrWordInOut[2] + ptrWordIn[2];
state[3] ^= ptrWordInOut[3] + ptrWordIn[3];
state[4] ^= ptrWordInOut[4] + ptrWordIn[4];
state[5] ^= ptrWordInOut[5] + ptrWordIn[5];
state[6] ^= ptrWordInOut[6] + ptrWordIn[6];
state[7] ^= ptrWordInOut[7] + ptrWordIn[7];
state[8] ^= ptrWordInOut[8] + ptrWordIn[8];
state[9] ^= ptrWordInOut[9] + ptrWordIn[9];
state[10] ^= ptrWordInOut[10] + ptrWordIn[10];
state[11] ^= ptrWordInOut[11] + ptrWordIn[11];
//Applies the reduced-round transformation f to the sponge's state
reducedBlake2bLyra(state);
//M[row*][col] = M[row*][col] XOR rotW(rand)
ptrWordInOut[0] ^= state[10];
ptrWordInOut[1] ^= state[11];
ptrWordInOut[2] ^= state[0];
ptrWordInOut[3] ^= state[1];
ptrWordInOut[4] ^= state[2];
ptrWordInOut[5] ^= state[3];
ptrWordInOut[6] ^= state[4];
ptrWordInOut[7] ^= state[5];
ptrWordInOut[8] ^= state[6];
ptrWordInOut[9] ^= state[7];
ptrWordInOut[10] ^= state[8];
ptrWordInOut[11] ^= state[9];
//M[row][col] = rand
ptrWordOut[0] = state[0] ^ ptrWordIn[0];
ptrWordOut[1] = state[1] ^ ptrWordIn[1];
ptrWordOut[2] = state[2] ^ ptrWordIn[2];
ptrWordOut[3] = state[3] ^ ptrWordIn[3];
ptrWordOut[4] = state[4] ^ ptrWordIn[4];
ptrWordOut[5] = state[5] ^ ptrWordIn[5];
ptrWordOut[6] = state[6] ^ ptrWordIn[6];
ptrWordOut[7] = state[7] ^ ptrWordIn[7];
ptrWordOut[8] = state[8] ^ ptrWordIn[8];
ptrWordOut[9] = state[9] ^ ptrWordIn[9];
ptrWordOut[10] = state[10] ^ ptrWordIn[10];
ptrWordOut[11] = state[11] ^ ptrWordIn[11];
//Goes to next column (i.e., next block in sequence)
ptrWordInOut += BLOCK_LEN_INT64;
ptrWordIn += BLOCK_LEN_INT64;
ptrWordOut += 2 * BLOCK_LEN_INT64;
}
}
*/
/**
* Performs a duplex operation over "M[rowInOut] XOR M[rowIn]", using the output "rand"
* to make "M[rowOut][col] = M[rowOut][col] XOR rand" and "M[rowInOut] = M[rowInOut] XOR rotW(rand)",
* where rotW is a 64-bit rotation to the left.
*
* @param state The current state of the sponge
* @param rowIn Row used only as input
* @param rowInOut Row used as input and to receive output after rotation
* @param rowOut Row receiving the output
*
*/
/*
inline void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut) {
uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row
int i;
for (i = 0; i < N_COLS; i++) {
//Absorbing "M[rowInOut] XOR M[rowIn]"
state[0] ^= ptrWordInOut[0] + ptrWordIn[0];
state[1] ^= ptrWordInOut[1] + ptrWordIn[1];
state[2] ^= ptrWordInOut[2] + ptrWordIn[2];
state[3] ^= ptrWordInOut[3] + ptrWordIn[3];
state[4] ^= ptrWordInOut[4] + ptrWordIn[4];
state[5] ^= ptrWordInOut[5] + ptrWordIn[5];
state[6] ^= ptrWordInOut[6] + ptrWordIn[6];
state[7] ^= ptrWordInOut[7] + ptrWordIn[7];
state[8] ^= ptrWordInOut[8] + ptrWordIn[8];
state[9] ^= ptrWordInOut[9] + ptrWordIn[9];
state[10] ^= ptrWordInOut[10] + ptrWordIn[10];
state[11] ^= ptrWordInOut[11] + ptrWordIn[11];
//Applies the reduced-round transformation f to the sponge's state
reducedBlake2bLyra(state);
//M[rowOut][col] = M[rowOut][col] XOR rand
ptrWordOut[0] ^= state[0];
ptrWordOut[1] ^= state[1];
ptrWordOut[2] ^= state[2];
ptrWordOut[3] ^= state[3];
ptrWordOut[4] ^= state[4];
ptrWordOut[5] ^= state[5];
ptrWordOut[6] ^= state[6];
ptrWordOut[7] ^= state[7];
ptrWordOut[8] ^= state[8];
ptrWordOut[9] ^= state[9];
ptrWordOut[10] ^= state[10];
ptrWordOut[11] ^= state[11];
//M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
//Goes to next block
ptrWordOut += BLOCK_LEN_INT64;
ptrWordInOut += BLOCK_LEN_INT64;
ptrWordIn += BLOCK_LEN_INT64;
}
}
*/
/**
Prints an array of unsigned chars
*/
/**
 * Prints an array of unsigned chars as "name: xx|yy|..." hex bytes on stdout.
 *
 * Fix: the loop index was a signed int compared against the unsigned `size`
 * (signed/unsigned mismatch; broken for size > INT_MAX). It is now unsigned.
 *
 * @param array bytes to dump (must hold at least `size` bytes)
 * @param size  number of bytes to print
 * @param name  label printed before the dump
 */
void printArray(unsigned char *array, unsigned int size, char *name) {
	unsigned int i;  /* unsigned to match the type of `size` */
	printf("%s: ", name);
	for (i = 0; i < size; i++) {
		printf("%2x|", array[i]);
	}
	printf("\n");
}
////////////////////////////////////////////////////////////////////////////////////////////////

108
lyra2/Sponge.h

@ -0,0 +1,108 @@ @@ -0,0 +1,108 @@
/**
* Header file for Blake2b's internal permutation in the form of a sponge.
* This code is based on the original Blake2b's implementation provided by
* Samuel Neves (https://blake2.net/)
*
* Author: The Lyra PHC team (http://www.lyra-kdf.net/) -- 2014.
*
* This software is hereby placed in the public domain.
*
* THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
* OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
* EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef SPONGE_H_
#define SPONGE_H_
#include <stdint.h>
#if defined(__GNUC__)
#define ALIGN __attribute__ ((aligned(32)))
#elif defined(_MSC_VER)
#define ALIGN __declspec(align(32))
#else
#define ALIGN
#endif
/*Blake2b IV Array*/
static const uint64_t blake2b_IV[8] =
{
0x6a09e667f3bcc908ULL, 0xbb67ae8584caa73bULL,
0x3c6ef372fe94f82bULL, 0xa54ff53a5f1d36f1ULL,
0x510e527fade682d1ULL, 0x9b05688c2b3e6c1fULL,
0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
};
/*Blake2b's rotation*/
/* Blake2b's rotation: rotate the 64-bit word w right by c bits (c in 1..63). */
static __inline uint64_t rotr64( const uint64_t w, const unsigned c ){
	const uint64_t lo = w >> c;          /* bits that stay */
	const uint64_t hi = w << ( 64 - c ); /* bits that wrap around to the top */
	return lo | hi;
}
/* Blake2b's G function (message-less form used by Lyra2's sponge): mixes four
 * 64-bit state words with add/xor/rotate. The r and i parameters are not used
 * in the body; they only mirror the reference Blake2b macro's signature. */
#define G(r,i,a,b,c,d) \
do { \
a = a + b; \
d = rotr64(d ^ a, 32); \
c = c + d; \
b = rotr64(b ^ c, 24); \
a = a + b; \
d = rotr64(d ^ a, 16); \
c = c + d; \
b = rotr64(b ^ c, 63); \
} while(0)
/* One round of the Blake2b compression function: G applied to the four
 * columns, then the four diagonals, of the 4x4 state held in array "v". */
#define ROUND_LYRA(r) \
G(r,0,v[ 0],v[ 4],v[ 8],v[12]); \
G(r,1,v[ 1],v[ 5],v[ 9],v[13]); \
G(r,2,v[ 2],v[ 6],v[10],v[14]); \
G(r,3,v[ 3],v[ 7],v[11],v[15]); \
G(r,4,v[ 0],v[ 5],v[10],v[15]); \
G(r,5,v[ 1],v[ 6],v[11],v[12]); \
G(r,6,v[ 2],v[ 7],v[ 8],v[13]); \
G(r,7,v[ 3],v[ 4],v[ 9],v[14]);
//---- Housekeeping
void initState(uint64_t state[/*16*/]);
//---- Squeezes
void squeeze(uint64_t *state, unsigned char *out, unsigned int len);
void reducedSqueezeRow0(uint64_t* state, uint64_t* row);
//---- Absorbs
void absorbBlock(uint64_t *state, const uint64_t *in);
void absorbBlockBlake2Safe(uint64_t *state, const uint64_t *in);
//---- Duplexes
void reducedDuplexRow1(uint64_t *state, uint64_t *rowIn, uint64_t *rowOut);
void reducedDuplexRowSetup(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
//---- Misc
void printArray(unsigned char *array, unsigned int size, char *name);
////////////////////////////////////////////////////////////////////////////////////////////////
////TESTS////
//void reducedDuplexRowc(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
//void reducedDuplexRowd(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
//void reducedDuplexRowSetupv4(uint64_t *state, uint64_t *rowIn1, uint64_t *rowIn2, uint64_t *rowOut1, uint64_t *rowOut2);
//void reducedDuplexRowSetupv5(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
//void reducedDuplexRowSetupv5c(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
//void reducedDuplexRowSetupv5d(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut);
/////////////
#endif /* SPONGE_H_ */

536
lyra2/cuda_lyra2.cu

@ -0,0 +1,536 @@ @@ -0,0 +1,536 @@
#include <memory.h>
#include "cuda_helper.h"
/* Blake2b initialization vector, packed as uint2: .x holds the low 32 bits
 * and .y the high 32 bits of each 64-bit IV word (see devectorize() use in
 * the v30 kernel below for the recombined 64-bit values). */
static __constant__ uint2 blake2b_IV[8] = {
{ 0xf3bcc908, 0x6a09e667 },
{ 0x84caa73b, 0xbb67ae85 },
{ 0xfe94f82b, 0x3c6ef372 },
{ 0x5f1d36f1, 0xa54ff53a },
{ 0xade682d1, 0x510e527f },
{ 0x2b3e6c1f, 0x9b05688c },
{ 0xfb41bd6b, 0x1f83d9ab },
{ 0x137e2179, 0x5be0cd19 }
};
// data: 0-4 outputhash 4-8 outputhash 8-16 basil
// Setup-phase duplex over a full row (8 columns x 12 words): absorb
// M[rowIn]+M[rowInOut] (wordwise add) into the state, run one reduced
// Blake2b round, store M[rowIn] XOR state into rowOut in REVERSE column
// order (j + 84 - 12*i), and XOR the state rotated one word to the left
// back into rowInOut. Expects `state` and `Matrix[96][...]` in the caller's
// scope. (Comments cannot go inside the macro due to the \ continuations.)
#define reduceDuplexRowSetup(rowIn, rowInOut, rowOut) { \
for (int i = 0; i < 8; i++) { \
for (int j = 0; j < 12; j++) \
state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \
round_lyra_v35(state); \
for (int j = 0; j < 12; j++) \
Matrix[j + 84 - 12 * i][rowOut] = Matrix[12 * i + j][rowIn] ^ state[j]; \
Matrix[0 + 12 * i][rowInOut] ^= state[11]; \
Matrix[1 + 12 * i][rowInOut] ^= state[0]; \
Matrix[2 + 12 * i][rowInOut] ^= state[1]; \
Matrix[3 + 12 * i][rowInOut] ^= state[2]; \
Matrix[4 + 12 * i][rowInOut] ^= state[3]; \
Matrix[5 + 12 * i][rowInOut] ^= state[4]; \
Matrix[6 + 12 * i][rowInOut] ^= state[5]; \
Matrix[7 + 12 * i][rowInOut] ^= state[6]; \
Matrix[8 + 12 * i][rowInOut] ^= state[7]; \
Matrix[9 + 12 * i][rowInOut] ^= state[8]; \
Matrix[10+ 12 * i][rowInOut] ^= state[9]; \
Matrix[11+ 12 * i][rowInOut] ^= state[10]; \
} \
}
// Wandering-phase duplex: same absorb/round as the setup macro, but the
// output row is XOR-accumulated in FORWARD column order instead of being
// overwritten in reverse order.
#define reduceDuplexRow(rowIn, rowInOut, rowOut) { \
for (int i = 0; i < 8; i++) { \
for (int j = 0; j < 12; j++) \
state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \
round_lyra_v35(state); \
for (int j = 0; j < 12; j++) \
Matrix[j + 12 * i][rowOut] ^= state[j]; \
Matrix[0 + 12 * i][rowInOut] ^= state[11]; \
Matrix[1 + 12 * i][rowInOut] ^= state[0]; \
Matrix[2 + 12 * i][rowInOut] ^= state[1]; \
Matrix[3 + 12 * i][rowInOut] ^= state[2]; \
Matrix[4 + 12 * i][rowInOut] ^= state[3]; \
Matrix[5 + 12 * i][rowInOut] ^= state[4]; \
Matrix[6 + 12 * i][rowInOut] ^= state[5]; \
Matrix[7 + 12 * i][rowInOut] ^= state[6]; \
Matrix[8 + 12 * i][rowInOut] ^= state[7]; \
Matrix[9 + 12 * i][rowInOut] ^= state[8]; \
Matrix[10+ 12 * i][rowInOut] ^= state[9]; \
Matrix[11+ 12 * i][rowInOut] ^= state[10]; \
} \
}
// Final absorb: XOR the first 12 words of row `in` into the state, then
// apply round_lyra_v35 twelve times (the full, non-reduced permutation).
#define absorbblock(in) { \
state[0] ^= Matrix[0][in]; \
state[1] ^= Matrix[1][in]; \
state[2] ^= Matrix[2][in]; \
state[3] ^= Matrix[3][in]; \
state[4] ^= Matrix[4][in]; \
state[5] ^= Matrix[5][in]; \
state[6] ^= Matrix[6][in]; \
state[7] ^= Matrix[7][in]; \
state[8] ^= Matrix[8][in]; \
state[9] ^= Matrix[9][in]; \
state[10] ^= Matrix[10][in]; \
state[11] ^= Matrix[11][in]; \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
}
//// test version: same three macros, but addressing the matrix as Matrix[12][8][8] (word, column, row); used only by the _test kernel below
#define reduceDuplexRowSetup_test(rowIn, rowInOut, rowOut) { \
for (int i = 0; i < 8; i++) { \
for (int j = 0; j < 12; j++) \
state[j] ^= Matrix[j][i][rowIn] + Matrix[j][i][rowInOut]; \
round_lyra_v35(state); \
for (int j = 0; j < 12; j++) \
Matrix[j][7-i][rowOut] = Matrix[j][i][rowIn] ^ state[j]; \
Matrix[0][i][rowInOut] ^= state[11]; \
Matrix[1][i][rowInOut] ^= state[0]; \
Matrix[2][i][rowInOut] ^= state[1]; \
Matrix[3][i][rowInOut] ^= state[2]; \
Matrix[4][i][rowInOut] ^= state[3]; \
Matrix[5][i][rowInOut] ^= state[4]; \
Matrix[6][i][rowInOut] ^= state[5]; \
Matrix[7][i][rowInOut] ^= state[6]; \
Matrix[8][i][rowInOut] ^= state[7]; \
Matrix[9][i][rowInOut] ^= state[8]; \
Matrix[10][i][rowInOut] ^= state[9]; \
Matrix[11][i][rowInOut] ^= state[10]; \
} \
}
#define reduceDuplexRow_test(rowIn, rowInOut, rowOut) { \
for (int i = 0; i < 8; i++) { \
for (int j = 0; j < 12; j++) \
state[j] ^= Matrix[j][i][rowIn] + Matrix[j][i][rowInOut]; \
round_lyra_v35(state); \
for (int j = 0; j < 12; j++) \
Matrix[j][i][rowOut] ^= state[j]; \
Matrix[0][i][rowInOut] ^= state[11]; \
Matrix[1][i][rowInOut] ^= state[0]; \
Matrix[2][i][rowInOut] ^= state[1]; \
Matrix[3][i][rowInOut] ^= state[2]; \
Matrix[4][i][rowInOut] ^= state[3]; \
Matrix[5][i][rowInOut] ^= state[4]; \
Matrix[6][i][rowInOut] ^= state[5]; \
Matrix[7][i][rowInOut] ^= state[6]; \
Matrix[8][i][rowInOut] ^= state[7]; \
Matrix[9][i][rowInOut] ^= state[8]; \
Matrix[10][i][rowInOut] ^= state[9]; \
Matrix[11][i][rowInOut] ^= state[10]; \
} \
}
#define absorbblock_test(in) { \
state[0] ^= Matrix[0][0][ in]; \
state[1] ^= Matrix[1][0][in]; \
state[2] ^= Matrix[2][0][in]; \
state[3] ^= Matrix[3][0][in]; \
state[4] ^= Matrix[4][0][in]; \
state[5] ^= Matrix[5][0][in]; \
state[6] ^= Matrix[6][0][in]; \
state[7] ^= Matrix[7][0][in]; \
state[8] ^= Matrix[8][0][in]; \
state[9] ^= Matrix[9][0][in]; \
state[10] ^= Matrix[10][0][in]; \
state[11] ^= Matrix[11][0][in]; \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
round_lyra_v35(state); \
}
//// compute 30 version: same three macros operating on plain uint64_t state words via round_lyra_v30 (used by lyra2_gpu_hash_32_v30)
#define reduceDuplexRowSetup_v30(rowIn, rowInOut, rowOut) { \
for (int i = 0; i < 8; i++) { \
for (int j = 0; j < 12; j++) \
state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \
round_lyra_v30(state); \
for (int j = 0; j < 12; j++) \
Matrix[j + 84 - 12 * i][rowOut] = Matrix[12 * i + j][rowIn] ^ state[j]; \
Matrix[0 + 12 * i][rowInOut] ^= state[11]; \
Matrix[1 + 12 * i][rowInOut] ^= state[0]; \
Matrix[2 + 12 * i][rowInOut] ^= state[1]; \
Matrix[3 + 12 * i][rowInOut] ^= state[2]; \
Matrix[4 + 12 * i][rowInOut] ^= state[3]; \
Matrix[5 + 12 * i][rowInOut] ^= state[4]; \
Matrix[6 + 12 * i][rowInOut] ^= state[5]; \
Matrix[7 + 12 * i][rowInOut] ^= state[6]; \
Matrix[8 + 12 * i][rowInOut] ^= state[7]; \
Matrix[9 + 12 * i][rowInOut] ^= state[8]; \
Matrix[10 + 12 * i][rowInOut] ^= state[9]; \
Matrix[11 + 12 * i][rowInOut] ^= state[10]; \
} \
}
#define reduceDuplexRow_v30(rowIn, rowInOut, rowOut) { \
for (int i = 0; i < 8; i++) { \
for (int j = 0; j < 12; j++) \
state[j] ^= Matrix[12 * i + j][rowIn] + Matrix[12 * i + j][rowInOut]; \
round_lyra_v30(state); \
for (int j = 0; j < 12; j++) \
Matrix[j + 12 * i][rowOut] ^= state[j]; \
Matrix[0 + 12 * i][rowInOut] ^= state[11]; \
Matrix[1 + 12 * i][rowInOut] ^= state[0]; \
Matrix[2 + 12 * i][rowInOut] ^= state[1]; \
Matrix[3 + 12 * i][rowInOut] ^= state[2]; \
Matrix[4 + 12 * i][rowInOut] ^= state[3]; \
Matrix[5 + 12 * i][rowInOut] ^= state[4]; \
Matrix[6 + 12 * i][rowInOut] ^= state[5]; \
Matrix[7 + 12 * i][rowInOut] ^= state[6]; \
Matrix[8 + 12 * i][rowInOut] ^= state[7]; \
Matrix[9 + 12 * i][rowInOut] ^= state[8]; \
Matrix[10 + 12 * i][rowInOut] ^= state[9]; \
Matrix[11 + 12 * i][rowInOut] ^= state[10]; \
} \
}
#define absorbblock_v30(in) { \
state[0] ^= Matrix[0][in]; \
state[1] ^= Matrix[1][in]; \
state[2] ^= Matrix[2][in]; \
state[3] ^= Matrix[3][in]; \
state[4] ^= Matrix[4][in]; \
state[5] ^= Matrix[5][in]; \
state[6] ^= Matrix[6][in]; \
state[7] ^= Matrix[7][in]; \
state[8] ^= Matrix[8][in]; \
state[9] ^= Matrix[9][in]; \
state[10] ^= Matrix[10][in]; \
state[11] ^= Matrix[11][in]; \
round_lyra_v30(state); \
round_lyra_v30(state); \
round_lyra_v30(state); \
round_lyra_v30(state); \
round_lyra_v30(state); \
round_lyra_v30(state); \
round_lyra_v30(state); \
round_lyra_v30(state); \
round_lyra_v30(state); \
round_lyra_v30(state); \
round_lyra_v30(state); \
round_lyra_v30(state); \
}
static __device__ __forceinline__
void Gfunc_v35(uint2 & a, uint2 &b, uint2 &c, uint2 &d)
{
	// One message-less Blake2b G mix on four 64-bit words packed as uint2,
	// written one operation per line; identical order to the fused form.
	a += b;
	d = ROR2(d ^ a, 32);
	c += d;
	b = ROR2(b ^ c, 24);
	a += b;
	d = ROR2(d ^ a, 16);
	c += d;
	b = ROR2(b ^ c, 63);
}
static __device__ __forceinline__
void Gfunc_v30(uint64_t & a, uint64_t &b, uint64_t &c, uint64_t &d)
{
	// One message-less Blake2b G mix on plain 64-bit words (pre-uint2 path),
	// written one operation per line; identical order to the fused form.
	a += b;
	d = ROTR64(d ^ a, 32);
	c += d;
	b = ROTR64(b ^ c, 24);
	a += b;
	d = ROTR64(d ^ a, 16);
	c += d;
	b = ROTR64(b ^ c, 63);
}
/* Macro form of round_lyra_v35; not referenced by the kernels visible in
 * this file chunk (the inline-function variant below is used instead). */
#define round_lyra_v35_new(state) { \
Gfunc_v35(state[0], state[4], state[8], state[12]); \
Gfunc_v35(state[1], state[5], state[9], state[13]); \
Gfunc_v35(state[2], state[6], state[10], state[14]); \
Gfunc_v35(state[3], state[7], state[11], state[15]); \
Gfunc_v35(state[0], state[5], state[10], state[15]); \
Gfunc_v35(state[1], state[6], state[11], state[12]); \
Gfunc_v35(state[2], state[7], state[8], state[13]); \
Gfunc_v35(state[3], state[4], state[9], state[14]); \
}
// One reduced Blake2b round on the 16-word (uint2) sponge state:
// G over the four columns, then G over the four diagonals of the 4x4 state.
static __device__ __forceinline__ void round_lyra_v35(uint2 *s)
{
	#pragma unroll
	for (int i = 0; i < 4; i++)       // column step
		Gfunc_v35(s[i], s[i + 4], s[i + 8], s[i + 12]);
	#pragma unroll
	for (int i = 0; i < 4; i++)       // diagonal step
		Gfunc_v35(s[i], s[4 + ((i + 1) & 3)], s[8 + ((i + 2) & 3)], s[12 + ((i + 3) & 3)]);
}
// One reduced Blake2b round on the 16-word (uint64_t) sponge state:
// G over the four columns, then G over the four diagonals of the 4x4 state.
static __device__ __forceinline__ void round_lyra_v30(uint64_t *s)
{
	#pragma unroll
	for (int i = 0; i < 4; i++)       // column step
		Gfunc_v30(s[i], s[i + 4], s[i + 8], s[i + 12]);
	#pragma unroll
	for (int i = 0; i < 4; i++)       // diagonal step
		Gfunc_v30(s[i], s[4 + ((i + 1) & 3)], s[8 + ((i + 2) & 3)], s[12 + ((i + 3) & 3)]);
}
/**
 * Lyra2 core kernel, plain 64-bit arithmetic variant.
 * Each thread reworks one 32-byte hash in place. outputHash layout is
 * column-major across threads: 64-bit word i of a given thread lives at
 * outputHash[threads*i + thread]. Launched with 256-thread blocks
 * (__launch_bounds__(256, 1)).
 * NOTE(review): startNounce is not referenced inside this kernel body.
 */
__global__ __launch_bounds__(256, 1)
void lyra2_gpu_hash_32_v30(int threads, uint32_t startNounce, uint64_t *outputHash)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint64_t state[16];
// Sponge init: words 0-3 = input hash ("password"), 4-7 = same words again
// ("salt"), 8-15 = the Blake2b IV recombined to 64-bit via devectorize().
#pragma unroll
for (int i = 0; i<4; i++) { state[i] = outputHash[threads*i + thread]; } //password
#pragma unroll
for (int i = 0; i<4; i++) { state[i + 4] = state[i]; } //salt
#pragma unroll
for (int i = 0; i<8; i++) { state[i + 8] = devectorize(blake2b_IV[i]); }
// blake2blyra x2
#pragma unroll 24
for (int i = 0; i<24; i++) { round_lyra_v30(state); } //because 12 is not enough
// 8 rows x 8 columns x 12 words, all per-thread — a large local array that
// presumably spills to local memory (hence "not cool").
uint64_t Matrix[96][8]; // not cool
// reducedSqueezeRow0: squeeze row 0 in reverse column order (idx = 84-12*i)
#pragma unroll 8
for (int i = 0; i < 8; i++) {
int idx = 84-12*i;
#pragma unroll 12
for (int j = 0; j<12; j++) { Matrix[j + idx][0] = state[j]; }
round_lyra_v30(state);
}
// reducedSqueezeRow1: duplex row 0 into row 1, again reversing column order
#pragma unroll 8
for (int i = 0; i < 8; i++)
{
int idx0= 12*i;
int idx1= 84-idx0;
#pragma unroll 12
for (int j = 0; j<12; j++) { state[j] ^= Matrix[j + idx0][0]; }
round_lyra_v30(state);
#pragma unroll 12
for (int j = 0; j<12; j++) { Matrix[j + idx1][1] = Matrix[j + idx0][0] ^ state[j]; }
}
// Setup phase: deterministically fill rows 2..7 from earlier rows.
reduceDuplexRowSetup_v30(1, 0, 2);
reduceDuplexRowSetup_v30(2, 1, 3);
reduceDuplexRowSetup_v30(3, 0, 4);
reduceDuplexRowSetup_v30(4, 3, 5);
reduceDuplexRowSetup_v30(5, 2, 6);
reduceDuplexRowSetup_v30(6, 1, 7);
// Wandering phase: the in/out row index (rowa) is data-dependent; it is
// re-derived from state[0] after every duplex because each macro call
// updates the sponge state.
uint64_t rowa;
rowa = state[0] & 7;
reduceDuplexRow_v30(7, rowa, 0);
rowa = state[0] & 7;
reduceDuplexRow_v30(0, rowa, 3);
rowa = state[0] & 7;
reduceDuplexRow_v30(3, rowa, 6);
rowa = state[0] & 7;
reduceDuplexRow_v30(6, rowa, 1);
rowa = state[0] & 7;
reduceDuplexRow_v30(1, rowa, 4);
rowa = state[0] & 7;
reduceDuplexRow_v30(4, rowa, 7);
rowa = state[0] & 7;
reduceDuplexRow_v30(7, rowa, 2);
rowa = state[0] & 7;
reduceDuplexRow_v30(2, rowa, 5);
// Full absorb of the last rowa (12 rounds), then squeeze the first four
// state words back out as the 32-byte result.
absorbblock_v30(rowa);
#pragma unroll
for (int i = 0; i<4; i++) {
outputHash[threads*i + thread] = state[i];
} //password
} //thread
}
/**
 * Lyra2 core kernel, uint2 (32-bit pair) arithmetic variant.
 * Each thread reworks one 32-byte hash in place. outputHash layout is
 * column-major across threads: 64-bit word i of a given thread lives at
 * outputHash[threads*i + thread]. Launched with 256-thread blocks
 * (__launch_bounds__(256, 1)).
 * NOTE(review): startNounce is not referenced inside this kernel body.
 */
__global__ __launch_bounds__(256, 1)
void lyra2_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHash)
{
int thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint2 state[16];
// Sponge init: words 0-3 = input hash split into (lo,hi) pairs via LOHI
// ("password"), 4-7 = the same words ("salt"), 8-15 = the Blake2b IV.
#pragma unroll
for (int i = 0; i<4; i++) { LOHI(state[i].x, state[i].y, outputHash[threads*i + thread]); } //password
#pragma unroll
for (int i = 0; i<4; i++) { state[i + 4] = state[i]; } //salt
#pragma unroll
for (int i = 0; i<8; i++) { state[i + 8] = blake2b_IV[i]; }
// blake2blyra x2
#pragma unroll 24
for (int i = 0; i<24; i++) { round_lyra_v35(state); } //because 12 is not enough
// 8 rows x 8 columns x 12 words, all per-thread — a large local array that
// presumably spills to local memory (hence "not cool").
uint2 Matrix[96][8]; // not cool
// reducedSqueezeRow0: squeeze row 0 in reverse column order
#pragma unroll 8
for (int i = 0; i < 8; i++)
{
#pragma unroll 12
for (int j = 0; j<12; j++) { Matrix[j + 84 - 12 * i][0] = state[j]; }
round_lyra_v35(state);
}
// reducedSqueezeRow1: duplex row 0 into row 1, again reversing column order
#pragma unroll 8
for (int i = 0; i < 8; i++)
{
#pragma unroll 12
for (int j = 0; j<12; j++) { state[j] ^= Matrix[j + 12 * i][0]; }
round_lyra_v35(state);
#pragma unroll 12
for (int j = 0; j<12; j++) { Matrix[j + 84 - 12 * i][1] = Matrix[j + 12 * i][0] ^ state[j]; }
}
// Setup phase: deterministically fill rows 2..7 from earlier rows.
reduceDuplexRowSetup(1, 0, 2);
reduceDuplexRowSetup(2, 1, 3);
reduceDuplexRowSetup(3, 0, 4);
reduceDuplexRowSetup(4, 3, 5);
reduceDuplexRowSetup(5, 2, 6);
reduceDuplexRowSetup(6, 1, 7);
// Wandering phase: rowa (0..7) comes from the low 32 bits of state[0] and
// is re-derived after every duplex because each macro call updates state.
uint32_t rowa;
rowa = state[0].x & 7;
reduceDuplexRow(7, rowa, 0);
rowa = state[0].x & 7;
reduceDuplexRow(0, rowa, 3);
rowa = state[0].x & 7;
reduceDuplexRow(3, rowa, 6);
rowa = state[0].x & 7;
reduceDuplexRow(6, rowa, 1);
rowa = state[0].x & 7;
reduceDuplexRow(1, rowa, 4);
rowa = state[0].x & 7;
reduceDuplexRow(4, rowa, 7);
rowa = state[0].x & 7;
reduceDuplexRow(7, rowa, 2);
rowa = state[0].x & 7;
reduceDuplexRow(2, rowa, 5);
// Full absorb of the last rowa (12 rounds), then squeeze the first four
// state words (recombined to 64-bit) back out as the 32-byte result.
absorbblock(rowa);
#pragma unroll
for (int i = 0; i<4; i++) {
outputHash[threads*i + thread] = devectorize(state[i]);
} //password
} //thread
}
__global__
void __launch_bounds__(256, 1) lyra2_gpu_hash_32_test(int threads, uint32_t startNounce, uint64_t *outputHash)
{
	// Experimental variant of lyra2_gpu_hash_32 -- NOT launched by the host
	// wrapper below. Same algorithm, but the matrix is a true 3-D array
	// (indexed Matrix[word][column][row], judging by the fill loops) and the
	// wandering row is derived from the full 64-bit state word.
	// NOTE(review): startNounce is unused, as in the main kernel.
	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		// 16-word sponge state as uint2 pairs
		uint2 state[16];

		// absorb: input hash twice (password + salt), then blake2b IV
#pragma unroll
		for (int i = 0; i<4; i++) { LOHI(state[i].x, state[i].y, outputHash[threads*i + thread]); } //password
#pragma unroll
		for (int i = 0; i<4; i++) { state[i + 4] = state[i]; } //salt
#pragma unroll
		for (int i = 0; i<8; i++) { state[i + 8] = blake2b_IV[i]; }

		// blake2blyra x2
#pragma unroll 24
		for (int i = 0; i<24; i++) { round_lyra_v35(state); } //because 12 is not enough

		// 12 words x 8 columns x 8 rows, per-thread local memory
		uint2 Matrix[12][8][8]; // not cool

		// reducedSqueezeRow0: fill row 0, columns written in reverse (7-i)
#pragma unroll 8
		for (int i = 0; i < 8; i++) {
#pragma unroll 12
			for (int j = 0; j<12; j++) { Matrix[j][7-i][0] = state[j]; }
			round_lyra_v35(state);
		}

		// reducedSqueezeRow1: absorb row 0, emit row 1 with reversed columns
#pragma unroll 8
		for (int i = 0; i < 8; i++)
		{
#pragma unroll 12
			for (int j = 0; j<12; j++) { state[j] ^= Matrix[j][i][0]; }
			round_lyra_v35(state);
#pragma unroll 12
			for (int j = 0; j<12; j++) { Matrix[j][7-i][1] = Matrix[j][i][0] ^ state[j]; }
		}

		// setup phase: fill rows 2..7 via the "_test" macro family
		reduceDuplexRowSetup_test(1, 0, 2);
		reduceDuplexRowSetup_test(2, 1, 3);
		reduceDuplexRowSetup_test(3, 0, 4);
		reduceDuplexRowSetup_test(4, 3, 5);
		reduceDuplexRowSetup_test(5, 2, 6);
		reduceDuplexRowSetup_test(6, 1, 7);

		// wandering phase: row index = low 3 bits of state word 0 (same value
		// the main kernel reads from state[0].x, just computed on 64 bits)
		uint64_t rowa;
		rowa = devectorize(state[0]) & 7;
		reduceDuplexRow_test(7, rowa, 0);
		rowa = devectorize(state[0]) & 7;
		reduceDuplexRow_test(0, rowa, 3);
		rowa = devectorize(state[0]) & 7;
		reduceDuplexRow_test(3, rowa, 6);
		rowa = devectorize(state[0]) & 7;
		reduceDuplexRow_test(6, rowa, 1);
		rowa = devectorize(state[0]) & 7;
		reduceDuplexRow_test(1, rowa, 4);
		rowa = devectorize(state[0]) & 7;
		reduceDuplexRow_test(4, rowa, 7);
		rowa = devectorize(state[0]) & 7;
		reduceDuplexRow_test(7, rowa, 2);
		rowa = devectorize(state[0]) & 7;
		reduceDuplexRow_test(2, rowa, 5);

		// final absorb, then write the 4 result words back, strided by `threads`
		absorbblock_test(rowa);

#pragma unroll
		for (int i = 0; i<4; i++) {
			outputHash[threads*i + thread] = devectorize(state[i]);
		} //password
	} //thread
}
__host__
void lyra2_cpu_init(int thr_id, int threads)
{
	// Nothing to allocate or upload for the Lyra2 kernels; this stub exists
	// only for API symmetry with the other *_cpu_init entry points.
	(void) thr_id;
	(void) threads;
}
__host__
void lyra2_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
{
	// Launch the Lyra2 kernel over `threads` lanes; d_outputHash is read and
	// rewritten in place (4 x 64-bit words per lane, strided by `threads`).
	// `startNounce` and `order` are kept for signature symmetry with the other
	// *_cpu_hash_* wrappers; the kernels below do not use them.
	const int threadsperblock = 256;

	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);

	if (device_sm[device_map[thr_id]] >= 350) {
		lyra2_gpu_hash_32 <<<grid, block>>> (threads, startNounce, d_outputHash);
	} else {
		// kernel for compute30 card
		lyra2_gpu_hash_32_v30 <<<grid, block >>> (threads, startNounce, d_outputHash);
	}

	// fix: launch failures (bad config, excessive resource usage, ...) were
	// silently dropped; surface them the same way the rest of this fork does
	CUDA_SAFE_CALL(cudaGetLastError());
	CUDA_SAFE_CALL(cudaDeviceSynchronize());
	//MyStreamSynchronize(NULL, order, thr_id);
}

133
lyra2/lyra2RE.cu

@ -0,0 +1,133 @@ @@ -0,0 +1,133 @@
extern "C" {
#include "sph/sph_blake.h"
#include "sph/sph_groestl.h"
#include "sph/sph_skein.h"
#include "sph/sph_keccak.h"
#include "lyra2/Lyra2.h"
}
#include "miner.h"
#include "cuda_helper.h"
static _ALIGN(64) uint64_t *d_hash[8];
extern void quark_check_cpu_init(int thr_id, int threads);
extern void quark_check_cpu_setTarget(const void *ptarget);
extern uint32_t quark_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
extern uint32_t quark_check_cpu_hash_64_2(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint64_t *d_inputHash, int order);
extern void blake256_cpu_init(int thr_id, int threads);
extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
extern void blake256_cpu_setBlock_80(uint32_t *pdata);
extern void keccak256_cpu_hash_32(int thr_id, int threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void keccak256_cpu_init(int thr_id, int threads);
extern void skein256_cpu_hash_32(int thr_id, int threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void skein256_cpu_init(int thr_id, int threads);
extern void lyra2_cpu_hash_32(int thr_id, int threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void lyra2_cpu_init(int thr_id, int threads);
extern void groestl256_setTarget(const void *ptarget);
extern uint32_t groestl256_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *d_outputHash, int order);
extern void groestl256_cpu_init(int thr_id, int threads);
extern "C" void lyra_hash(void *state, const void *input)
{
	// CPU reference implementation of the lyra2RE chain:
	//   blake256(80 bytes) -> keccak256 -> LYRA2 -> skein256 -> groestl256
	// It must match the GPU pipeline in scanhash_lyra, whose last stage is
	// groestl256 -- so the groestl output is what gets returned.
	sph_blake256_context ctx_blake;
	sph_keccak256_context ctx_keccak;
	sph_skein256_context ctx_skein;
	sph_groestl256_context ctx_groestl;

	uint32_t hashA[8], hashB[8], hash[8];

	sph_blake256_init(&ctx_blake);
	sph_blake256(&ctx_blake, input, 80);
	sph_blake256_close(&ctx_blake, hashA);

	sph_keccak256_init(&ctx_keccak);
	sph_keccak256(&ctx_keccak, hashA, 32);
	sph_keccak256_close(&ctx_keccak, hashB);

	// Lyra2 with t=1, r=8, c=8; the keccak output is both password and salt
	LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);

	sph_skein256_init(&ctx_skein);
	sph_skein256(&ctx_skein, hashA, 32);
	sph_skein256_close(&ctx_skein, hashB);

	sph_groestl256_init(&ctx_groestl);
	sph_groestl256(&ctx_groestl, hashB, 32);
	sph_groestl256_close(&ctx_groestl, hash);

	// fix: return the final groestl256 digest, not the intermediate skein
	// result (was: memcpy(state, hashB, 32), flagged "seems wrong" upstream)
	memcpy(state, hash, 32);
}
static bool init[8] = { 0 };
extern "C" int scanhash_lyra(int thr_id, uint32_t *pdata,
	const uint32_t *ptarget, uint32_t max_nonce,
	unsigned long *hashes_done)
{
	// GPU scan loop for lyra2RE: blake256(80) -> keccak256 -> lyra2 -> skein256
	// -> groestl256. The groestl256 kernel also does the target comparison and
	// returns a candidate nonce (0xffffffff when nothing matched).
	// Returns 1 with pdata[19] = found nonce, or 0 when the range is exhausted
	// or a work restart was requested.
	const uint32_t first_nonce = pdata[19];
	int intensity = (device_sm[device_map[thr_id]] >= 500) ? 19 : 18;
	int throughput = opt_work_size ? opt_work_size : (1 << intensity); // 18=256*256*4;
	throughput = min(throughput, (int)(max_nonce - first_nonce));

	// benchmark mode: loosen the target so "shares" are found quickly
	// (deliberately casts away const on ptarget)
	if (opt_benchmark)
		((uint32_t*)ptarget)[7] = 0x0000ff;

	if (!init[thr_id])
	{
		// one-time per-GPU setup; d_hash carries the intermediate hash of every
		// lane between kernels (16*sizeof(uint32_t) = 64 bytes per lane, which
		// is twice the 32 bytes the kernels write -- presumably headroom, TODO confirm)
		cudaSetDevice(device_map[thr_id]);
		blake256_cpu_init(thr_id, throughput);
		keccak256_cpu_init(thr_id,throughput);
		skein256_cpu_init(thr_id, throughput);
		groestl256_cpu_init(thr_id, throughput);
		lyra2_cpu_init(thr_id, throughput);

		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput));

		init[thr_id] = true;
	}

	// big-endian copy of the 80-byte header; only used to re-hash a candidate
	// nonce on the CPU below
	uint32_t endiandata[20];
	for (int k=0; k < 20; k++)
		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);

	blake256_cpu_setBlock_80(pdata);
	groestl256_setTarget(ptarget);

	do {
		int order = 0;
		uint32_t foundNonce;

		// run the full 5-stage pipeline for `throughput` nonces from pdata[19]
		blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
		keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
		lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
		skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
		foundNonce = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
		if (foundNonce != 0xffffffff)
		{
			// NOTE(review): the CPU-side check of the candidate is disabled
			// below, so every GPU hit is returned unverified -- consistent
			// with the "test pool only" status of this algo at commit time
			// const uint32_t Htarg = ptarget[6];
			uint32_t vhash64[8];
			be32enc(&endiandata[19], foundNonce);
			lyra_hash(vhash64, endiandata);

			// if (vhash64[7]<=Htarg) { // && fulltest(vhash64, ptarget)) {
				*hashes_done = pdata[19] - first_nonce + throughput;
				pdata[19] = foundNonce;
				return 1;
			// } else {
			//	applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce);
			// }
		}

		pdata[19] += throughput;

	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);

	*hashes_done = pdata[19] - first_nonce + 1;
	return 0;
}

5
miner.h

@ -328,6 +328,10 @@ extern int scanhash_fresh(int thr_id, uint32_t *pdata, @@ -328,6 +328,10 @@ extern int scanhash_fresh(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
extern int scanhash_lyra(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
extern int scanhash_nist5(int thr_id, uint32_t *pdata,
const uint32_t *ptarget, uint32_t max_nonce,
unsigned long *hashes_done);
@ -645,6 +649,7 @@ void heavycoin_hash(unsigned char* output, const unsigned char* input, int len); @@ -645,6 +649,7 @@ void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
void keccak256_hash(void *state, const void *input);
unsigned int jackpothash(void *state, const void *input);
void groestlhash(void *state, const void *input);
void lyra_hash(void *state, const void *input);
void myriadhash(void *state, const void *input);
void nist5hash(void *state, const void *input);
void pentablakehash(void *output, const void *input);

14
util.cpp

@ -1633,18 +1633,22 @@ void print_hash_tests(void) @@ -1633,18 +1633,22 @@ void print_hash_tests(void)
heavycoin_hash(&hash[0], &buf[0], 32);
printpfx("heavy", hash);
memset(hash, 0, sizeof hash);
keccak256_hash(&hash[0], &buf[0]);
printpfx("keccak", hash);
memset(hash, 0, sizeof hash);
jackpothash(&hash[0], &buf[0]);
printpfx("jackpot", hash);
memset(hash, 0, sizeof hash);
keccak256_hash(&hash[0], &buf[0]);
printpfx("keccak", hash);
memset(hash, 0, sizeof hash);
doomhash(&hash[0], &buf[0]);
printpfx("luffa", hash);
/* to double check with a lyra2 cpu miner
memset(hash, 0, sizeof hash);
lyra_hash(&hash[0], &buf[0]);
printpfx("lyra2", hash);
*/
memset(hash, 0, sizeof hash);
myriadhash(&hash[0], &buf[0]);
printpfx("myriad", hash);

Loading…
Cancel
Save