From a37e909db9ff747f22203eb6bb3b4c6ed746f44f Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Fri, 27 Mar 2015 11:21:13 +0100
Subject: [PATCH] Add zr5 algo (for SM 3.5+)

uint4 copy + keccak cleanup, groestl: small uint4 opt

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
---
 JHA/cuda_jha_keccak512.cu      | 1083 ++++++++++++++++++--------------
 Makefile.am                    |    2 +-
 README.txt                     |   12 +-
 ccminer.cpp                    |   84 ++-
 ccminer.vcxproj                |   14 +-
 ccminer.vcxproj.filters        |    9 +-
 configure.ac                   |    2 +-
 cpuminer-config.h              |   30 +-
 miner.h                        |    5 +
 quark/cuda_quark_groestl512.cu |   15 +-
 util.cpp                       |   39 +-
 zr5.cu                         |  342 ++++++++++
 12 files changed, 1110 insertions(+), 527 deletions(-)
 create mode 100644 zr5.cu

diff --git a/JHA/cuda_jha_keccak512.cu b/JHA/cuda_jha_keccak512.cu
index d0d3c9b..2fc26b2 100644
--- a/JHA/cuda_jha_keccak512.cu
+++ b/JHA/cuda_jha_keccak512.cu
@@ -3,155 +3,118 @@
 
 #include "cuda_helper.h"
 
-__constant__ uint64_t c_State[25];
+// ZR5
+__constant__ uint32_t d_OriginalData[20];
+
 __constant__ uint32_t c_PaddedMessage[18];
+__constant__ uint64_t c_State[25];
+
+#define POK_DATA_MASK 0xFFFF0000
+#define POK_VERSION 0x1
 
 #define U32TO64_LE(p) \
-    (((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))
+	(((uint64_t)(*p)) | (((uint64_t)(*(p + 1))) << 32))
 
 #define U64TO32_LE(p, v) \
-    *p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32);
+	*p = (uint32_t)((v)); *(p+1) = (uint32_t)((v) >> 32);
 
 static const uint64_t host_keccak_round_constants[24] = {
-    0x0000000000000001ull, 0x0000000000008082ull,
-    0x800000000000808aull, 0x8000000080008000ull,
-    0x000000000000808bull, 0x0000000080000001ull,
-    0x8000000080008081ull, 0x8000000000008009ull,
-    0x000000000000008aull, 0x0000000000000088ull,
-    0x0000000080008009ull, 0x000000008000000aull,
-    0x000000008000808bull, 0x800000000000008bull,
-    0x8000000000008089ull, 0x8000000000008003ull,
-    0x8000000000008002ull, 0x8000000000000080ull,
-    0x000000000000800aull, 0x800000008000000aull,
-    0x8000000080008081ull, 0x8000000000008080ull,
-    0x0000000080000001ull, 0x8000000080008008ull
+	0x0000000000000001ull, 0x0000000000008082ull,
+	0x800000000000808aull, 0x8000000080008000ull,
+	0x000000000000808bull, 0x0000000080000001ull,
+	0x8000000080008081ull, 0x8000000000008009ull,
+	0x000000000000008aull, 0x0000000000000088ull,
+	0x0000000080008009ull, 0x000000008000000aull,
+	0x000000008000808bull, 0x800000000000008bull,
+	0x8000000000008089ull, 0x8000000000008003ull,
+	0x8000000000008002ull, 0x8000000000000080ull,
+	0x000000000000800aull, 0x800000008000000aull,
+	0x8000000080008081ull, 0x8000000000008080ull,
+	0x0000000080000001ull, 0x8000000080008008ull
 };
 
 __constant__ uint64_t c_keccak_round_constants[24];
 
 static __device__ __forceinline__ void
 keccak_block(uint64_t *s, const uint32_t *in, const uint64_t *keccak_round_constants) {
-    size_t i;
-    uint64_t t[5], u[5], v, w;
-
-    /* absorb input */
-#pragma unroll 9
-    for (i = 0; i < 72 / 8; i++, in += 2)
-        s[i] ^= U32TO64_LE(in);
-    
-    for (i = 0; i < 24; i++) {
-        /* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
-        t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
-        t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
-        t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
-        t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
-        t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
-
-        /* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
-        u[0] = t[4] ^ ROTL64(t[1], 1);
-        u[1] = t[0] ^ ROTL64(t[2], 1);
-        u[2] = t[1] ^ ROTL64(t[3], 1);
-        u[3] = t[2] ^ ROTL64(t[4], 1);
-        u[4] = t[3] ^ ROTL64(t[0], 1);
-
-        /* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
-        s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0];
-        s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1];
-        s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2];
-        s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3];
-        s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4];
-
-        /* rho pi: b[..] = rotl(a[..], ..) */
-        v = s[ 1];
-        s[ 1] = ROTL64(s[ 6], 44);
-        s[ 6] = ROTL64(s[ 9], 20);
-        s[ 9] = ROTL64(s[22], 61);
-        s[22] = ROTL64(s[14], 39);
-        s[14] = ROTL64(s[20], 18);
-        s[20] = ROTL64(s[ 2], 62);
-        s[ 2] = ROTL64(s[12], 43);
-        s[12] = ROTL64(s[13], 25);
-        s[13] = ROTL64(s[19],  8);
-        s[19] = ROTL64(s[23], 56);
-        s[23] = ROTL64(s[15], 41);
-        s[15] = ROTL64(s[ 4], 27);
-        s[ 4] = ROTL64(s[24], 14);
-        s[24] = ROTL64(s[21],  2);
-        s[21] = ROTL64(s[ 8], 55);
-        s[ 8] = ROTL64(s[16], 45);
-        s[16] = ROTL64(s[ 5], 36);
-        s[ 5] = ROTL64(s[ 3], 28);
-        s[ 3] = ROTL64(s[18], 21);
-        s[18] = ROTL64(s[17], 15);
-        s[17] = ROTL64(s[11], 10);
-        s[11] = ROTL64(s[ 7],  6);
-        s[ 7] = ROTL64(s[10],  3);
-        s[10] = ROTL64(    v,  1);
-
-        /* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
-        v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w;
-        v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w;
-        v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w;
-        v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w;
-        v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w;
-
-        /* iota: a[0,0] ^= round constant */
-        s[0] ^= keccak_round_constants[i];
-    }
-}
-
-__global__ void jackpot_keccak512_gpu_hash(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
-{
-    uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-    if (thread < threads)
-    {
-        uint32_t nounce = startNounce + thread;
-
-        int hashPosition = nounce - startNounce;
-
-        // Nachricht kopieren
-        uint32_t message[18];
-#pragma unroll 18
-        for(int i=0;i<18;i++)
-            message[i] = c_PaddedMessage[i];
-
-        // die individuelle Nounce einsetzen
-        message[1] = cuda_swab32(nounce);
-
-        // State initialisieren
-        uint64_t keccak_gpu_state[25];
-#pragma unroll 25
-        for (int i=0; i<25; i++)
-            keccak_gpu_state[i] = c_State[i];
-
-        // den Block einmal gut durchschütteln
-        keccak_block(keccak_gpu_state, message, c_keccak_round_constants);
-
-        // das Hash erzeugen
-        uint32_t hash[16];
-
-#pragma unroll 8
-        for (size_t i = 0; i < 64; i += 8) {
-            U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]);
-        }
-
-        // fertig
-        uint32_t *outpHash = (uint32_t*)&g_hash[8 * hashPosition];
-
-#pragma unroll 16
-        for(int i=0;i<16;i++)
-            outpHash[i] = hash[i];
-    }
+	size_t i;
+	uint64_t t[5], u[5], v, w;
+
+	/* absorb input */
+	#pragma unroll 9
+	for (i = 0; i < 72 / 8; i++, in += 2)
+		s[i] ^= U32TO64_LE(in);
+
+	for (i = 0; i < 24; i++) {
+		/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
+		t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
+		t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
+		t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
+		t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
+		t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];
+
+		/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
+		u[0] = t[4] ^ ROTL64(t[1], 1);
+		u[1] = t[0] ^ ROTL64(t[2], 1);
+		u[2] = t[1] ^ ROTL64(t[3], 1);
+		u[3] = t[2] ^ ROTL64(t[4], 1);
+		u[4] = t[3] ^ ROTL64(t[0], 1);
+
+		/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
+		s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0];
+		s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1];
+		s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2];
+		s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3];
+		s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4];
+
+		/* rho pi: b[..] = rotl(a[..], ..) */
+		v = s[ 1];
+		s[ 1] = ROTL64(s[ 6], 44);
+		s[ 6] = ROTL64(s[ 9], 20);
+		s[ 9] = ROTL64(s[22], 61);
+		s[22] = ROTL64(s[14], 39);
+		s[14] = ROTL64(s[20], 18);
+		s[20] = ROTL64(s[ 2], 62);
+		s[ 2] = ROTL64(s[12], 43);
+		s[12] = ROTL64(s[13], 25);
+		s[13] = ROTL64(s[19],  8);
+		s[19] = ROTL64(s[23], 56);
+		s[23] = ROTL64(s[15], 41);
+		s[15] = ROTL64(s[ 4], 27);
+		s[ 4] = ROTL64(s[24], 14);
+		s[24] = ROTL64(s[21],  2);
+		s[21] = ROTL64(s[ 8], 55);
+		s[ 8] = ROTL64(s[16], 45);
+		s[16] = ROTL64(s[ 5], 36);
+		s[ 5] = ROTL64(s[ 3], 28);
+		s[ 3] = ROTL64(s[18], 21);
+		s[18] = ROTL64(s[17], 15);
+		s[17] = ROTL64(s[11], 10);
+		s[11] = ROTL64(s[ 7],  6);
+		s[ 7] = ROTL64(s[10],  3);
+		s[10] = ROTL64(    v,  1);
+
+		/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
+		v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w;
+		v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w;
+		v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w;
+		v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w;
+		v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w;
+
+		/* iota: a[0,0] ^= round constant */
+		s[0] ^= keccak_round_constants[i];
+	}
 }
 
 // Setup-Funktionen
-__host__ void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads)
+__host__
+void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads)
 {
-    // Kopiere die Hash-Tabellen in den GPU-Speicher
-    cudaMemcpyToSymbol( c_keccak_round_constants,
-                        host_keccak_round_constants,
-                        sizeof(host_keccak_round_constants),
-                        0, cudaMemcpyHostToDevice);
+	// Kopiere die Hash-Tabellen in den GPU-Speicher
+	cudaMemcpyToSymbol( c_keccak_round_constants,
+						host_keccak_round_constants,
+						sizeof(host_keccak_round_constants),
+						0, cudaMemcpyHostToDevice);
 }
 
 #define cKeccakB    1600
@@ -160,379 +123,549 @@ __host__ void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads)
 #define cKeccakR_SizeInBytes    (cKeccakR / 8)
 #define crypto_hash_BYTES 64
 
-#if     (cKeccakB   == 1600)
-    typedef unsigned long long  UINT64;
-    typedef UINT64 tKeccakLane;
-    #define cKeccakNumberOfRounds   24
+#if (cKeccakB == 1600)
+	typedef unsigned long long UINT64;
+	typedef UINT64 tKeccakLane;
+	#define cKeccakNumberOfRounds 24
 #endif
 
 #define cKeccakLaneSizeInBits   (sizeof(tKeccakLane) * 8)
 
 #define ROL(a, offset) ((((tKeccakLane)a) << ((offset) % cKeccakLaneSizeInBits)) ^ (((tKeccakLane)a) >> (cKeccakLaneSizeInBits-((offset) % cKeccakLaneSizeInBits))))
 #if ((cKeccakB/25) == 8)
-    #define ROL_mult8(a, offset) ((tKeccakLane)a)
+	#define ROL_mult8(a, offset) ((tKeccakLane)a)
 #else
-    #define ROL_mult8(a, offset) ROL(a, offset)
+	#define ROL_mult8(a, offset) ROL(a, offset)
 #endif
-void KeccakF( tKeccakLane * state, const tKeccakLane *in, int laneCount );
 
-const tKeccakLane KeccakF_RoundConstants[cKeccakNumberOfRounds] = 
-{
-    (tKeccakLane)0x0000000000000001ULL,
-    (tKeccakLane)0x0000000000008082ULL,
-    (tKeccakLane)0x800000000000808aULL,
-    (tKeccakLane)0x8000000080008000ULL,
-    (tKeccakLane)0x000000000000808bULL,
-    (tKeccakLane)0x0000000080000001ULL,
-    (tKeccakLane)0x8000000080008081ULL,
-    (tKeccakLane)0x8000000000008009ULL,
-    (tKeccakLane)0x000000000000008aULL,
-    (tKeccakLane)0x0000000000000088ULL,
-    (tKeccakLane)0x0000000080008009ULL,
-    (tKeccakLane)0x000000008000000aULL,
-    (tKeccakLane)0x000000008000808bULL,
-    (tKeccakLane)0x800000000000008bULL,
-    (tKeccakLane)0x8000000000008089ULL,
-    (tKeccakLane)0x8000000000008003ULL,
-    (tKeccakLane)0x8000000000008002ULL,
-    (tKeccakLane)0x8000000000000080ULL
-	#if		(cKeccakB	>= 400)
+const tKeccakLane KeccakF_RoundConstants[cKeccakNumberOfRounds] = {
+	(tKeccakLane)0x0000000000000001ULL,
+	(tKeccakLane)0x0000000000008082ULL,
+	(tKeccakLane)0x800000000000808aULL,
+	(tKeccakLane)0x8000000080008000ULL,
+	(tKeccakLane)0x000000000000808bULL,
+	(tKeccakLane)0x0000000080000001ULL,
+	(tKeccakLane)0x8000000080008081ULL,
+	(tKeccakLane)0x8000000000008009ULL,
+	(tKeccakLane)0x000000000000008aULL,
+	(tKeccakLane)0x0000000000000088ULL,
+	(tKeccakLane)0x0000000080008009ULL,
+	(tKeccakLane)0x000000008000000aULL,
+	(tKeccakLane)0x000000008000808bULL,
+	(tKeccakLane)0x800000000000008bULL,
+	(tKeccakLane)0x8000000000008089ULL,
+	(tKeccakLane)0x8000000000008003ULL,
+	(tKeccakLane)0x8000000000008002ULL,
+	(tKeccakLane)0x8000000000000080ULL
+#if (cKeccakB >= 400)
   , (tKeccakLane)0x000000000000800aULL,
-    (tKeccakLane)0x800000008000000aULL
-	#if		(cKeccakB	>= 800)
+	(tKeccakLane)0x800000008000000aULL
+#if (cKeccakB >= 800)
   , (tKeccakLane)0x8000000080008081ULL,
-    (tKeccakLane)0x8000000000008080ULL
-	#if		(cKeccakB	== 1600)
+	(tKeccakLane)0x8000000000008080ULL
+#if (cKeccakB == 1600)
   , (tKeccakLane)0x0000000080000001ULL,
-    (tKeccakLane)0x8000000080008008ULL
-	#endif
-	#endif
-	#endif
+	(tKeccakLane)0x8000000080008008ULL
+#endif
+#endif
+#endif
 };
 
-void KeccakF( tKeccakLane * state, const tKeccakLane *in, int laneCount )
+void KeccakF(tKeccakLane * state, const tKeccakLane *in, int laneCount)
 {
-
-    {
-        while ( --laneCount >= 0 )
-        {
-            state[laneCount] ^= in[laneCount];
-        }
-    }
-
-    {
-        tKeccakLane Aba, Abe, Abi, Abo, Abu;
-        tKeccakLane Aga, Age, Agi, Ago, Agu;
-        tKeccakLane Aka, Ake, Aki, Ako, Aku;
-        tKeccakLane Ama, Ame, Ami, Amo, Amu;
-        tKeccakLane Asa, Ase, Asi, Aso, Asu;
-        tKeccakLane BCa, BCe, BCi, BCo, BCu;
-        tKeccakLane Da, De, Di, Do, Du;
-        tKeccakLane Eba, Ebe, Ebi, Ebo, Ebu;
-        tKeccakLane Ega, Ege, Egi, Ego, Egu;
-        tKeccakLane Eka, Eke, Eki, Eko, Eku;
-        tKeccakLane Ema, Eme, Emi, Emo, Emu;
-        tKeccakLane Esa, Ese, Esi, Eso, Esu;
-        #define    round    laneCount
-
-        //copyFromState(A, state)
-        Aba = state[ 0];
-        Abe = state[ 1];
-        Abi = state[ 2];
-        Abo = state[ 3];
-        Abu = state[ 4];
-        Aga = state[ 5];
-        Age = state[ 6];
-        Agi = state[ 7];
-        Ago = state[ 8];
-        Agu = state[ 9];
-        Aka = state[10];
-        Ake = state[11];
-        Aki = state[12];
-        Ako = state[13];
-        Aku = state[14];
-        Ama = state[15];
-        Ame = state[16];
-        Ami = state[17];
-        Amo = state[18];
-        Amu = state[19];
-        Asa = state[20];
-        Ase = state[21];
-        Asi = state[22];
-        Aso = state[23];
-        Asu = state[24];
-
-        for( round = 0; round < cKeccakNumberOfRounds; round += 2 )
-        {
-            //    prepareTheta
-            BCa = Aba^Aga^Aka^Ama^Asa;
-            BCe = Abe^Age^Ake^Ame^Ase;
-            BCi = Abi^Agi^Aki^Ami^Asi;
-            BCo = Abo^Ago^Ako^Amo^Aso;
-            BCu = Abu^Agu^Aku^Amu^Asu;
-
-            //thetaRhoPiChiIotaPrepareTheta(round  , A, E)
-            Da = BCu^ROL(BCe, 1);
-            De = BCa^ROL(BCi, 1);
-            Di = BCe^ROL(BCo, 1);
-            Do = BCi^ROL(BCu, 1);
-            Du = BCo^ROL(BCa, 1);
-
-            Aba ^= Da;
-            BCa = Aba;
-            Age ^= De;
-            BCe = ROL(Age, 44);
-            Aki ^= Di;
-            BCi = ROL(Aki, 43);
-            Amo ^= Do;
-            BCo = ROL(Amo, 21);
-            Asu ^= Du;
-            BCu = ROL(Asu, 14);
-            Eba =   BCa ^((~BCe)&  BCi );
-            Eba ^= (tKeccakLane)KeccakF_RoundConstants[round];
-            Ebe =   BCe ^((~BCi)&  BCo );
-            Ebi =   BCi ^((~BCo)&  BCu );
-            Ebo =   BCo ^((~BCu)&  BCa );
-            Ebu =   BCu ^((~BCa)&  BCe );
-
-            Abo ^= Do;
-            BCa = ROL(Abo, 28);
-            Agu ^= Du;
-            BCe = ROL(Agu, 20);
-            Aka ^= Da;
-            BCi = ROL(Aka,  3);
-            Ame ^= De;
-            BCo = ROL(Ame, 45);
-            Asi ^= Di;
-            BCu = ROL(Asi, 61);
-            Ega =   BCa ^((~BCe)&  BCi );
-            Ege =   BCe ^((~BCi)&  BCo );
-            Egi =   BCi ^((~BCo)&  BCu );
-            Ego =   BCo ^((~BCu)&  BCa );
-            Egu =   BCu ^((~BCa)&  BCe );
-
-            Abe ^= De;
-            BCa = ROL(Abe,  1);
-            Agi ^= Di;
-            BCe = ROL(Agi,  6);
-            Ako ^= Do;
-            BCi = ROL(Ako, 25);
-            Amu ^= Du;
-            BCo = ROL_mult8(Amu,  8);
-            Asa ^= Da;
-            BCu = ROL(Asa, 18);
-            Eka =   BCa ^((~BCe)&  BCi );
-            Eke =   BCe ^((~BCi)&  BCo );
-            Eki =   BCi ^((~BCo)&  BCu );
-            Eko =   BCo ^((~BCu)&  BCa );
-            Eku =   BCu ^((~BCa)&  BCe );
-
-            Abu ^= Du;
-            BCa = ROL(Abu, 27);
-            Aga ^= Da;
-            BCe = ROL(Aga, 36);
-            Ake ^= De;
-            BCi = ROL(Ake, 10);
-            Ami ^= Di;
-            BCo = ROL(Ami, 15);
-            Aso ^= Do;
-            BCu = ROL_mult8(Aso, 56);
-            Ema =   BCa ^((~BCe)&  BCi );
-            Eme =   BCe ^((~BCi)&  BCo );
-            Emi =   BCi ^((~BCo)&  BCu );
-            Emo =   BCo ^((~BCu)&  BCa );
-            Emu =   BCu ^((~BCa)&  BCe );
-
-            Abi ^= Di;
-            BCa = ROL(Abi, 62);
-            Ago ^= Do;
-            BCe = ROL(Ago, 55);
-            Aku ^= Du;
-            BCi = ROL(Aku, 39);
-            Ama ^= Da;
-            BCo = ROL(Ama, 41);
-            Ase ^= De;
-            BCu = ROL(Ase,  2);
-            Esa =   BCa ^((~BCe)&  BCi );
-            Ese =   BCe ^((~BCi)&  BCo );
-            Esi =   BCi ^((~BCo)&  BCu );
-            Eso =   BCo ^((~BCu)&  BCa );
-            Esu =   BCu ^((~BCa)&  BCe );
-
-            //    prepareTheta
-            BCa = Eba^Ega^Eka^Ema^Esa;
-            BCe = Ebe^Ege^Eke^Eme^Ese;
-            BCi = Ebi^Egi^Eki^Emi^Esi;
-            BCo = Ebo^Ego^Eko^Emo^Eso;
-            BCu = Ebu^Egu^Eku^Emu^Esu;
-
-            //thetaRhoPiChiIotaPrepareTheta(round+1, E, A)
-            Da = BCu^ROL(BCe, 1);
-            De = BCa^ROL(BCi, 1);
-            Di = BCe^ROL(BCo, 1);
-            Do = BCi^ROL(BCu, 1);
-            Du = BCo^ROL(BCa, 1);
-
-            Eba ^= Da;
-            BCa = Eba;
-            Ege ^= De;
-            BCe = ROL(Ege, 44);
-            Eki ^= Di;
-            BCi = ROL(Eki, 43);
-            Emo ^= Do;
-            BCo = ROL(Emo, 21);
-            Esu ^= Du;
-            BCu = ROL(Esu, 14);
-            Aba =   BCa ^((~BCe)&  BCi );
-            Aba ^= (tKeccakLane)KeccakF_RoundConstants[round+1];
-            Abe =   BCe ^((~BCi)&  BCo );
-            Abi =   BCi ^((~BCo)&  BCu );
-            Abo =   BCo ^((~BCu)&  BCa );
-            Abu =   BCu ^((~BCa)&  BCe );
-
-            Ebo ^= Do;
-            BCa = ROL(Ebo, 28);
-            Egu ^= Du;
-            BCe = ROL(Egu, 20);
-            Eka ^= Da;
-            BCi = ROL(Eka, 3);
-            Eme ^= De;
-            BCo = ROL(Eme, 45);
-            Esi ^= Di;
-            BCu = ROL(Esi, 61);
-            Aga =   BCa ^((~BCe)&  BCi );
-            Age =   BCe ^((~BCi)&  BCo );
-            Agi =   BCi ^((~BCo)&  BCu );
-            Ago =   BCo ^((~BCu)&  BCa );
-            Agu =   BCu ^((~BCa)&  BCe );
-
-            Ebe ^= De;
-            BCa = ROL(Ebe, 1);
-            Egi ^= Di;
-            BCe = ROL(Egi, 6);
-            Eko ^= Do;
-            BCi = ROL(Eko, 25);
-            Emu ^= Du;
-            BCo = ROL_mult8(Emu, 8);
-            Esa ^= Da;
-            BCu = ROL(Esa, 18);
-            Aka =   BCa ^((~BCe)&  BCi );
-            Ake =   BCe ^((~BCi)&  BCo );
-            Aki =   BCi ^((~BCo)&  BCu );
-            Ako =   BCo ^((~BCu)&  BCa );
-            Aku =   BCu ^((~BCa)&  BCe );
-
-            Ebu ^= Du;
-            BCa = ROL(Ebu, 27);
-            Ega ^= Da;
-            BCe = ROL(Ega, 36);
-            Eke ^= De;
-            BCi = ROL(Eke, 10);
-            Emi ^= Di;
-            BCo = ROL(Emi, 15);
-            Eso ^= Do;
-            BCu = ROL_mult8(Eso, 56);
-            Ama =   BCa ^((~BCe)&  BCi );
-            Ame =   BCe ^((~BCi)&  BCo );
-            Ami =   BCi ^((~BCo)&  BCu );
-            Amo =   BCo ^((~BCu)&  BCa );
-            Amu =   BCu ^((~BCa)&  BCe );
-
-            Ebi ^= Di;
-            BCa = ROL(Ebi, 62);
-            Ego ^= Do;
-            BCe = ROL(Ego, 55);
-            Eku ^= Du;
-            BCi = ROL(Eku, 39);
-            Ema ^= Da;
-            BCo = ROL(Ema, 41);
-            Ese ^= De;
-            BCu = ROL(Ese, 2);
-            Asa =   BCa ^((~BCe)&  BCi );
-            Ase =   BCe ^((~BCi)&  BCo );
-            Asi =   BCi ^((~BCo)&  BCu );
-            Aso =   BCo ^((~BCu)&  BCa );
-            Asu =   BCu ^((~BCa)&  BCe );
-        }
-
-        //copyToState(state, A)
-        state[ 0] = Aba;
-        state[ 1] = Abe;
-        state[ 2] = Abi;
-        state[ 3] = Abo;
-        state[ 4] = Abu;
-        state[ 5] = Aga;
-        state[ 6] = Age;
-        state[ 7] = Agi;
-        state[ 8] = Ago;
-        state[ 9] = Agu;
-        state[10] = Aka;
-        state[11] = Ake;
-        state[12] = Aki;
-        state[13] = Ako;
-        state[14] = Aku;
-        state[15] = Ama;
-        state[16] = Ame;
-        state[17] = Ami;
-        state[18] = Amo;
-        state[19] = Amu;
-        state[20] = Asa;
-        state[21] = Ase;
-        state[22] = Asi;
-        state[23] = Aso;
-        state[24] = Asu;
-
-        #undef    round
-    }
+	while ( --laneCount >= 0 ) {
+		state[laneCount] ^= in[laneCount];
+	}
+
+	{
+		tKeccakLane Aba, Abe, Abi, Abo, Abu;
+		tKeccakLane Aga, Age, Agi, Ago, Agu;
+		tKeccakLane Aka, Ake, Aki, Ako, Aku;
+		tKeccakLane Ama, Ame, Ami, Amo, Amu;
+		tKeccakLane Asa, Ase, Asi, Aso, Asu;
+		tKeccakLane BCa, BCe, BCi, BCo, BCu;
+		tKeccakLane Da, De, Di, Do, Du;
+		tKeccakLane Eba, Ebe, Ebi, Ebo, Ebu;
+		tKeccakLane Ega, Ege, Egi, Ego, Egu;
+		tKeccakLane Eka, Eke, Eki, Eko, Eku;
+		tKeccakLane Ema, Eme, Emi, Emo, Emu;
+		tKeccakLane Esa, Ese, Esi, Eso, Esu;
+		#define    round    laneCount
+
+		//copyFromState(A, state)
+		Aba = state[ 0];
+		Abe = state[ 1];
+		Abi = state[ 2];
+		Abo = state[ 3];
+		Abu = state[ 4];
+		Aga = state[ 5];
+		Age = state[ 6];
+		Agi = state[ 7];
+		Ago = state[ 8];
+		Agu = state[ 9];
+		Aka = state[10];
+		Ake = state[11];
+		Aki = state[12];
+		Ako = state[13];
+		Aku = state[14];
+		Ama = state[15];
+		Ame = state[16];
+		Ami = state[17];
+		Amo = state[18];
+		Amu = state[19];
+		Asa = state[20];
+		Ase = state[21];
+		Asi = state[22];
+		Aso = state[23];
+		Asu = state[24];
+
+		for( round = 0; round < cKeccakNumberOfRounds; round += 2 )
+		{
+			//    prepareTheta
+			BCa = Aba^Aga^Aka^Ama^Asa;
+			BCe = Abe^Age^Ake^Ame^Ase;
+			BCi = Abi^Agi^Aki^Ami^Asi;
+			BCo = Abo^Ago^Ako^Amo^Aso;
+			BCu = Abu^Agu^Aku^Amu^Asu;
+
+			//thetaRhoPiChiIotaPrepareTheta(round  , A, E)
+			Da = BCu^ROL(BCe, 1);
+			De = BCa^ROL(BCi, 1);
+			Di = BCe^ROL(BCo, 1);
+			Do = BCi^ROL(BCu, 1);
+			Du = BCo^ROL(BCa, 1);
+
+			Aba ^= Da;
+			BCa = Aba;
+			Age ^= De;
+			BCe = ROL(Age, 44);
+			Aki ^= Di;
+			BCi = ROL(Aki, 43);
+			Amo ^= Do;
+			BCo = ROL(Amo, 21);
+			Asu ^= Du;
+			BCu = ROL(Asu, 14);
+			Eba =   BCa ^((~BCe)&  BCi );
+			Eba ^= (tKeccakLane)KeccakF_RoundConstants[round];
+			Ebe =   BCe ^((~BCi)&  BCo );
+			Ebi =   BCi ^((~BCo)&  BCu );
+			Ebo =   BCo ^((~BCu)&  BCa );
+			Ebu =   BCu ^((~BCa)&  BCe );
+
+			Abo ^= Do;
+			BCa = ROL(Abo, 28);
+			Agu ^= Du;
+			BCe = ROL(Agu, 20);
+			Aka ^= Da;
+			BCi = ROL(Aka,  3);
+			Ame ^= De;
+			BCo = ROL(Ame, 45);
+			Asi ^= Di;
+			BCu = ROL(Asi, 61);
+			Ega =   BCa ^((~BCe)&  BCi );
+			Ege =   BCe ^((~BCi)&  BCo );
+			Egi =   BCi ^((~BCo)&  BCu );
+			Ego =   BCo ^((~BCu)&  BCa );
+			Egu =   BCu ^((~BCa)&  BCe );
+
+			Abe ^= De;
+			BCa = ROL(Abe,  1);
+			Agi ^= Di;
+			BCe = ROL(Agi,  6);
+			Ako ^= Do;
+			BCi = ROL(Ako, 25);
+			Amu ^= Du;
+			BCo = ROL_mult8(Amu,  8);
+			Asa ^= Da;
+			BCu = ROL(Asa, 18);
+			Eka =   BCa ^((~BCe)&  BCi );
+			Eke =   BCe ^((~BCi)&  BCo );
+			Eki =   BCi ^((~BCo)&  BCu );
+			Eko =   BCo ^((~BCu)&  BCa );
+			Eku =   BCu ^((~BCa)&  BCe );
+
+			Abu ^= Du;
+			BCa = ROL(Abu, 27);
+			Aga ^= Da;
+			BCe = ROL(Aga, 36);
+			Ake ^= De;
+			BCi = ROL(Ake, 10);
+			Ami ^= Di;
+			BCo = ROL(Ami, 15);
+			Aso ^= Do;
+			BCu = ROL_mult8(Aso, 56);
+			Ema =   BCa ^((~BCe)&  BCi );
+			Eme =   BCe ^((~BCi)&  BCo );
+			Emi =   BCi ^((~BCo)&  BCu );
+			Emo =   BCo ^((~BCu)&  BCa );
+			Emu =   BCu ^((~BCa)&  BCe );
+
+			Abi ^= Di;
+			BCa = ROL(Abi, 62);
+			Ago ^= Do;
+			BCe = ROL(Ago, 55);
+			Aku ^= Du;
+			BCi = ROL(Aku, 39);
+			Ama ^= Da;
+			BCo = ROL(Ama, 41);
+			Ase ^= De;
+			BCu = ROL(Ase,  2);
+			Esa =   BCa ^((~BCe)&  BCi );
+			Ese =   BCe ^((~BCi)&  BCo );
+			Esi =   BCi ^((~BCo)&  BCu );
+			Eso =   BCo ^((~BCu)&  BCa );
+			Esu =   BCu ^((~BCa)&  BCe );
+
+			//    prepareTheta
+			BCa = Eba^Ega^Eka^Ema^Esa;
+			BCe = Ebe^Ege^Eke^Eme^Ese;
+			BCi = Ebi^Egi^Eki^Emi^Esi;
+			BCo = Ebo^Ego^Eko^Emo^Eso;
+			BCu = Ebu^Egu^Eku^Emu^Esu;
+
+			//thetaRhoPiChiIotaPrepareTheta(round+1, E, A)
+			Da = BCu^ROL(BCe, 1);
+			De = BCa^ROL(BCi, 1);
+			Di = BCe^ROL(BCo, 1);
+			Do = BCi^ROL(BCu, 1);
+			Du = BCo^ROL(BCa, 1);
+
+			Eba ^= Da;
+			BCa = Eba;
+			Ege ^= De;
+			BCe = ROL(Ege, 44);
+			Eki ^= Di;
+			BCi = ROL(Eki, 43);
+			Emo ^= Do;
+			BCo = ROL(Emo, 21);
+			Esu ^= Du;
+			BCu = ROL(Esu, 14);
+			Aba =   BCa ^((~BCe)&  BCi );
+			Aba ^= (tKeccakLane)KeccakF_RoundConstants[round+1];
+			Abe =   BCe ^((~BCi)&  BCo );
+			Abi =   BCi ^((~BCo)&  BCu );
+			Abo =   BCo ^((~BCu)&  BCa );
+			Abu =   BCu ^((~BCa)&  BCe );
+
+			Ebo ^= Do;
+			BCa = ROL(Ebo, 28);
+			Egu ^= Du;
+			BCe = ROL(Egu, 20);
+			Eka ^= Da;
+			BCi = ROL(Eka, 3);
+			Eme ^= De;
+			BCo = ROL(Eme, 45);
+			Esi ^= Di;
+			BCu = ROL(Esi, 61);
+			Aga =   BCa ^((~BCe)&  BCi );
+			Age =   BCe ^((~BCi)&  BCo );
+			Agi =   BCi ^((~BCo)&  BCu );
+			Ago =   BCo ^((~BCu)&  BCa );
+			Agu =   BCu ^((~BCa)&  BCe );
+
+			Ebe ^= De;
+			BCa = ROL(Ebe, 1);
+			Egi ^= Di;
+			BCe = ROL(Egi, 6);
+			Eko ^= Do;
+			BCi = ROL(Eko, 25);
+			Emu ^= Du;
+			BCo = ROL_mult8(Emu, 8);
+			Esa ^= Da;
+			BCu = ROL(Esa, 18);
+			Aka =   BCa ^((~BCe)&  BCi );
+			Ake =   BCe ^((~BCi)&  BCo );
+			Aki =   BCi ^((~BCo)&  BCu );
+			Ako =   BCo ^((~BCu)&  BCa );
+			Aku =   BCu ^((~BCa)&  BCe );
+
+			Ebu ^= Du;
+			BCa = ROL(Ebu, 27);
+			Ega ^= Da;
+			BCe = ROL(Ega, 36);
+			Eke ^= De;
+			BCi = ROL(Eke, 10);
+			Emi ^= Di;
+			BCo = ROL(Emi, 15);
+			Eso ^= Do;
+			BCu = ROL_mult8(Eso, 56);
+			Ama =   BCa ^((~BCe)&  BCi );
+			Ame =   BCe ^((~BCi)&  BCo );
+			Ami =   BCi ^((~BCo)&  BCu );
+			Amo =   BCo ^((~BCu)&  BCa );
+			Amu =   BCu ^((~BCa)&  BCe );
+
+			Ebi ^= Di;
+			BCa = ROL(Ebi, 62);
+			Ego ^= Do;
+			BCe = ROL(Ego, 55);
+			Eku ^= Du;
+			BCi = ROL(Eku, 39);
+			Ema ^= Da;
+			BCo = ROL(Ema, 41);
+			Ese ^= De;
+			BCu = ROL(Ese, 2);
+			Asa =   BCa ^((~BCe)&  BCi );
+			Ase =   BCe ^((~BCi)&  BCo );
+			Asi =   BCi ^((~BCo)&  BCu );
+			Aso =   BCo ^((~BCu)&  BCa );
+			Asu =   BCu ^((~BCa)&  BCe );
+		}
+
+		//copyToState(state, A)
+		state[ 0] = Aba;
+		state[ 1] = Abe;
+		state[ 2] = Abi;
+		state[ 3] = Abo;
+		state[ 4] = Abu;
+		state[ 5] = Aga;
+		state[ 6] = Age;
+		state[ 7] = Agi;
+		state[ 8] = Ago;
+		state[ 9] = Agu;
+		state[10] = Aka;
+		state[11] = Ake;
+		state[12] = Aki;
+		state[13] = Ako;
+		state[14] = Aku;
+		state[15] = Ama;
+		state[16] = Ame;
+		state[17] = Ami;
+		state[18] = Amo;
+		state[19] = Amu;
+		state[20] = Asa;
+		state[21] = Ase;
+		state[22] = Asi;
+		state[23] = Aso;
+		state[24] = Asu;
+
+		#undef    round
+	}
 }
 
 // inlen kann 72...143 betragen
-__host__ void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen)
+__host__
+void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen)
+{
+	const unsigned char *in = (const unsigned char*)pdata;
+
+	tKeccakLane state[5 * 5];
+	unsigned char temp[cKeccakR_SizeInBytes];
+
+	memset( state, 0, sizeof(state) );
+
+	for ( /* empty */; inlen >= cKeccakR_SizeInBytes; inlen -= cKeccakR_SizeInBytes, in += cKeccakR_SizeInBytes)
+	{
+		KeccakF( state, (const tKeccakLane*)in, cKeccakR_SizeInBytes / sizeof(tKeccakLane) );
+	}
+
+	// Copy state of the first round (72 Bytes)
+	// in Constant Memory
+	cudaMemcpyToSymbol( c_State,
+						state,
+						sizeof(state),
+						0, cudaMemcpyHostToDevice);
+
+	// second part
+	memcpy(temp, in, inlen);
+	temp[inlen++] = 1;
+	memset(temp + inlen, 0, cKeccakR_SizeInBytes - inlen);
+	temp[cKeccakR_SizeInBytes-1] |= 0x80;
+
+	// Copy rest of the message in constant memory
+	cudaMemcpyToSymbol( c_PaddedMessage,
+						temp,
+						cKeccakR_SizeInBytes,
+						0, cudaMemcpyHostToDevice);
+}
+
+__global__
+void jackpot_keccak512_gpu_hash(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
+{
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint32_t nounce = startNounce + thread;
+
+		int hashPosition = nounce - startNounce;
+
+		uint32_t message[18];
+		#pragma unroll 18
+		for(int i=0;i<18;i++)
+			message[i] = c_PaddedMessage[i];
+
+		message[1] = cuda_swab32(nounce);
+
+		// State init
+		uint64_t keccak_gpu_state[25];
+		#pragma unroll 25
+		for (int i=0; i<25; i++)
+			keccak_gpu_state[i] = c_State[i];
+
+		// den Block einmal gut durchschütteln
+		keccak_block(keccak_gpu_state, message, c_keccak_round_constants);
+
+		uint32_t hash[16];
+
+		#pragma unroll 8
+		for (size_t i = 0; i < 64; i += 8) {
+			U64TO32_LE((&hash[i/4]), keccak_gpu_state[i / 8]);
+		}
+
+		// copy hash
+		uint32_t *outpHash = (uint32_t*)&g_hash[8 * hashPosition];
+
+		#pragma unroll 16
+		for(int i=0;i<16;i++)
+			outpHash[i] = hash[i];
+	}
+}
+
+__host__
+void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order)
 {
-    const unsigned char *in = (const unsigned char*)pdata;
+	const uint32_t threadsperblock = 256;
 
-    tKeccakLane    state[5 * 5];
-    unsigned char temp[cKeccakR_SizeInBytes];
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
 
-    memset( state, 0, sizeof(state) );
+	size_t shared_size = 0;
 
-    for ( /* empty */; inlen >= cKeccakR_SizeInBytes; inlen -= cKeccakR_SizeInBytes, in += cKeccakR_SizeInBytes )
-    {
-        KeccakF( state, (const tKeccakLane*)in, cKeccakR_SizeInBytes / sizeof(tKeccakLane) );
-    }
+	jackpot_keccak512_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash);
+	MyStreamSynchronize(NULL, order, thr_id);
+}
 
-    // Kopiere den state nach der ersten Runde (nach Absorption von 72 Bytes Inputdaten)
-    // ins Constant Memory
-    cudaMemcpyToSymbol( c_State,
-                        state,
-                        sizeof(state),
-                        0, cudaMemcpyHostToDevice);
 
-    //    padding
-    memcpy( temp, in, (size_t)inlen );
-    temp[inlen++] = 1;
-    memset( temp+inlen, 0, cKeccakR_SizeInBytes - (size_t)inlen );
-    temp[cKeccakR_SizeInBytes-1] |= 0x80;
+/* zr5 keccak, no nonce swab32 */
 
+__global__
+void zr5_keccak512_gpu_hash(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
+{
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint32_t nounce = startNounce + thread;
+		uint32_t message[18];
+
+		#pragma unroll 18
+		for(int i=0; i<18; i++)
+			message[i] = c_PaddedMessage[i];
+
+		message[1] = nounce;
+
+		// Get mid-state
+		uint64_t keccak_gpu_state[25];
+		#pragma unroll 25
+		for (int i=0; i<25; i++)
+			keccak_gpu_state[i] = c_State[i];
+
+		keccak_block(keccak_gpu_state, message, c_keccak_round_constants);
+
+		uint32_t hash[16];
+
+		#pragma unroll 8
+		for (int i = 0; i < 8; i++) {
+			U64TO32_LE((&hash[i*2]), keccak_gpu_state[i]);
+		}
+
+		// Output (64 bytes hash required)
+		uint32_t hashPosition = nounce - startNounce;
+		//uint32_t *outpHash = (uint32_t*) (&g_hash[hashPosition*8]);
+		//#pragma unroll 16
+		//for(int i=0; i<16; i++)
+		//	outpHash[i] = hash[i];
+
+		uint4 *outpHash = (uint4*) (&g_hash[hashPosition*8]);
+		uint4 *psrc = (uint4*) hash;
+		outpHash[0] = psrc[0];
+		outpHash[1] = psrc[1];
+		outpHash[2] = psrc[2];
+		outpHash[3] = psrc[3];
+	}
+}
+
+__host__
+void zr5_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash)
+{
+	const uint32_t threadsperblock = 256;
 
-    // Kopiere den Rest der Message und das Padding ins Constant Memory
-    cudaMemcpyToSymbol( c_PaddedMessage,
-                        temp,
-                        cKeccakR_SizeInBytes,
-                        0, cudaMemcpyHostToDevice);
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+
+	zr5_keccak512_gpu_hash<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash);
+	MyStreamSynchronize(NULL, 0, thr_id);
 }
 
-__host__ void jackpot_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order)
+/* required for the second hash part of zr5 */
+
+__global__
+void zr5_keccak512_gpu_hash_pok(uint32_t threads, uint32_t startNounce, uint32_t *g_hash, uint16_t *d_pokh, uint32_t version)
 {
-    const uint32_t threadsperblock = 256;
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint32_t nounce = startNounce + thread;
+
+		//uint32_t hashPosition = thread * 16;
+		uint32_t *prevHash = &g_hash[thread * 16]; // thread * 64 / sizeof(uint32_t)
+		uint32_t message[18]; /* 72 bytes */
+
+		// pok - hash[0] from prev hash
+		message[0] = version | (prevHash[0] & POK_DATA_MASK);
+		// save pok
+		d_pokh[thread] = (uint16_t) (message[0] / 0x10000);
+		for (int i=1; i<18; i++) {
+			message[i]=d_OriginalData[i];
+		}
+
+		// first bloc
+		uint64_t keccak_gpu_state[25] = { 0 };
+		keccak_block(keccak_gpu_state, message, c_keccak_round_constants);
+
+		// second bloc
+		message[0] = d_OriginalData[18];
+		message[1] = nounce; //cuda_swab32(nounce);
+		message[2] = 1;
+
+		#pragma unroll
+		for(int i=3; i<17; i++)
+			message[i] = 0;
+
+		message[17] = 0x80000000UL;
+
+		keccak_block(keccak_gpu_state, message, c_keccak_round_constants);
+
+		uint32_t hash[16];
+
+		#pragma unroll 8
+		for (size_t i = 0; i < 8; i++) {
+			U64TO32_LE((&hash[i*2]), keccak_gpu_state[i]);
+		}
+
+		//uint32_t *outpHash = &g_hash[thread * 16];
+		//#pragma unroll 16
+		//for(int i=0; i<16; i++)
+		//	outpHash[i] = hash[i];
+
+		uint4 *outpHash = (uint4*) (&g_hash[thread * 16]);
+		uint4 *psrc = (uint4*) hash;
+		outpHash[0] = psrc[0];
+		outpHash[1] = psrc[1];
+		outpHash[2] = psrc[2];
+		outpHash[3] = psrc[3];
+	}
+}
 
-    // berechne wie viele Thread Blocks wir brauchen
-    dim3 grid((threads + threadsperblock-1)/threadsperblock);
-    dim3 block(threadsperblock);
+__host__
+void zr5_keccak512_cpu_hash_pok(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t* pdata, uint32_t *d_hash, uint16_t *d_poks)
+{
+	const uint32_t threadsperblock = 256;
+	const uint32_t version = pdata[0] & (~POK_DATA_MASK);
 
-    // Größe des dynamischen Shared Memory Bereichs
-    size_t shared_size = 0;
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
 
-    jackpot_keccak512_gpu_hash<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash);
-    MyStreamSynchronize(NULL, order, thr_id);
+	cudaMemcpyToSymbol(d_OriginalData, pdata, sizeof(d_OriginalData), 0, cudaMemcpyHostToDevice);
+	zr5_keccak512_gpu_hash_pok<<<grid, block>>>(threads, startNounce, d_hash, d_poks, version);
+	MyStreamSynchronize(NULL, 10, thr_id);
 }
diff --git a/Makefile.am b/Makefile.am
index 21641f9..46e7f3c 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -41,7 +41,7 @@ ccminer_SOURCES	= elist.h miner.h compat.h \
 			  quark/cuda_bmw512.cu quark/cuda_quark_keccak512.cu \
 			  quark/quarkcoin.cu quark/animecoin.cu \
 			  quark/cuda_quark_compactionTest.cu  \
-			  cuda_nist5.cu pentablake.cu \
+			  cuda_nist5.cu pentablake.cu zr5.cu \
 			  sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \
 			  sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
 			  sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \
diff --git a/README.txt b/README.txt
index 8d4e17a..eb23210 100644
--- a/README.txt
+++ b/README.txt
@@ -1,5 +1,5 @@
 
-ccMiner release 1.6.0-tpruvot (Mar 2015) - "Pluck & Whirlpoolx"
+ccMiner release 1.6.0-tpruvot (Mar 2015) - "ZR5, Pluck & WhirlX"
 ---------------------------------------------------------------
 
 ***************************************************************
@@ -37,8 +37,9 @@ BlakeCoin (256 8-rounds)
 Keccak (Maxcoin)
 Deep, Doom and Qubit
 Pentablake (Blake 512 x5)
-S3 (OneCoin)
-Lyra2RE (new VertCoin algo)
+1Coin Triple S
+Vertcoin Lyra2RE
+Ziftrcoin (ZR5)
 
 where some of these coins have a VERY NOTABLE nVidia advantage
 over competing AMD (OpenCL Only) implementations.
@@ -84,6 +85,7 @@ its command line interface and options.
                           x14         use to mine X14Coin
                           x15         use to mine Halcyon
                           x17         use to mine X17
+                          zr5         use to mine ZiftrCoin
 
   -d, --devices         gives a comma separated list of CUDA device IDs
                         to operate on. Device IDs start counting from 0!
@@ -183,12 +185,12 @@ features.
 
 >>> RELEASE HISTORY <<<
 
-  Mar. 2015       v1.6.0 (Note for CryptoMiningBlog: NOT YET RELEASED/FINISHED!)
+  Mar. 27th 2015  v1.6.0
+                  Add the ZR5 Algo for Ziftcoin
                   Import pluck (djm34) and whirlpoolx (alexis78) algos
                   Hashrate units based on hashing rate values (Hs/kHs/MHs/GHs)
                   Default config file (also help to debug without command line)
                   Various small fixes
-                  More to come soon...
 
   Feb. 11th 2015  v1.5.3
                   Fix anime algo
diff --git a/ccminer.cpp b/ccminer.cpp
index 87c016d..e67e167 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -108,6 +108,7 @@ enum sha_algos {
 	ALGO_X14,
 	ALGO_X15,
 	ALGO_X17,
+	ALGO_ZR5,
 };
 
 static const char *algo_names[] = {
@@ -140,6 +141,7 @@ static const char *algo_names[] = {
 	"x14",
 	"x15",
 	"x17",
+	"zr5",
 };
 
 bool opt_debug = false;
@@ -166,7 +168,7 @@ static const bool opt_time = true;
 static enum sha_algos opt_algo = ALGO_X11;
 int opt_n_threads = 0;
 int opt_affinity = -1;
-int opt_priority = 3;
+int opt_priority = 0;
 static double opt_difficulty = 1; // CH
 bool opt_trust_pool = false;
 uint16_t opt_vote = 9999;
@@ -193,6 +195,7 @@ int api_thr_id = -1;
 bool stratum_need_reset = false;
 struct work_restart *work_restart = NULL;
 struct stratum_ctx stratum = { 0 };
+uint32_t zr5_pok = 0;
 
 pthread_mutex_t applog_lock;
 static pthread_mutex_t stats_lock;
@@ -254,6 +257,7 @@ Options:\n\
 			x17         X17 (peoplecurrency)\n\
 			whirl       Whirlcoin (old whirlpool)\n\
 			whirlpoolx  Vanilla coin\n\
+			zr5         ZR5 (ZiftrCoin)\n\
   -d, --devices         Comma separated list of CUDA devices to use.\n\
                         Device IDs start counting from 0! Alternatively takes\n\
                         string names of your cards like gtx780ti or gt640#2\n\
@@ -472,6 +476,10 @@ static bool work_decode(const json_t *val, struct work *work)
 	int adata_sz = ARRAY_SIZE(work->data), atarget_sz = ARRAY_SIZE(work->target);
 	int i;
 
+	if (opt_algo == ALGO_ZR5) {
+		data_size = 80; adata_sz = 20;
+	}
+
 	if (unlikely(!jobj_binary(val, "data", work->data, data_size))) {
 		applog(LOG_ERR, "JSON inval data");
 		return false;
@@ -564,12 +572,12 @@ static int share_result(int result, const char *reason)
 
 	if (reason) {
 		applog(LOG_WARNING, "reject reason: %s", reason);
-		if (strncmp(reason, "low difficulty share", 20) == 0) {
+		if (strncasecmp(reason, "low difficulty", 14) == 0) {
 			opt_difficulty = (opt_difficulty * 2.0) / 3.0;
 			applog(LOG_WARNING, "factor reduced to : %0.2f", opt_difficulty);
 			return 0;
 		}
-		if (strncmp(reason, "Duplicate share", 15) == 0 && !check_dups) {
+		if (strncasecmp(reason, "duplicate", 9) == 0 && !check_dups) {
 			applog(LOG_WARNING, "enabling duplicates check feature");
 			check_dups = true;
 		}
@@ -603,7 +611,11 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
 		}
 	}
 
-	if (stale_work) {
+	if (opt_algo == ALGO_ZR5 && !stale_work) {
+		stale_work = (memcmp(&work->data[1], &g_work.data[1], 68));
+	}
+
+	if (!submit_old && stale_work) {
 		if (opt_debug)
 			applog(LOG_WARNING, "stale work detected, discarding");
 		return true;
@@ -616,9 +628,16 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
 		uint16_t nvote;
 		char *ntimestr, *noncestr, *xnonce2str, *nvotestr;
 
-		le32enc(&ntime, work->data[17]);
-		le32enc(&nonce, work->data[19]);
-
+		switch (opt_algo) {
+		case ALGO_ZR5:
+			check_dups = true;
+			be32enc(&ntime, work->data[17]);
+			be32enc(&nonce, work->data[19]);
+			break;
+		default:
+			le32enc(&ntime, work->data[17]);
+			le32enc(&nonce, work->data[19]);
+		}
 		noncestr = bin2hex((const uchar*)(&nonce), 4);
 
 		if (check_dups)
@@ -666,14 +685,21 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
 
 	} else {
 
+		int data_size = sizeof(work->data);
+		int adata_sz = ARRAY_SIZE(work->data);
+
 		/* build hex string */
 		char *str = NULL;
 
+		if (opt_algo == ALGO_ZR5) {
+			data_size = 80; adata_sz = 20;
+		}
+
 		if (opt_algo != ALGO_HEAVY && opt_algo != ALGO_MJOLLNIR) {
-			for (int i = 0; i < ARRAY_SIZE(work->data); i++)
+			for (int i = 0; i < adata_sz; i++)
 				le32enc(work->data + i, work->data[i]);
 		}
-		str = bin2hex((uchar*)work->data, sizeof(work->data));
+		str = bin2hex((uchar*)work->data, data_size);
 		if (unlikely(!str)) {
 			applog(LOG_ERR, "submit_upstream_work OOM");
 			return false;
@@ -1098,10 +1124,18 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 		work->data[9 + i] = be32dec((uint32_t *)merkle_root + i);
 	work->data[17] = le32dec(sctx->job.ntime);
 	work->data[18] = le32dec(sctx->job.nbits);
-	if (opt_algo == ALGO_MJOLLNIR || opt_algo == ALGO_HEAVY)
-	{
+
+	switch (opt_algo) {
+	case ALGO_MJOLLNIR:
+	case ALGO_HEAVY:
+		// todo: check if 19 is enough
 		for (i = 0; i < 20; i++)
 			work->data[i] = be32dec((uint32_t *)&work->data[i]);
+		break;
+	case ALGO_ZR5:
+		for (i = 0; i < 19; i++)
+			work->data[i] = be32dec((uint32_t *)&work->data[i]);
+		break;
 	}
 
 	work->data[20] = 0x80000000;
@@ -1227,6 +1261,7 @@ static void *miner_thread(void *userdata)
 
 		// &work.data[19]
 		int wcmplen = 76;
+		int wcmpoft = 0;
 		uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
 
 		if (have_stratum) {
@@ -1284,7 +1319,14 @@ static void *miner_thread(void *userdata)
 					hashlog_purge_job(work.job_id);
 			}
 		}
-		if (memcmp(work.data, g_work.data, wcmplen)) {
+
+		if (opt_algo == ALGO_ZR5) {
+			// ignore pok/version header
+			wcmpoft = 1;
+			wcmplen -= 4;
+		}
+
+		if (memcmp(&work.data[wcmpoft], &g_work.data[wcmpoft], wcmplen)) {
 			#if 0
 			if (opt_debug) {
 				for (int n=0; n <= (wcmplen-8); n+=8) {
@@ -1497,19 +1539,24 @@ static void *miner_thread(void *userdata)
 
 		case ALGO_X14:
 			rc = scanhash_x14(thr_id, work.data, work.target,
-				                  max_nonce, &hashes_done);
+				              max_nonce, &hashes_done);
 			break;
 
 		case ALGO_X15:
 			rc = scanhash_x15(thr_id, work.data, work.target,
-				                  max_nonce, &hashes_done);
+				              max_nonce, &hashes_done);
 			break;
 
 		case ALGO_X17:
 			rc = scanhash_x17(thr_id, work.data, work.target,
-				                  max_nonce, &hashes_done);
+				              max_nonce, &hashes_done);
 			break;
 
+		case ALGO_ZR5: {
+			rc = scanhash_zr5(thr_id, work.data, work.target,
+			                      max_nonce, &hashes_done);
+			break;
+		}
 		default:
 			/* should never happen */
 			goto out;
@@ -1606,6 +1653,11 @@ static void *miner_thread(void *userdata)
 			if (rc > 1 && work.data[21]) {
 				work.data[19] = work.data[21];
 				work.data[21] = 0;
+				if (opt_algo == ALGO_ZR5) {
+					// todo: use + 4..6 index for pok to allow multiple nonces
+					work.data[0] = work.data[22]; // pok
+					work.data[22] = 0;
+				}
 				if (!submit_work(mythr, &work))
 					break;
 			}
@@ -1675,10 +1727,10 @@ start:
 			submit_old = soval ? json_is_true(soval) : false;
 			pthread_mutex_lock(&g_work_lock);
 			if (work_decode(json_object_get(val, "result"), &g_work)) {
+				restart_threads();
 				if (!opt_quiet)
 					applog(LOG_BLUE, "%s detected new block", short_url);
 				g_work_time = time(NULL);
-				restart_threads();
 			}
 			pthread_mutex_unlock(&g_work_lock);
 			json_decref(val);
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index 0311a35..d917465 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -105,8 +105,9 @@
       <CInterleavedPTX>false</CInterleavedPTX>
       <MaxRegCount>80</MaxRegCount>
       <PtxAsOptionV>true</PtxAsOptionV>
-      <Keep>false</Keep>
-      <CodeGeneration>compute_30,sm_30;compute_50,sm_50</CodeGeneration>
+      <Keep>true</Keep>
+      <CodeGeneration>compute_50,sm_50</CodeGeneration>
+      <GenerateLineInfo>true</GenerateLineInfo>
     </CudaCompile>
   </ItemDefinitionGroup>
   <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
@@ -176,8 +177,8 @@
       <CInterleavedPTX>false</CInterleavedPTX>
       <MaxRegCount>80</MaxRegCount>
       <PtxAsOptionV>true</PtxAsOptionV>
-      <Keep>false</Keep>
-      <CodeGeneration>compute_50,sm_50;</CodeGeneration>
+      <Keep>true</Keep>
+      <CodeGeneration>compute_30,sm_30;compute_35,sm_35;compute_50,sm_50;compute_52,sm_52</CodeGeneration>
       <AdditionalOptions>--ptxas-options="-O2" %(AdditionalOptions)</AdditionalOptions>
       <Defines>
       </Defines>
@@ -221,7 +222,7 @@
       <MaxRegCount>80</MaxRegCount>
       <PtxAsOptionV>true</PtxAsOptionV>
       <Keep>false</Keep>
-      <CodeGeneration>compute_50,sm_50</CodeGeneration>
+      <CodeGeneration>compute_50,sm_50;compute_52,sm_52</CodeGeneration>
       <Include>
       </Include>
       <TargetMachinePlatform>64</TargetMachinePlatform>
@@ -355,6 +356,7 @@
     </CudaCompile>
     <CudaCompile Include="cuda_nist5.cu">
     </CudaCompile>
+    <CudaCompile Include="zr5.cu" />
     <CudaCompile Include="groestl_functions_quad.cu">
       <ExcludedFromBuild>true</ExcludedFromBuild>
     </CudaCompile>
@@ -509,4 +511,4 @@
   <Target Name="AfterClean">
     <Delete Files="@(FilesToCopy->'$(OutDir)%(Filename)%(Extension)')" TreatErrorsAsWarnings="true" />
   </Target>
-</Project>
\ No newline at end of file
+</Project>
diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
index e6173c2..376e847 100644
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@@ -379,9 +379,6 @@
     <CudaCompile Include="cuda_groestlcoin.cu">
       <Filter>Source Files\CUDA</Filter>
     </CudaCompile>
-    <CudaCompile Include="groestl_functions_quad.cu">
-      <Filter>Source Files\CUDA</Filter>
-    </CudaCompile>
     <CudaCompile Include="bitslice_transformations_quad.cu">
       <Filter>Source Files\CUDA</Filter>
     </CudaCompile>
@@ -571,6 +568,12 @@
     <CudaCompile Include="lyra2\lyra2RE.cu">
       <Filter>Source Files\CUDA</Filter>
     </CudaCompile>
+    <CudaCompile Include="zr5.cu">
+      <Filter>Source Files\CUDA</Filter>
+    </CudaCompile>
+    <CudaCompile Include="groestl_functions_quad.cu">
+      <Filter>Source Files\CUDA\quark</Filter>
+    </CudaCompile>
   </ItemGroup>
   <ItemGroup>
     <Image Include="res\ccminer.ico">
diff --git a/configure.ac b/configure.ac
index fe99b0c..fd49c12 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([ccminer], [1.6-git])
+AC_INIT([ccminer], [1.6])
 
 AC_PREREQ([2.59c])
 AC_CANONICAL_SYSTEM
diff --git a/cpuminer-config.h b/cpuminer-config.h
index 225d7d5..c3b94c4 100644
--- a/cpuminer-config.h
+++ b/cpuminer-config.h
@@ -63,7 +63,7 @@
 #define HAVE_STRING_H 1
 
 /* Define to 1 if you have the <syslog.h> header file. */
-#define HAVE_SYSLOG_H 1
+/* #undef HAVE_SYSLOG_H */
 
 /* Define to 1 if you have the <sys/endian.h> header file. */
 /* #undef HAVE_SYS_ENDIAN_H */
@@ -108,7 +108,7 @@
 /* #undef LIBCURL_FEATURE_SSPI */
 
 /* Defined if libcurl supports DICT */
-#define LIBCURL_PROTOCOL_DICT 1
+/* #undef LIBCURL_PROTOCOL_DICT */
 
 /* Defined if libcurl supports FILE */
 #define LIBCURL_PROTOCOL_FILE 1
@@ -123,28 +123,28 @@
 #define LIBCURL_PROTOCOL_HTTP 1
 
 /* Defined if libcurl supports HTTPS */
-#define LIBCURL_PROTOCOL_HTTPS 1
+/* #undef LIBCURL_PROTOCOL_HTTPS */
 
 /* Defined if libcurl supports IMAP */
-#define LIBCURL_PROTOCOL_IMAP 1
+/* #undef LIBCURL_PROTOCOL_IMAP */
 
 /* Defined if libcurl supports LDAP */
-#define LIBCURL_PROTOCOL_LDAP 1
+/* #undef LIBCURL_PROTOCOL_LDAP */
 
 /* Defined if libcurl supports POP3 */
-#define LIBCURL_PROTOCOL_POP3 1
+/* #undef LIBCURL_PROTOCOL_POP3 */
 
 /* Defined if libcurl supports RTSP */
-#define LIBCURL_PROTOCOL_RTSP 1
+/* #undef LIBCURL_PROTOCOL_RTSP */
 
 /* Defined if libcurl supports SMTP */
-#define LIBCURL_PROTOCOL_SMTP 1
+/* #undef LIBCURL_PROTOCOL_SMTP */
 
 /* Defined if libcurl supports TELNET */
-#define LIBCURL_PROTOCOL_TELNET 1
+/* #undef LIBCURL_PROTOCOL_TELNET */
 
 /* Defined if libcurl supports TFTP */
-#define LIBCURL_PROTOCOL_TFTP 1
+/* #undef LIBCURL_PROTOCOL_TFTP */
 
 /* Define to 1 if your C compiler doesn't accept -c and -o together. */
 /* #undef NO_MINUS_C_MINUS_O */
@@ -159,16 +159,16 @@
 #define PACKAGE_NAME "ccminer"
 
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "ccminer 1.6-git"
+#define PACKAGE_STRING "ccminer 1.6"
 
 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "ccminer"
 
 /* Define to the home page for this package. */
-#define PACKAGE_URL ""
+#define PACKAGE_URL "http://github.com/tpruvot/ccminer"
 
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "1.6-git"
+#define PACKAGE_VERSION "1.6"
 
 /* If using the C implementation of alloca, define if you know the
    direction of stack growth for your system; otherwise it will be
@@ -191,10 +191,10 @@
 #define USE_XOP 1
 
 /* Version number of package */
-#define VERSION "1.6-git"
+#define VERSION "1.6"
 
 /* Define curl_free() as free() if our version of curl lacks curl_free. */
 /* #undef curl_free */
 
 /* Define to `unsigned int' if <sys/types.h> does not define. */
-/* #undef size_t */
+//#define size_t unsigned int
diff --git a/miner.h b/miner.h
index 6483b15..bb660ed 100644
--- a/miner.h
+++ b/miner.h
@@ -378,6 +378,10 @@ extern int scanhash_whirlpoolx(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
 
+extern int scanhash_zr5(int thr_id, uint32_t *pdata,
+	const uint32_t *ptarget, uint32_t max_nonce,
+	unsigned long *hashes_done);
+
 /* api related */
 void *api_thread(void *userdata);
 void api_set_throughput(int thr_id, uint32_t throughput);
@@ -679,6 +683,7 @@ void x13hash(void *output, const void *input);
 void x14hash(void *output, const void *input);
 void x15hash(void *output, const void *input);
 void x17hash(void *output, const void *input);
+void zr5hash(void *output, const void *input);
 
 #ifdef __cplusplus
 }
diff --git a/quark/cuda_quark_groestl512.cu b/quark/cuda_quark_groestl512.cu
index 6381af7..ba69214 100644
--- a/quark/cuda_quark_groestl512.cu
+++ b/quark/cuda_quark_groestl512.cu
@@ -52,11 +52,22 @@ void quark_groestl512_gpu_hash_64_quad(uint32_t threads, uint32_t startNounce, u
         uint32_t hash[16];
         from_bitslice_quad(state, hash);
 
-        if (thr == 0)
-        {
+
+        // uint4 = 4x4 uint32_t = 16 bytes
+        if (thr == 0) {
+            uint4 *phash = (uint4*) hash;
+            uint4 *outpt = (uint4*) outpHash; /* var kept for hash align */
+            outpt[0] = phash[0];
+            outpt[1] = phash[1];
+            outpt[2] = phash[2];
+            outpt[3] = phash[3];
+        }
+/*
+        if (thr == 0) {
             #pragma unroll
             for(int k=0;k<16;k++) outpHash[k] = hash[k];
         }
+*/
     }
 #endif
 }
diff --git a/util.cpp b/util.cpp
index 59f5e9d..b46adaa 100644
--- a/util.cpp
+++ b/util.cpp
@@ -1660,12 +1660,35 @@ extern void applog_hash(uchar *hash)
 #define printpfx(n,h) \
 	printf("%s%11s%s: %s\n", CL_GRN, n, CL_N, format_hash(s, h))
 
+static uint32_t zrtest[20] = {
+	swab32(0x01806486),
+	swab32(0x00000000),
+	swab32(0x00000000),
+	swab32(0x00000000),
+	swab32(0x00000000),
+	swab32(0x00000000),
+	swab32(0x00000000),
+	swab32(0x00000000),
+	swab32(0x00000000),
+	swab32(0x2ab03251),
+	swab32(0x87d4f28b),
+	swab32(0x6e22f086),
+	swab32(0x4845ddd5),
+	swab32(0x0ac4e6aa),
+	swab32(0x22a1709f),
+	swab32(0xfb4275d9),
+	swab32(0x25f26636),
+	swab32(0x300eed54),
+	swab32(0xffff0f1e),
+	swab32(0x2a9e2300),
+};
+
 void do_gpu_tests(void)
 {
 #ifdef _DEBUG
 	unsigned long done;
 	char s[128] = { '\0' };
-	uchar buf[128];
+	uchar buf[160];
 	uint32_t tgt[8] = { 0 };
 
 	opt_tracegpu = true;
@@ -1674,11 +1697,15 @@ void do_gpu_tests(void)
 	tgt[7] = 0xffff;
 
 	memset(buf, 0, sizeof buf);
-	scanhash_x11(0, (uint32_t*)buf, tgt, 1, &done);
+	//memcpy(buf, zrtest, 80);
+	scanhash_zr5(0, (uint32_t*)buf, tgt, zrtest[19]+1, &done);
+
+	//memset(buf, 0, sizeof buf);
+	//scanhash_x11(0, (uint32_t*)buf, tgt, 1, &done);
 
 	memset(buf, 0, sizeof buf);
 	// buf[0] = 1; buf[64] = 2; // for endian tests
-	scanhash_blake256(0, (uint32_t*)buf, tgt, 1, &done, 14);
+	//scanhash_blake256(0, (uint32_t*)buf, tgt, 1, &done, 14);
 
 	//memset(buf, 0, sizeof buf);
 	//scanhash_heavy(0, (uint32_t*)buf, tgt, 1, &done, 1, 84); // HEAVYCOIN_BLKHDR_SZ=84
@@ -1688,6 +1715,7 @@ void do_gpu_tests(void)
 	opt_tracegpu = false;
 #endif
 }
+extern "C" void zr5hash_pok(void *output, uint32_t *pdata);
 
 void print_hash_tests(void)
 {
@@ -1782,6 +1810,11 @@ void print_hash_tests(void)
 	x17hash(&hash[0], &buf[0]);
 	printpfx("X17", hash);
 
+	//memcpy(buf, zrtest, 80);
+	zr5hash(&hash[0], &buf[0]);
+	//zr5hash_pok(&hash[0], (uint32_t*) &buf[0]);
+	printpfx("ZR5", hash);
+
 	printf("\n");
 
 	do_gpu_tests();
diff --git a/zr5.cu b/zr5.cu
new file mode 100644
index 0000000..4196cfb
--- /dev/null
+++ b/zr5.cu
@@ -0,0 +1,342 @@
+/* Ziftrcoin ZR5 CUDA Implementation, (c) tpruvot 2015 */
+
+extern "C" {
+#include "sph/sph_blake.h"
+#include "sph/sph_groestl.h"
+#include "sph/sph_skein.h"
+#include "sph/sph_jh.h"
+#include "sph/sph_keccak.h"
+}
+
+#include "miner.h"
+#include "cuda_helper.h"
+
+#include <stdio.h>
+#include <memory.h>
+
+#define ZR_BLAKE   0
+#define ZR_GROESTL 1
+#define ZR_JH512   2
+#define ZR_SKEIN   3
+
+#define POK_BOOL_MASK 0x00008000
+#define POK_DATA_MASK 0xFFFF0000
+
+static uint32_t* d_hash[MAX_GPUS];
+static uint16_t* d_pokh[MAX_GPUS];
+static uint16_t* h_poks[MAX_GPUS];
+
+static uint32_t* d_blake[MAX_GPUS];
+static uint32_t* d_groes[MAX_GPUS];
+static uint32_t* d_jh512[MAX_GPUS];
+static uint32_t* d_skein[MAX_GPUS];
+
+__constant__ uint8_t d_permut[24][4];
+static const uint8_t permut[24][4] = {
+	{0, 1, 2, 3},
+	{0, 1, 3, 2},
+	{0, 2, 1, 3},
+	{0, 2, 3, 1},
+	{0, 3, 1, 2},
+	{0, 3, 2, 1},
+	{1, 0, 2, 3},
+	{1, 0, 3, 2},
+	{1, 2, 0, 3},
+	{1, 2, 3, 0},
+	{1, 3, 0, 2},
+	{1, 3, 2, 0},
+	{2, 0, 1, 3},
+	{2, 0, 3, 1},
+	{2, 1, 0, 3},
+	{2, 1, 3, 0},
+	{2, 3, 0, 1},
+	{2, 3, 1, 0},
+	{3, 0, 1, 2},
+	{3, 0, 2, 1},
+	{3, 1, 0, 2},
+	{3, 1, 2, 0},
+	{3, 2, 0, 1},
+	{3, 2, 1, 0}
+};
+
+// CPU HASH
+extern "C" void zr5hash(void *output, const void *input)
+{
+	sph_keccak512_context ctx_keccak;
+	sph_blake512_context ctx_blake;
+	sph_groestl512_context ctx_groestl;
+	sph_jh512_context ctx_jh;
+	sph_skein512_context ctx_skein;
+
+	uchar _ALIGN(64) hash[64];
+	uint32_t *phash = (uint32_t *) hash;
+	uint32_t norder;
+
+	sph_keccak512_init(&ctx_keccak);
+	sph_keccak512(&ctx_keccak, (const void*) input, 80);
+	sph_keccak512_close(&ctx_keccak, (void*) phash);
+
+	norder = phash[0] % ARRAY_SIZE(permut); /* % 24 */
+
+	for(int i = 0; i < 4; i++)
+	{
+		switch (permut[norder][i]) {
+		case ZR_BLAKE:
+			sph_blake512_init(&ctx_blake);
+			sph_blake512(&ctx_blake, (const void*) phash, 64);
+			sph_blake512_close(&ctx_blake, phash);
+			break;
+		case ZR_GROESTL:
+			sph_groestl512_init(&ctx_groestl);
+			sph_groestl512(&ctx_groestl, (const void*) phash, 64);
+			sph_groestl512_close(&ctx_groestl, phash);
+			break;
+		case ZR_JH512:
+			sph_jh512_init(&ctx_jh);
+			sph_jh512(&ctx_jh, (const void*) phash, 64);
+			sph_jh512_close(&ctx_jh, phash);
+			break;
+		case ZR_SKEIN:
+			sph_skein512_init(&ctx_skein);
+			sph_skein512(&ctx_skein, (const void*) phash, 64);
+			sph_skein512_close(&ctx_skein, phash);
+			break;
+		default:
+			break;
+		}
+	}
+	memcpy(output, phash, 32);
+}
+
+extern "C" void zr5hash_pok(void *output, uint32_t *pdata)
+{
+	const uint32_t version = pdata[0] & (~POK_DATA_MASK);
+	uint32_t _ALIGN(64) hash[8];
+
+	pdata[0] = version;
+	zr5hash(hash, pdata);
+
+	// fill PoK
+	pdata[0] = version | (hash[0] & POK_DATA_MASK);
+	zr5hash(hash, pdata);
+
+	memcpy(output, hash, 32);
+}
+
+__global__
+void zr5_copy_round_data_gpu(uint32_t threads, uint32_t *d_hash, uint32_t* d_blake, uint32_t* d_groes, uint32_t* d_jh512, uint32_t* d_skein, int rnd)
+{
+	// copy 64 bytes hash in the right algo buffer
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		const uint64_t offset = thread * 64 / 4;
+		uint32_t *phash = &d_hash[offset];
+		// algos hash order
+		uint32_t norder = phash[0] % ARRAY_SIZE(permut);
+		uint32_t algo = d_permut[norder][rnd];
+		uint32_t* buffers[4] = { d_blake, d_groes, d_jh512, d_skein };
+
+		if (rnd > 0) {
+			int algosrc = d_permut[norder][rnd - 1];
+			phash = buffers[algosrc] + offset;
+		}
+
+		// uint4 = 4x4 uint32_t = 16 bytes
+		uint4 *psrc = (uint4*) phash;
+		uint4 *pdst = (uint4*) (buffers[algo] + offset);
+		pdst[0] = psrc[0];
+		pdst[1] = psrc[1];
+		pdst[2] = psrc[2];
+		pdst[3] = psrc[3];
+	}
+}
+
+__host__
+void zr5_move_data_to_hash(int thr_id, uint32_t threads, int rnd)
+{
+	const uint32_t threadsperblock = 128;
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+
+	zr5_copy_round_data_gpu <<<grid, block>>> (threads, d_hash[thr_id], d_blake[thr_id], d_groes[thr_id], d_jh512[thr_id], d_skein[thr_id], rnd);
+}
+
+__global__
+void zr5_final_round_data_gpu(uint32_t threads, uint32_t* d_blake, uint32_t* d_groes, uint32_t* d_jh512, uint32_t* d_skein, uint32_t *d_hash, uint16_t *d_pokh)
+{
+	// after the 4 algos rounds, copy back hash to d_hash
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		const uint64_t offset = thread * 16; // 64 / 4;
+		uint32_t *phash = &d_hash[offset];
+		uint16_t norder = phash[0] % ARRAY_SIZE(permut);
+		uint16_t algosrc = d_permut[norder][3];
+
+		uint32_t* buffers[4] = { d_blake, d_groes, d_jh512, d_skein };
+
+		// copy only hash[0] + hash[6..7]
+		uint2 *psrc = (uint2*) (buffers[algosrc] + offset);
+		uint2 *pdst = (uint2*) phash;
+
+		pdst[0].x = psrc[0].x;
+		pdst[3] = psrc[3];
+
+		//phash[7] = *(buffers[algosrc] + offset + 7);
+	}
+}
+
+__host__
+void zr5_final_round(int thr_id, uint32_t threads)
+{
+	const uint32_t threadsperblock = 128;
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+
+	zr5_final_round_data_gpu <<<grid, block>>> (threads, d_blake[thr_id], d_groes[thr_id], d_jh512[thr_id], d_skein[thr_id], d_hash[thr_id], d_pokh[thr_id]);
+}
+
+extern void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads);
+extern void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen);
+
+extern void zr5_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
+extern void zr5_keccak512_cpu_hash_pok(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t* pdata, uint32_t *d_hash, uint16_t *d_poks);
+
+extern void quark_blake512_cpu_init(int thr_id, uint32_t threads);
+extern void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void quark_groestl512_cpu_init(int thr_id, uint32_t threads);
+extern void quark_groestl512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void quark_jh512_cpu_init(int thr_id, uint32_t threads);
+extern void quark_jh512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void quark_skein512_cpu_init(int thr_id, uint32_t threads);
+extern void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+static bool init[MAX_GPUS] = { 0 };
+
+extern "C" int scanhash_zr5(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
+	uint32_t max_nonce, unsigned long *hashes_done)
+{
+	uint32_t _ALIGN(64) tmpdata[20];
+	const uint32_t version = pdata[0] & (~POK_DATA_MASK);
+	const uint32_t first_nonce = pdata[19];
+	uint32_t throughput =  device_intensity(thr_id, __func__, 1U << 18);
+	throughput = min(throughput, (1U << 20)-1024);
+	throughput = min(throughput, max_nonce - first_nonce);
+
+	if (opt_benchmark)
+		((uint32_t*)ptarget)[7] = 0x0000ff;
+
+	memcpy(tmpdata, pdata, 80);
+
+	if (!init[thr_id])
+	{
+		cudaSetDevice(device_map[thr_id]);
+
+		// hash buffer = keccak hash 64 required
+		cudaMalloc(&d_hash[thr_id], 64 * throughput);
+		cudaMalloc(&d_pokh[thr_id], 2 * throughput);
+
+		cudaMemcpyToSymbol(d_permut, permut, 24*4, 0, cudaMemcpyHostToDevice);
+		cudaMallocHost(&h_poks[thr_id], 2 * throughput);
+
+		// data buffers for the 4 rounds
+		cudaMalloc(&d_blake[thr_id], 64 * throughput);
+		cudaMalloc(&d_groes[thr_id], 64 * throughput);
+		cudaMalloc(&d_jh512[thr_id], 64 * throughput);
+		cudaMalloc(&d_skein[thr_id], 64 * throughput);
+
+		jackpot_keccak512_cpu_init(thr_id, throughput);
+
+		quark_blake512_cpu_init(thr_id, throughput);
+		quark_groestl512_cpu_init(thr_id, throughput);
+		quark_jh512_cpu_init(thr_id, throughput);
+		quark_skein512_cpu_init(thr_id, throughput);
+
+		cuda_check_cpu_init(thr_id, throughput);
+
+		CUDA_SAFE_CALL(cudaDeviceSynchronize());
+
+		init[thr_id] = true;
+	}
+
+	tmpdata[0] = version;
+	jackpot_keccak512_cpu_setBlock((void*)tmpdata, 80);
+	cuda_check_cpu_setTarget(ptarget);
+
+	do {
+		int order = 0;
+
+		// Keccak512 Hash with CUDA
+		zr5_keccak512_cpu_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+
+		for (int rnd=0; rnd<4; rnd++) {
+			zr5_move_data_to_hash(thr_id, throughput, rnd);
+			quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_blake[thr_id], order++);
+			quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_groes[thr_id], order++);
+			quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_jh512[thr_id], order++);
+			quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_skein[thr_id], order++);
+		}
+
+		// This generates all pok prefixes
+		zr5_final_round(thr_id, throughput);
+
+		// Keccak512 pok
+		zr5_keccak512_cpu_hash_pok(thr_id, throughput, pdata[19], pdata, d_hash[thr_id], d_pokh[thr_id]);
+
+		for (int rnd=0; rnd<4; rnd++) {
+			zr5_move_data_to_hash(thr_id, throughput, rnd);
+			quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_blake[thr_id], order++);
+			quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_groes[thr_id], order++);
+			quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_jh512[thr_id], order++);
+			quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_skein[thr_id], order++);
+		}
+		zr5_final_round(thr_id, throughput);
+
+		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		if (foundNonce != UINT32_MAX)
+		{
+			uint32_t vhash64[8];
+			uint32_t oldp0 = pdata[0];
+			uint32_t oldp19 = pdata[19];
+			uint32_t offset = foundNonce - pdata[19];
+			uint32_t pok = 0;
+
+			*hashes_done = pdata[19] - first_nonce + throughput;
+
+			cudaMemcpy(h_poks[thr_id], d_pokh[thr_id], 2 * throughput, cudaMemcpyDeviceToHost);
+			pok = version | (0x10000UL * h_poks[thr_id][offset]);
+			pdata[0] = pok; pdata[19] = foundNonce;
+			zr5hash(vhash64, pdata);
+			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
+				int res = 1;
+				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				if (secNonce != 0) {
+					offset = secNonce - oldp19;
+					pok = version | (0x10000UL * h_poks[thr_id][offset]);
+					memcpy(tmpdata, pdata, 80);
+					tmpdata[0] = pok; tmpdata[19] = secNonce;
+					zr5hash(vhash64, tmpdata);
+					if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
+						pdata[21] = secNonce;
+						pdata[22] = pok;
+						res++;
+					}
+				}
+				return res;
+			} else {
+				applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce);
+				pdata[19]++;
+				pdata[0] = oldp0;
+			}
+		} else
+			pdata[19] += throughput;
+
+	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+
+	*hashes_done = pdata[19] - first_nonce + 1;
+	return 0;
+}