From 3e419abf84e24935d049b781b55072d2f4f548be Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Wed, 10 Jun 2015 18:59:22 +0200
Subject: [PATCH] qubit: implement cpu precalc (klaust)

improve qubit (+5%) deep and doom (+10%) hashrate

based on klausT code, simplified...
---
 qubit/deep.cu           |  29 +-
 qubit/doom.cu           |  40 +--
 qubit/qubit_luffa512.cu | 571 ++++++++++++++++++++--------------------
 3 files changed, 318 insertions(+), 322 deletions(-)

diff --git a/qubit/deep.cu b/qubit/deep.cu
index ce6720d..4041f40 100644
--- a/qubit/deep.cu
+++ b/qubit/deep.cu
@@ -19,8 +19,6 @@ static uint32_t *d_hash[MAX_GPUS];
 extern void qubit_luffa512_cpu_init(int thr_id, uint32_t threads);
 extern void qubit_luffa512_cpu_setBlock_80(void *pdata);
 extern void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
-extern void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget);
-extern uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
 
 extern void x11_cubehash512_cpu_init(int thr_id, uint32_t threads);
 extern void x11_cubehash512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -30,13 +28,13 @@ extern void x11_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t start
 
 extern "C" void deephash(void *state, const void *input)
 {
-	// luffa1-cubehash2-shavite3-simd4-echo5
+	uint8_t _ALIGN(64) hash[64];
+
+	// luffa-80 cubehash-64 echo-64
 	sph_luffa512_context ctx_luffa;
 	sph_cubehash512_context ctx_cubehash;
 	sph_echo512_context ctx_echo;
 
-	uint8_t hash[64];
-
 	sph_luffa512_init(&ctx_luffa);
 	sph_luffa512 (&ctx_luffa, input, 80);
 	sph_luffa512_close(&ctx_luffa, (void*) hash);
@@ -54,14 +52,13 @@ extern "C" void deephash(void *state, const void *input)
 
 static bool init[MAX_GPUS] = { 0 };
 
-extern "C" int scanhash_deep(int thr_id, uint32_t *pdata,
-	const uint32_t *ptarget, uint32_t max_nonce,
-	unsigned long *hashes_done)
+extern "C" int scanhash_deep(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
+	uint32_t max_nonce, unsigned long *hashes_done)
 {
+	uint32_t _ALIGN(64) endiandata[20];
 	const uint32_t first_nonce = pdata[19];
-	uint32_t endiandata[20];
 	uint32_t throughput =  device_intensity(thr_id, __func__, 1U << 19); // 256*256*8
-	throughput = min(throughput,  (max_nonce - first_nonce));
+	throughput = min(throughput, (max_nonce - first_nonce));
 
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000f;
@@ -69,7 +66,8 @@ extern "C" int scanhash_deep(int thr_id, uint32_t *pdata,
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
-		cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
+
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], throughput * 64));
 
 		qubit_luffa512_cpu_init(thr_id, throughput);
 		x11_cubehash512_cpu_init(thr_id, throughput);
@@ -80,10 +78,10 @@ extern "C" int scanhash_deep(int thr_id, uint32_t *pdata,
 		init[thr_id] = true;
 	}
 
-	for (int k=0; k < 20; k++)
+	for (int k=0; k < 19; k++)
 		be32enc(&endiandata[k], pdata[k]);
 
-	qubit_luffa512_cpufinal_setBlock_80((void*)endiandata,ptarget);
+	qubit_luffa512_cpu_setBlock_80((void*)endiandata);
 	cuda_check_cpu_setTarget(ptarget);
 
 	do {
@@ -96,12 +94,11 @@ extern "C" int scanhash_deep(int thr_id, uint32_t *pdata,
 		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != UINT32_MAX)
 		{
-			const uint32_t Htarg = ptarget[7];
-			uint32_t vhash64[8];
+			uint32_t _ALIGN(64) vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
 			deephash(vhash64, endiandata);
 
-			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
+			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
 				int res = 1;
 				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
 				*hashes_done = pdata[19] - first_nonce + throughput;
diff --git a/qubit/doom.cu b/qubit/doom.cu
index 1a8aa4e..73c8622 100644
--- a/qubit/doom.cu
+++ b/qubit/doom.cu
@@ -15,15 +15,12 @@ static uint32_t *d_hash[MAX_GPUS];
 extern void qubit_luffa512_cpu_init(int thr_id, uint32_t threads);
 extern void qubit_luffa512_cpu_setBlock_80(void *pdata);
 extern void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
-extern void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget);
-extern uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
 
-extern void doomhash(void *state, const void *input)
+extern "C" void doomhash(void *state, const void *input)
 {
-	// luffa512
-	sph_luffa512_context ctx_luffa;
+	uint8_t _ALIGN(64) hash[64];
 
-	uint8_t hash[64];
+	sph_luffa512_context ctx_luffa;
 
 	sph_luffa512_init(&ctx_luffa);
 	sph_luffa512 (&ctx_luffa, input, 80);
@@ -34,12 +31,11 @@ extern void doomhash(void *state, const void *input)
 
 static bool init[MAX_GPUS] = { 0 };
 
-extern "C" int scanhash_doom(int thr_id, uint32_t *pdata,
-	const uint32_t *ptarget, uint32_t max_nonce,
-	unsigned long *hashes_done)
+extern "C" int scanhash_doom(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
+	uint32_t max_nonce, unsigned long *hashes_done)
 {
+	uint32_t _ALIGN(64) endiandata[20];
 	const uint32_t first_nonce = pdata[19];
-	uint32_t endiandata[20];
 	uint32_t throughput = device_intensity(thr_id, __func__, 1U << 22); // 256*256*8*8
 	throughput = min(throughput, max_nonce - first_nonce);
 
@@ -50,31 +46,35 @@ extern "C" int scanhash_doom(int thr_id, uint32_t *pdata,
 	{
 		cudaSetDevice(device_map[thr_id]);
 
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput));
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], throughput * 64));
 
-		qubit_luffa512_cpu_init(thr_id, (int) throughput);
+		qubit_luffa512_cpu_init(thr_id, throughput);
+		cuda_check_cpu_init(thr_id, throughput);
 
 		init[thr_id] = true;
 	}
 
-	for (int k=0; k < 20; k++)
+	for (int k=0; k < 19; k++)
 		be32enc(&endiandata[k], pdata[k]);
 
-	qubit_luffa512_cpufinal_setBlock_80((void*)endiandata,ptarget);
+	qubit_luffa512_cpu_setBlock_80((void*)endiandata);
+	cuda_check_cpu_setTarget(ptarget);
 
 	do {
 		int order = 0;
+		*hashes_done = pdata[19] - first_nonce + throughput;
+
+		qubit_luffa512_cpu_hash_80(thr_id, (int) throughput, pdata[19], d_hash[thr_id], order++);
 
-		uint32_t foundNonce = qubit_luffa512_cpu_finalhash_80(thr_id, (int) throughput, pdata[19], d_hash[thr_id], order++);
+		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (foundNonce != UINT32_MAX)
 		{
-			const uint32_t Htarg = ptarget[7];
-			uint32_t vhash64[8];
+			uint32_t _ALIGN(64) vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
 			doomhash(vhash64, endiandata);
 
-			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
-				*hashes_done = min(max_nonce - first_nonce, (uint64_t) pdata[19] - first_nonce + throughput);
+			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
+				//*hashes_done = min(max_nonce - first_nonce, (uint64_t) pdata[19] - first_nonce + throughput);
 				pdata[19] = foundNonce;
 				return 1;
 			} else {
@@ -82,7 +82,7 @@ extern "C" int scanhash_doom(int thr_id, uint32_t *pdata,
 			}
 		}
 
-		if ((uint64_t) pdata[19] + throughput > max_nonce) {
+		if ((uint64_t) throughput + pdata[19] > max_nonce) {
 			// pdata[19] = max_nonce;
 			break;
 		}
diff --git a/qubit/qubit_luffa512.cu b/qubit/qubit_luffa512.cu
index ec40564..d4fea4c 100644
--- a/qubit/qubit_luffa512.cu
+++ b/qubit/qubit_luffa512.cu
@@ -1,49 +1,18 @@
-/*
- * luffa_for_32.c
- * Version 2.0 (Sep 15th 2009)
- *
- * Copyright (C) 2008-2009 Hitachi, Ltd. All rights reserved.
- *
- * Hitachi, Ltd. is the owner of this software and hereby grant
- * the U.S. Government and any interested party the right to use
- * this software for the purposes of the SHA-3 evaluation process,
- * notwithstanding that this software is copyrighted.
- *
- * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
- * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
- * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
- * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+/*******************************************************************************
+ * luffa512 for 80-bytes input (with midstate precalc by klausT)
  */
+
 #include <stdio.h>
 #include <stdint.h>
 #include <memory.h>
 
 #include "cuda_helper.h"
 
-typedef unsigned char BitSequence;
-
-__constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
-__constant__ uint32_t c_Target[8];
-
-static uint32_t *h_resNounce[MAX_GPUS];
-static uint32_t *d_resNounce[MAX_GPUS];
-
-#define NBN 1 /* max results, could be 2, see blake32.cu */
-#if NBN > 1
-static uint32_t extra_results[2] = { UINT32_MAX, UINT32_MAX };
-#endif
+static __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
+static __constant__ uint32_t statebufferpre[8];
+static __constant__ uint32_t statechainvpre[40];
 
-typedef struct {
-	uint32_t buffer[8]; /* Buffer to be hashed */
-	uint32_t chainv[40];   /* Chaining values */
-} hashState;
-
-#define BYTES_SWAP32(x) cuda_swab32(x)
-
-#define MULT2(a,j)\
+#define MULT2(a,j) {\
 	tmp = a[7+(8*j)];\
 	a[7+(8*j)] = a[6+(8*j)];\
 	a[6+(8*j)] = a[5+(8*j)];\
@@ -52,22 +21,25 @@ typedef struct {
 	a[3+(8*j)] = a[2+(8*j)] ^ tmp;\
 	a[2+(8*j)] = a[1+(8*j)];\
 	a[1+(8*j)] = a[0+(8*j)] ^ tmp;\
-	a[0+(8*j)] = tmp;
+	a[0+(8*j)] = tmp;\
+}
 
-#define TWEAK(a0,a1,a2,a3,j)\
+#define TWEAK(a0,a1,a2,a3,j) { \
 	a0 = (a0<<(j))|(a0>>(32-j));\
 	a1 = (a1<<(j))|(a1>>(32-j));\
 	a2 = (a2<<(j))|(a2>>(32-j));\
-	a3 = (a3<<(j))|(a3>>(32-j));
+	a3 = (a3<<(j))|(a3>>(32-j));\
+}
 
-#define STEP(c0,c1)\
+#define STEP(c0,c1) { \
 	SUBCRUMB(chainv[0],chainv[1],chainv[2],chainv[3],tmp);\
 	SUBCRUMB(chainv[5],chainv[6],chainv[7],chainv[4],tmp);\
 	MIXWORD(chainv[0],chainv[4]);\
 	MIXWORD(chainv[1],chainv[5]);\
 	MIXWORD(chainv[2],chainv[6]);\
 	MIXWORD(chainv[3],chainv[7]);\
-	ADD_CONSTANT(chainv[0],chainv[4],c0,c1);
+	ADD_CONSTANT(chainv[0],chainv[4],c0,c1);\
+}
 
 #define SUBCRUMB(a0,a1,a2,a3,a4)\
 	a4  = a0;\
@@ -104,7 +76,7 @@ typedef struct {
 
 /* initial values of chaining variables */
 __constant__ uint32_t c_IV[40];
-const uint32_t h2_IV[40] = {
+static const uint32_t h_IV[40] = {
 	0x6d251e69,0x44b051e0,0x4eaa6fb4,0xdbf78465,
 	0x6e292011,0x90152df4,0xee058139,0xdef610bb,
 	0xc3b44b95,0xd9d2f256,0x70eee9a0,0xde099fa3,
@@ -117,7 +89,7 @@ const uint32_t h2_IV[40] = {
 	0xf5df3999,0x0fc688f1,0xb07224cc,0x03e86cea};
 
 __constant__ uint32_t c_CNS[80];
-uint32_t h2_CNS[80] = {
+static const uint32_t h_CNS[80] = {
 	0x303994a6,0xe0337818,0xc0e65299,0x441ba90d,
 	0x6cc33a12,0x7f34d442,0xdc56983e,0x9389217f,
 	0x1e00108f,0xe5a8bce6,0x7800423d,0x5274baf4,
@@ -142,213 +114,305 @@ uint32_t h2_CNS[80] = {
 
 /***************************************************/
 __device__ __forceinline__
-void rnd512(hashState *state)
+void rnd512(uint32_t *statebuffer, uint32_t *statechainv)
 {
 	int i,j;
 	uint32_t t[40];
 	uint32_t chainv[8];
 	uint32_t tmp;
 
-#pragma unroll 8
-	for(i=0;i<8;i++) {
+	#pragma unroll 8
+	for(i=0; i<8; i++) {
 		t[i]=0;
-#pragma unroll 5
-		for(j=0;j<5;j++) {
-			t[i] ^= state->chainv[i+8*j];
-		}
+		#pragma unroll 5
+		for(j=0; j<5; j++)
+			t[i] ^= statechainv[i+8*j];
 	}
 
 	MULT2(t, 0);
 
-#pragma unroll 5
-	for(j=0;j<5;j++) {
-#pragma unroll 8
-		for(i=0;i<8;i++) {
-			state->chainv[i+8*j] ^= t[i];
-		}
+	#pragma unroll 5
+	for(j=0; j<5; j++) {
+		#pragma unroll 8
+		for(i=0; i<8; i++)
+			statechainv[i+8*j] ^= t[i];
 	}
 
-#pragma unroll 5
-	for(j=0;j<5;j++) {
-#pragma unroll 8
-		for(i=0;i<8;i++) {
-			t[i+8*j] = state->chainv[i+8*j];
-		}
+	#pragma unroll 5
+	for(j=0; j<5; j++) {
+		#pragma unroll 8
+		for(i=0; i<8; i++)
+			t[i+8*j] = statechainv[i+8*j];
 	}
 
-#pragma unroll 5
-	for(j=0;j<5;j++) {
-		MULT2(state->chainv, j);
-	}
+	#pragma unroll
+	for(j=0; j<5; j++)
+		MULT2(statechainv, j);
 
-#pragma unroll 5
-	for(j=0;j<5;j++) {
-#pragma unroll 8
-		for(i=0;i<8;i++) {
-			state->chainv[8*j+i] ^= t[8*((j+1)%5)+i];
-		}
+	#pragma unroll 5
+	for(j=0; j<5; j++) {
+		#pragma unroll 8
+		for(i=0; i<8; i++)
+			statechainv[8*j+i] ^= t[8*((j+1)%5)+i];
 	}
 
-#pragma unroll 5
-	for(j=0;j<5;j++) {
-#pragma unroll 8
-		for(i=0;i<8;i++) {
-			t[i+8*j] = state->chainv[i+8*j];
-		}
+	#pragma unroll 5
+	for(j=0; j<5; j++) {
+		#pragma unroll 8
+		for(i=0; i<8; i++)
+			t[i+8*j] = statechainv[i+8*j];
 	}
 
-#pragma unroll 5
-	for(j=0;j<5;j++) {
-		MULT2(state->chainv, j);
-	}
+	#pragma unroll
+	for(j=0; j<5; j++)
+		MULT2(statechainv, j);
 
-#pragma unroll 5
-	for(j=0;j<5;j++) {
-#pragma unroll 8
-		for(i=0;i<8;i++) {
-			state->chainv[8*j+i] ^= t[8*((j+4)%5)+i];
-		}
+	#pragma unroll 5
+	for(j=0; j<5; j++) {
+		#pragma unroll 8
+		for(i=0; i<8; i++)
+			statechainv[8*j+i] ^= t[8*((j+4)%5)+i];
 	}
 
-#pragma unroll 5
-	for(j=0;j<5;j++) {
-#pragma unroll 8
-		for(i=0;i<8;i++) {
-			state->chainv[i+8*j] ^= state->buffer[i];
-		}
-		MULT2(state->buffer, 0);
+	#pragma unroll 5
+	for(j=0; j<5; j++) {
+		#pragma unroll 8
+		for(i=0; i<8; i++)
+			statechainv[i+8*j] ^= statebuffer[i];
+		MULT2(statebuffer, 0);
 	}
 
-#pragma unroll 8
-	for(i=0;i<8;i++) {
-		chainv[i] = state->chainv[i];
-	}
+	#pragma unroll
+	for(i=0; i<8; i++)
+		chainv[i] = statechainv[i];
 
-#pragma unroll 8
-	for(i=0;i<8;i++) {
-		STEP(c_CNS[(2*i)],c_CNS[(2*i)+1]);
-	}
+	#pragma unroll
+	for(i=0; i<8; i++)
+		STEP(c_CNS[(2*i)], c_CNS[(2*i)+1]);
 
-#pragma unroll 8
-	for(i=0;i<8;i++) {
-		state->chainv[i] = chainv[i];
-		chainv[i] = state->chainv[i+8];
+	#pragma unroll
+	for(i=0; i<8; i++) {
+		statechainv[i] = chainv[i];
+		chainv[i] = statechainv[i+8];
 	}
 
 	TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],1);
 
-#pragma unroll 8
-	for(i=0;i<8;i++) {
-		STEP(c_CNS[(2*i)+16],c_CNS[(2*i)+16+1]);
-	}
+	#pragma unroll
+	for(i=0; i<8; i++)
+		STEP(c_CNS[(2*i)+16], c_CNS[(2*i)+16+1]);
 
-#pragma unroll 8
-	for(i=0;i<8;i++) {
-		state->chainv[i+8] = chainv[i];
-		chainv[i] = state->chainv[i+16];
+	#pragma unroll
+	for(i=0; i<8; i++) {
+		statechainv[i+8] = chainv[i];
+		chainv[i] = statechainv[i+16];
 	}
 
 	TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],2);
 
-#pragma unroll 8
-	for(i=0;i<8;i++) {
+	#pragma unroll
+	for(i=0; i<8; i++)
 		STEP(c_CNS[(2*i)+32],c_CNS[(2*i)+32+1]);
-	}
 
-#pragma unroll 8
-	for(i=0;i<8;i++) {
-		state->chainv[i+16] = chainv[i];
-		chainv[i] = state->chainv[i+24];
+	#pragma unroll
+	for(i=0; i<8; i++) {
+		statechainv[i+16] = chainv[i];
+		chainv[i] = statechainv[i+24];
 	}
 
 	TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],3);
 
-#pragma unroll 8
-	for(i=0;i<8;i++) {
+	#pragma unroll
+	for(i=0; i<8; i++)
 		STEP(c_CNS[(2*i)+48],c_CNS[(2*i)+48+1]);
-	}
 
-#pragma unroll 8
-	for(i=0;i<8;i++) {
-		state->chainv[i+24] = chainv[i];
-		chainv[i] = state->chainv[i+32];
+	#pragma unroll
+	for(i=0; i<8; i++) {
+		statechainv[i+24] = chainv[i];
+		chainv[i] = statechainv[i+32];
 	}
 
 	TWEAK(chainv[4],chainv[5],chainv[6],chainv[7],4);
 
-#pragma unroll 8
-	for(i=0;i<8;i++) {
+	#pragma unroll
+	for(i=0; i<8; i++)
 		STEP(c_CNS[(2*i)+64],c_CNS[(2*i)+64+1]);
+
+	#pragma unroll 8
+	for(i=0; i<8; i++)
+		statechainv[i+32] = chainv[i];
+}
+
+static void rnd512_cpu(uint32_t *statebuffer, uint32_t *statechainv)
+{
+	int i, j;
+	uint32_t t[40];
+	uint32_t chainv[8];
+	uint32_t tmp;
+
+	for (i = 0; i<8; i++) {
+		t[i] = statechainv[i];
+		for (j = 1; j<5; j++)
+			t[i] ^= statechainv[i + 8 * j];
 	}
 
-#pragma unroll 8
-	for(i=0;i<8;i++) {
-		state->chainv[i+32] = chainv[i];
+	MULT2(t, 0);
+
+	for (j = 0; j<5; j++) {
+		for (i = 0; i<8; i++)
+			statechainv[i + 8 * j] ^= t[i];
+	}
+
+	for (j = 0; j<5; j++) {
+		for (i = 0; i<8; i++)
+			t[i + 8 * j] = statechainv[i + 8 * j];
 	}
-}
 
+	for (j = 0; j<5; j++)
+		MULT2(statechainv, j);
+
+	for (j = 0; j<5; j++) {
+		for (i = 0; i<8; i++)
+			statechainv[8 * j + i] ^= t[8 * ((j + 1) % 5) + i];
+	}
+
+	for (j = 0; j<5; j++) {
+		for (i = 0; i<8; i++)
+			t[i + 8 * j] = statechainv[i + 8 * j];
+	}
+
+	for (j = 0; j<5; j++)
+		MULT2(statechainv, j);
+
+	for (j = 0; j<5; j++) {
+		for (i = 0; i<8; i++)
+			statechainv[8 * j + i] ^= t[8 * ((j + 4) % 5) + i];
+	}
 
+	for (j = 0; j<5; j++) {
+		for (i = 0; i<8; i++)
+			statechainv[i + 8 * j] ^= statebuffer[i];
+		MULT2(statebuffer, 0);
+	}
+
+	for (i = 0; i<8; i++)
+		chainv[i] = statechainv[i];
+
+	for (i = 0; i<8; i++)
+		STEP(h_CNS[(2 * i)], h_CNS[(2 * i) + 1]);
+
+	for (i = 0; i<8; i++) {
+		statechainv[i] = chainv[i];
+		chainv[i] = statechainv[i + 8];
+	}
+
+	TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 1);
+
+	for (i = 0; i<8; i++)
+		STEP(h_CNS[(2 * i) + 16], h_CNS[(2 * i) + 16 + 1]);
+
+	for (i = 0; i<8; i++) {
+		statechainv[i + 8] = chainv[i];
+		chainv[i] = statechainv[i + 16];
+	}
+
+	TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 2);
+
+	for (i = 0; i<8; i++)
+		STEP(h_CNS[(2 * i) + 32], h_CNS[(2 * i) + 32 + 1]);
+
+	for (i = 0; i<8; i++) {
+		statechainv[i + 16] = chainv[i];
+		chainv[i] = statechainv[i + 24];
+	}
+
+	TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 3);
+
+	for (i = 0; i<8; i++)
+		STEP(h_CNS[(2 * i) + 48], h_CNS[(2 * i) + 48 + 1]);
+
+	for (i = 0; i<8; i++) {
+		statechainv[i + 24] = chainv[i];
+		chainv[i] = statechainv[i + 32];
+	}
+
+	TWEAK(chainv[4], chainv[5], chainv[6], chainv[7], 4);
+
+	for (i = 0; i<8; i++)
+		STEP(h_CNS[(2 * i) + 64], h_CNS[(2 * i) + 64 + 1]);
+
+	for (i = 0; i<8; i++)
+		statechainv[i + 32] = chainv[i];
+}
+
+/***************************************************/
 __device__ __forceinline__
-void Update512(hashState *state, const BitSequence *data)
+void Update512(uint32_t* statebuffer, uint32_t *statechainv, const uint32_t *const __restrict__ data)
 {
-#pragma unroll 8
-	for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)data)[i]);
-	rnd512(state);
-
-#pragma unroll 8
-	for(int i=0;i<8;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)(data+32))[i]);
-	rnd512(state);
-#pragma unroll 4
-	for(int i=0;i<4;i++) state->buffer[i] = BYTES_SWAP32(((uint32_t*)(data+64))[i]);
+	#pragma unroll
+	for (int i = 0; i<8; i++)
+		statebuffer[i] = cuda_swab32((data[i]));
+	rnd512(statebuffer, statechainv);
+
+	#pragma unroll
+	for(int i=0; i<8; i++)
+		statebuffer[i] = cuda_swab32((data[i+8]));
+	rnd512(statebuffer, statechainv);
+
+	#pragma unroll
+	for(int i=0; i<4; i++)
+		statebuffer[i] = cuda_swab32((data[i+16]));
 }
 
 
 /***************************************************/
 __device__ __forceinline__
-void finalization512(hashState *state, uint32_t *b)
+void finalization512(uint32_t* statebuffer, uint32_t *statechainv, uint32_t *b)
 {
 	int i,j;
 
-	state->buffer[4] = 0x80000000;
-#pragma unroll 3
-	for(int i=5;i<8;i++) state->buffer[i] = 0;
-	rnd512(state);
+	statebuffer[4] = 0x80000000U;
+
+	#pragma unroll 3
+	for(int i=5; i<8; i++)
+		statebuffer[i] = 0;
+	rnd512(statebuffer, statechainv);
 
 	/*---- blank round with m=0 ----*/
-#pragma unroll 8
-	for(i=0;i<8;i++) state->buffer[i] =0;
-	rnd512(state);
+	#pragma unroll
+	for(i=0; i<8; i++)
+		statebuffer[i] =0;
+	rnd512(statebuffer, statechainv);
 
-#pragma unroll 8
-	for(i=0;i<8;i++) {
+	#pragma unroll
+	for(i=0; i<8; i++) {
 		b[i] = 0;
-#pragma unroll 5
-		for(j=0;j<5;j++) {
-			b[i] ^= state->chainv[i+8*j];
-		}
-		b[i] = BYTES_SWAP32((b[i]));
+		#pragma unroll 5
+		for(j=0; j<5; j++)
+			b[i] ^= statechainv[i+8*j];
+		b[i] = cuda_swab32((b[i]));
 	}
 
-#pragma unroll 8
-	for(i=0;i<8;i++) state->buffer[i]=0;
-	rnd512(state);
+	#pragma unroll
+	for(i=0; i<8; i++)
+		statebuffer[i]=0;
+	rnd512(statebuffer, statechainv);
 
-#pragma unroll 8
-	for(i=0;i<8;i++) {
+	#pragma unroll
+	for(i=0; i<8; i++)
+	{
 		b[8+i] = 0;
-#pragma unroll 5
-		for(j=0;j<5;j++) {
-			b[8+i] ^= state->chainv[i+8*j];
-		}
-		b[8+i] = BYTES_SWAP32((b[8+i]));
+		#pragma unroll 5
+		for(j=0; j<5; j++)
+			b[8+i] ^= statechainv[i+8*j];
+		b[8+i] = cuda_swab32((b[8+i]));
 	}
 }
 
 
 /***************************************************/
-// Die Hash-Funktion
 __global__
-void qubit_luffa512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash)
+void qubit_luffa512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *outputHash)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@@ -359,118 +423,37 @@ void qubit_luffa512_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *ou
 		uint32_t buf32[32];
 		} buff;
 
-#pragma unroll 16
-		for (int i=0; i < 16; ++i) buff.buf64[i] = c_PaddedMessage80[i];
+		#pragma unroll 8
+		for (int i=8; i < 16; i++)
+			buff.buf64[i] = c_PaddedMessage80[i];
 
 		// die Nounce durch die thread-spezifische ersetzen
 		buff.buf64[9] = REPLACE_HIDWORD(buff.buf64[9], cuda_swab32(nounce));
 
+		uint32_t statebuffer[8], statechainv[40];
 
-		hashState state;
-#pragma unroll 40
-		for(int i=0;i<40;i++) state.chainv[i] = c_IV[i];
-#pragma unroll 8
-		for(int i=0;i<8;i++) state.buffer[i] = 0;
-		Update512(&state, (BitSequence*)buff.buf32);
-		uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
-		finalization512(&state, (uint32_t*)outHash);
-	}
-}
-
-__global__
-void qubit_luffa512_gpu_finalhash_80(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce)
-{
-	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	if (thread < threads)
-	{
-		uint32_t nounce = startNounce + thread;
-		union {
-			uint64_t buf64[16];
-			uint32_t buf32[32];
-		} buff;
-		uint32_t Hash[16];
-
-		#pragma unroll 16
-		for (int i=0; i < 16; ++i) buff.buf64[i] = c_PaddedMessage80[i];
-
-		// Tested nonce
-		buff.buf64[9] = REPLACE_HIDWORD(buff.buf64[9], cuda_swab32(nounce));
-
-		hashState state;
-		#pragma unroll 40
-		for(int i=0;i<40;i++) state.chainv[i] = c_IV[i];
+		#pragma unroll
+		for (int i = 0; i<4; i++)
+			statebuffer[i] = cuda_swab32(buff.buf32[i + 16]);
 
-		#pragma unroll 8
-		for(int i=0;i<8;i++) state.buffer[i] = 0;
+		#pragma unroll 4
+		for (int i = 4; i<8; i++)
+			statebuffer[i] = statebufferpre[i];
 
-		Update512(&state, (BitSequence*)buff.buf32);
-		finalization512(&state, Hash);
+		#pragma unroll
+		for (int i = 0; i<40; i++)
+			statechainv[i] = statechainvpre[i];
 
-		/* dont ask me why not a simple if (Hash[i] > c_Target[i]) return;
-		 * we lose 20% in perfs without the position test */
-		int position = -1;
-		#pragma unroll 8
-		for (int i = 7; i >= 0; i--) {
-			if (Hash[i] > c_Target[i]) {
-				if (position < i) {
-					return;
-				}
-			}
-			if (Hash[i] < c_Target[i]) {
-				if (position < i) {
-					position = i;
-					//break; /* impact perfs, unroll ? */
-				}
-			}
-		}
-
-#if NBN == 1
-		if (resNounce[0] > nounce) {
-			resNounce[0] = nounce;
-		}
-#else
-		/* keep the smallest nounce, + extra one if found */
-		if (resNounce[0] > nounce) {
-			resNounce[1] = resNounce[0];
-			resNounce[0] = nounce;
-		} else {
-			resNounce[1] = nounce;
-		}
-#endif
+		uint32_t *outHash = &outputHash[thread * 16];
+		finalization512(statebuffer, statechainv, outHash);
 	}
 }
 
 __host__
 void qubit_luffa512_cpu_init(int thr_id, uint32_t threads)
 {
-	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_IV, h2_IV, sizeof(h2_IV), 0, cudaMemcpyHostToDevice));
-	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_CNS, h2_CNS, sizeof(h2_CNS), 0, cudaMemcpyHostToDevice));
-	CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], NBN * sizeof(uint32_t)));
-	CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], NBN * sizeof(uint32_t)));
-}
-
-__host__
-uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash,int order)
-{
-	uint32_t result = UINT32_MAX;
-	cudaMemset(d_resNounce[thr_id], 0xff, NBN * sizeof(uint32_t));
-	const uint32_t threadsperblock = 256;
-
-	dim3 grid((threads + threadsperblock-1)/threadsperblock);
-	dim3 block(threadsperblock);
-
-	size_t shared_size = 0;
-
-	qubit_luffa512_gpu_finalhash_80 <<<grid, block, shared_size>>> (threads, startNounce, d_outputHash, d_resNounce[thr_id]);
-	cudaThreadSynchronize();
-	if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], NBN * sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
-		//cudaThreadSynchronize();
-		result = h_resNounce[thr_id][0];
-#if NBN > 1
-		extra_results[0] = h_resNounce[thr_id][1];
-#endif
-	}
-	return result;
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_IV, h_IV, sizeof(h_IV), 0, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_CNS, h_CNS, sizeof(h_CNS), 0, cudaMemcpyHostToDevice));
 }
 
 __host__
@@ -486,22 +469,38 @@ void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNoun
 }
 
 __host__
-void qubit_luffa512_cpu_setBlock_80(void *pdata)
+void qubit_cpu_precalc(uint32_t* message)
 {
-	unsigned char PaddedMessage[128];
-
-	memcpy(PaddedMessage, pdata, 80);
-	memset(PaddedMessage+80, 0, 48);
-	PaddedMessage[80] = 0x80;
-	PaddedMessage[111] = 1;
-	PaddedMessage[126] = 0x02;
-	PaddedMessage[127] = 0x80;
-
-	CUDA_SAFE_CALL(cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
+	uint32_t statebuffer[8];
+	uint32_t statechainv[40] =
+	{
+		0x6d251e69, 0x44b051e0, 0x4eaa6fb4, 0xdbf78465,
+		0x6e292011, 0x90152df4, 0xee058139, 0xdef610bb,
+		0xc3b44b95, 0xd9d2f256, 0x70eee9a0, 0xde099fa3,
+		0x5d9b0557, 0x8fc944b3, 0xcf1ccf0e, 0x746cd581,
+		0xf7efc89d, 0x5dba5781, 0x04016ce5, 0xad659c05,
+		0x0306194f, 0x666d1836, 0x24aa230a, 0x8b264ae7,
+		0x858075d5, 0x36d79cce, 0xe571f7d7, 0x204b1f67,
+		0x35870c6a, 0x57e9e923, 0x14bcb808, 0x7cde72ce,
+		0x6c68e9be, 0x5ec41e22, 0xc825b7c7, 0xaffb4363,
+		0xf5df3999, 0x0fc688f1, 0xb07224cc, 0x03e86cea
+	};
+
+	for (int i = 0; i<8; i++)
+		statebuffer[i] = cuda_swab32(message[i]);
+	rnd512_cpu(statebuffer, statechainv);
+
+	for (int i = 0; i<8; i++)
+		statebuffer[i] = cuda_swab32(message[i+8]);
+
+	rnd512_cpu(statebuffer, statechainv);
+
+	cudaMemcpyToSymbol(statebufferpre, statebuffer, sizeof(statebuffer), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(statechainvpre, statechainv, sizeof(statechainv), 0, cudaMemcpyHostToDevice);
 }
 
 __host__
-void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget)
+void qubit_luffa512_cpu_setBlock_80(void *pdata)
 {
 	unsigned char PaddedMessage[128];
 
@@ -512,6 +511,6 @@ void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget)
 	PaddedMessage[126] = 0x02;
 	PaddedMessage[127] = 0x80;
 
-	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_Target, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
-	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, sizeof(PaddedMessage), 0, cudaMemcpyHostToDevice));
+	qubit_cpu_precalc((uint32_t*) PaddedMessage);
 }