Add fresh algo (based on djm34 code)

Cleaned up and adapted to my changes (cputest added). Removed Makefile.in, which should be in .gitignore (please regenerate it with ./config.sh to compile on Linux).
10 years ago · bc2eb75758
11 changed files with 301 additions and 1514 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,5 +1,5 @@
				@@ -1,5 +1,5 @@

-minerd
+ccminer
 *.o

 autom4te.cache
@ -26,6 +26,9 @@ config.sub
				@@ -26,6 +26,9 @@ config.sub
 mingw32-config.cache

 */.dirstamp
+.DS_Store
+Desktop.ini
+Thumbs.db

 *.iml

@ -33,5 +36,14 @@ Debug/
				@@ -33,5 +36,14 @@ Debug/
 Release/
 x64/Debug/
 x64/Release/
-ccminer.*.suo
+*.suo
+*.user
+
+.settings/
+.project
+.metadata
+.classpath
+.loadpath
+.cproject
+.buildpath

--- a/Makefile.am
+++ b/Makefile.am
@ -37,9 +37,9 @@ ccminer_SOURCES		= elist.h miner.h compat.h \
				@@ -37,9 +37,9 @@ ccminer_SOURCES		= elist.h miner.h compat.h \
 			  sph/cubehash.c sph/echo.c sph/luffa.c sph/shavite.c sph/simd.c \
 			  sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \
 			  sph/shabal.c sph/whirlpool.c \
-			  x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \
-			  x11/x11.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
+			  x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
 			  x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \
+			  x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \
 			  x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu

 ccminer_LDFLAGS		= $(PTHREAD_FLAGS) @CUDA_LDFLAGS@
--- a/Makefile.in
+++ b/Makefile.in
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -503,6 +503,12 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
				@@ -503,6 +503,12 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(AdditionalOptions)</AdditionalOptions>
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(AdditionalOptions)</AdditionalOptions>
    </CudaCompile>
+    <CudaCompile Include="x11\fresh.cu">
+      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">--ptxas-options=-O3 %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">--ptxas-options=-O3 %(AdditionalOptions)</AdditionalOptions>
+      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(AdditionalOptions)</AdditionalOptions>
+    </CudaCompile>
    <CudaCompile Include="x11\simd_functions.cu">
      <ExcludedFromBuild>true</ExcludedFromBuild>
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalOptions)</AdditionalOptions>
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@ -361,6 +361,9 @@
				@@ -361,6 +361,9 @@
    <CudaCompile Include="x11\cuda_x11_simd512.cu">
      <Filter>Source Files\CUDA\x11</Filter>
    </CudaCompile>
+    <CudaCompile Include="x11\fresh.cu">
+      <Filter>Source Files\CUDA\x11</Filter>
+    </CudaCompile>
    <CudaCompile Include="x11\x11.cu">
      <Filter>Source Files\CUDA\x11</Filter>
    </CudaCompile>
--- a/cpu-miner.c
+++ b/cpu-miner.c
@ -132,6 +132,7 @@ typedef enum {
				@@ -132,6 +132,7 @@ typedef enum {
 	ALGO_JACKPOT,
 	ALGO_QUARK,
 	ALGO_ANIME,
+	ALGO_FRESH,
 	ALGO_NIST5,
 	ALGO_X11,
 	ALGO_X13,
@ -149,6 +150,7 @@ static const char *algo_names[] = {
				@@ -149,6 +150,7 @@ static const char *algo_names[] = {
 	"jackpot",
 	"quark",
 	"anime",
+	"fresh",
 	"nist5",
 	"x11",
 	"x13",
@ -225,6 +227,7 @@ Options:\n\
				@@ -225,6 +227,7 @@ Options:\n\
                        jackpot   Jackpot hash\n\
                        quark     Quark hash\n\
                        anime     Animecoin hash\n\
+                        fresh     Freshcoin hash (shavite 80)\n\
                        nist5     NIST5 (TalkCoin) hash\n\
                        x11       X11 (DarkCoin) hash\n\
                        x13       X13 (MaruCoin) hash\n\
@ -782,7 +785,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
				@@ -782,7 +785,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)

 	if (opt_algo == ALGO_JACKPOT)
 		diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty));
-	else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_DMD_GR)
+	else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_DMD_GR || opt_algo == ALGO_FRESH)
 		diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty));
 	else
 		diff_to_target(work->target, sctx->job.diff / opt_difficulty);
@ -918,6 +921,11 @@ static void *miner_thread(void *userdata)
				@@ -918,6 +921,11 @@ static void *miner_thread(void *userdata)
 			                      max_nonce, &hashes_done);
 			break;

+		case ALGO_FRESH:
+			rc = scanhash_fresh(thr_id, work.data, work.target,
+			                      max_nonce, &hashes_done);
+			break;
+
 		case ALGO_NIST5:
 			rc = scanhash_nist5(thr_id, work.data, work.target,
 			                      max_nonce, &hashes_done);
--- a/miner.h
+++ b/miner.h
@ -234,6 +234,10 @@ extern int scanhash_anime(int thr_id, uint32_t *pdata,
				@@ -234,6 +234,10 @@ extern int scanhash_anime(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);

+extern int scanhash_fresh(int thr_id, uint32_t *pdata,
+	const uint32_t *ptarget, uint32_t max_nonce,
+	unsigned long *hashes_done);
+
 extern int scanhash_nist5(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
@ -355,6 +359,7 @@ void fugue256_hash(unsigned char* output, const unsigned char* input, int len);
				@@ -355,6 +359,7 @@ void fugue256_hash(unsigned char* output, const unsigned char* input, int len);
 void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
 void groestlhash(void *state, const void *input);
 void myriadhash(void *state, const void *input);
+void fresh_hash(void *state, const void *input);
 void nist5hash(void *state, const void *input);
 void quarkhash(void *state, const void *input);
 void x11hash(void *output, const void *input);
--- a/quark/cuda_quark_groestl512.cu
+++ b/quark/cuda_quark_groestl512.cu
@ -5,6 +5,9 @@
				@@ -5,6 +5,9 @@

 #include "cuda_helper.h"

+#define TPB 256
+#define THF 4
+
 // aus cpu-miner.c
 extern int device_map[8];

@ -18,7 +21,7 @@ static cudaDeviceProp props[8];
				@@ -18,7 +21,7 @@ static cudaDeviceProp props[8];
 #include "groestl_functions_quad.cu"
 #include "bitslice_transformations_quad.cu"

-__global__ __launch_bounds__(256, 4)
+__global__ __launch_bounds__(TPB, THF)
 void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
 {
    // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
@ -60,7 +63,7 @@ void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32
				@@ -60,7 +63,7 @@ void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32
    }
 }

-__global__ void __launch_bounds__(256, 4)
+__global__ void __launch_bounds__(TPB, THF)
 quark_doublegroestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
 {
    int thread = (blockDim.x * blockIdx.x + threadIdx.x)>>2;
@ -125,11 +128,11 @@ __host__ void quark_groestl512_cpu_init(int thr_id, int threads)
				@@ -125,11 +128,11 @@ __host__ void quark_groestl512_cpu_init(int thr_id, int threads)

 __host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
-    int threadsperblock = 256;
+    int threadsperblock = TPB;

    // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
    // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
-    const int factor = 4;
+    const int factor = THF;

    // berechne wie viele Thread Blocks wir brauchen
    dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
@ -146,11 +149,11 @@ __host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t sta
				@@ -146,11 +149,11 @@ __host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t sta

 __host__ void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
-    int threadsperblock = 256;
+    int threadsperblock = TPB;

    // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
    // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
-    const int factor = 4;
+    const int factor = THF;

    // berechne wie viele Thread Blocks wir brauchen
    dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
--- a/util.c
+++ b/util.c
@ -1360,6 +1360,10 @@ void print_hash_tests(void)
				@@ -1360,6 +1360,10 @@ void print_hash_tests(void)
 	quarkhash(&hash[0], &buf[0]);
 	printf("\nquark:   "); print_hash(hash);

+	memset(hash, 0, sizeof hash);
+	fresh_hash(&hash[0], &buf[0]);
+	printf("\nfresh:   "); print_hash(hash);
+
 	memset(hash, 0, sizeof hash);
 	x11hash(&hash[0], &buf[0]);
 	printf("\nX11:     "); print_hash(hash);
--- a/x11/cuda_x11_shavite512.cu
+++ b/x11/cuda_x11_shavite512.cu
@ -1,10 +1,11 @@
				@@ -1,10 +1,11 @@
 #include "cuda_helper.h"

+#define TPB 256
+
 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);

-//typedef unsigned char BitSequence;
-//typedef unsigned long long DataLength;
+__constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding)

 __device__ __constant__
 static const uint32_t d_ShaviteInitVector[16] = {
@ -16,7 +17,8 @@ static const uint32_t d_ShaviteInitVector[16] = {
				@@ -16,7 +17,8 @@ static const uint32_t d_ShaviteInitVector[16] = {

 #include "cuda_x11_aes.cu"

-static __device__ __forceinline__ void AES_ROUND_NOKEY(
+__device__ __forceinline__
+static void AES_ROUND_NOKEY(
 	const uint32_t* __restrict__ sharedMemory,
 	uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3)
 {
@ -31,7 +33,8 @@ static __device__ __forceinline__ void AES_ROUND_NOKEY(
				@@ -31,7 +33,8 @@ static __device__ __forceinline__ void AES_ROUND_NOKEY(
 	x3 = y3;
 }

-static __device__ __forceinline__ void KEY_EXPAND_ELT(
+__device__ __forceinline__
+static void KEY_EXPAND_ELT(
 	const uint32_t* __restrict__ sharedMemory,
 	uint32_t &k0, uint32_t &k1, uint32_t &k2, uint32_t &k3)
 {
@ -46,8 +49,8 @@ static __device__ __forceinline__ void KEY_EXPAND_ELT(
				@@ -46,8 +49,8 @@ static __device__ __forceinline__ void KEY_EXPAND_ELT(
 	k3 = y0;
 }

-static __device__ void
-c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg)
+__device__
+static void c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg, uint32_t count)
 {
 	uint32_t p0, p1, p2, p3, p4, p5, p6, p7;
 	uint32_t p8, p9, pA, pB, pC, pD, pE, pF;
@ -56,7 +59,7 @@ c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg)
				@@ -56,7 +59,7 @@ c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg)
 	uint32_t rk08, rk09, rk0A, rk0B, rk0C, rk0D, rk0E, rk0F;
 	uint32_t rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
 	uint32_t rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;
-	const uint32_t counter = 512;
+	const uint32_t counter = count;

 	p0 = state[0x0];
 	p1 = state[0x1];
@ -1291,8 +1294,7 @@ c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg)
				@@ -1291,8 +1294,7 @@ c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg)
 	state[0xF] ^= p7;
 }

-
-// Die Hash-Funktion
+// GPU Hash
 __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
 {
 	__shared__ uint32_t sharedMemory[1024];
@ -1305,7 +1307,7 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
				@@ -1305,7 +1307,7 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
 		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);

 		int hashPosition = nounce - startNounce;
-		uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];
+		uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];

 		// kopiere init-state
 		uint32_t state[16];
@ -1334,7 +1336,7 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
				@@ -1334,7 +1336,7 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
 		msg[30] = 0;
 		msg[31] = 0x02000000;

-		c512(sharedMemory, state, msg);
+		c512(sharedMemory, state, msg, 512);

 		#pragma unroll 16
 		for(int i=0;i<16;i++)
@ -1342,8 +1344,46 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
				@@ -1342,8 +1344,46 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
 	}
 }

+// SHAvite-512 kernel for the 80-byte block header held in c_PaddedMessage80.
+// Every thread whose index is below `threads` hashes the header with nonce
+// (startNounce + thread) and stores its 16-word state to outputHash at word
+// offset 16 * thread.
+//   threads     - number of nonces to process in this launch
+//   startNounce - nonce assigned to thread 0
+//   outputHash  - output buffer, written as 16 uint32_t words per thread
+__global__ void x11_shavite512_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash)
+{
+	__shared__ uint32_t sharedMemory[1024];
+
+	// prepares the shared table handed to c512 below
+	// (aes_gpu_init is not shown here; presumably from the included cuda_x11_aes.cu)
+	aes_gpu_init(sharedMemory);
+
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		const uint32_t nounce = startNounce + thread;
+
+		// copy the init state
+		uint32_t state[16];
+
+		#pragma unroll 16
+		for(int i=0;i<16;i++) {
+			state[i] = d_ShaviteInitVector[i];}
+
+		uint32_t msg[32];
+
+		// start from the header uploaded by x11_shavite512_setBlock_80 ...
+		#pragma unroll 32
+		for(int i=0;i<32;i++) {
+			msg[i] = c_PaddedMessage80[i];
+		}
+		// ... then patch in this thread's nonce (word 19, byte-swapped) and the padding words
+		msg[19] = cuda_swab32(nounce);
+		// NOTE(review): presumably the padding marker right after the 80-byte message - confirm
+		msg[20] = 0x80;
+		// NOTE(review): upper 16 bits are 0x0280 = 640, the same bit count passed to c512;
+		// looks like the message-length field - confirm against the SHAvite-3 padding rules
+		msg[27] = 0x2800000;
+		// NOTE(review): upper 16 bits are 0x0200 = 512, presumably the digest size in bits - confirm
+		msg[31] = 0x2000000;
+
+		// 640 = 80 bytes * 8: value used as the counter inside c512
+		c512(sharedMemory, state, msg, 640);
+
+		uint32_t *outHash = (uint32_t *)outputHash + 16 * thread;
+
+		#pragma unroll 16
+		for(int i=0;i<16;i++)
+			outHash[i] = state[i];
+
+	} //thread < threads
+}

-// Setup-Funktionen
 __host__ void x11_shavite512_cpu_init(int thr_id, int threads)
 {
 	aes_cpu_init();
@ -1351,15 +1391,40 @@ __host__ void x11_shavite512_cpu_init(int thr_id, int threads)
				@@ -1351,15 +1391,40 @@ __host__ void x11_shavite512_cpu_init(int thr_id, int threads)

 __host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
-	const int threadsperblock = 256;
+	const int threadsperblock = TPB;

 	// berechne wie viele Thread Blocks wir brauchen
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);

-	// Größe des dynamischen Shared Memory Bereichs
 	size_t shared_size = 0;

 	x11_shavite512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
 	MyStreamSynchronize(NULL, order, thr_id);
 }
+
+// Host-side launcher for x11_shavite512_gpu_hash_80.
+// Starts one thread per nonce in blocks of TPB threads and then calls
+// MyStreamSynchronize with the given order and thr_id.
+//   threads      - number of nonces to hash
+//   startNounce  - nonce handled by the first thread
+//   d_outputHash - device buffer receiving 16 words per nonce
+__host__ void x11_shavite512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
+{
+	// number of blocks needed to cover all threads, rounded up
+	const int blockSize = TPB;
+	const int blockCount = (threads + blockSize - 1) / blockSize;
+
+	const dim3 gridDims(blockCount);
+	const dim3 blockDims(blockSize);
+
+	// no dynamic shared memory is requested for this launch
+	x11_shavite512_gpu_hash_80<<<gridDims, blockDims, 0>>>(threads, startNounce, d_outputHash);
+	MyStreamSynchronize(NULL, order, thr_id);
+}
+
+// Uploads the 80-byte block header, zero-padded to 128 bytes, into the
+// c_PaddedMessage80 constant array. Only the nonce (from byte 76 on) still
+// has to be filled in afterwards.
+//   pdata - pointer to at least 80 readable bytes of header data
+__host__ void x11_shavite512_setBlock_80(void *pdata)
+{
+	// 128 bytes == 32 uint32_t words, the size of c_PaddedMessage80
+	unsigned char paddedHeader[128] = { 0 };
+	memcpy(paddedHeader, pdata, 80);
+
+	cudaMemcpyToSymbol(c_PaddedMessage80, paddedHeader, sizeof(paddedHeader), 0, cudaMemcpyHostToDevice);
+}
+
--- a/x11/fresh.cu
+++ b/x11/fresh.cu
@ -0,0 +1,169 @@
				@@ -0,0 +1,169 @@
+/**
+ * Fresh algorithm
+ */
+extern "C" {
+#include "sph/sph_shavite.h"
+#include "sph/sph_simd.h"
+#include "sph/sph_echo.h"
+#include "miner.h"
+#include "cuda_helper.h"
+}
+
+// to test gpu hash on a null buffer
+#define NULLTEST 0
+
+static uint32_t *d_hash[8];
+
+extern int device_map[8];
+extern bool opt_benchmark;
+
+extern void x11_shavite512_cpu_init(int thr_id, int threads);
+extern void x11_shavite512_setBlock_80(void *pdata);
+extern void x11_shavite512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);
+extern void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void x11_simd512_cpu_init(int thr_id, int threads);
+extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void x11_echo512_cpu_init(int thr_id, int threads);
+extern void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void cuda_check_cpu_init(int thr_id, int threads);
+extern void cuda_check_cpu_setTarget(const void *ptarget);
+extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
+
+extern void quark_compactTest_cpu_init(int thr_id, int threads);
+extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
+											uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse,
+											int order);
+
+// CPU reference implementation of the Fresh hash.
+// Chain: shavite512(80-byte input) -> simd512 -> shavite512 -> simd512 -> echo512.
+//   state - output buffer; receives the first 32 bytes of the final digest
+//   input - 80 bytes of input data
+extern "C" void fresh_hash(void *state, const void *input)
+{
+	sph_shavite512_context ctx_shavite;
+	sph_simd512_context ctx_simd;
+	sph_echo512_context ctx_echo;
+
+	// Two 64-byte digest slots used alternately. Plain local pointers are used
+	// instead of #define aliases so that nothing leaks into the rest of the
+	// translation unit and no unparenthesised "hash+64" expansion can misparse.
+	unsigned char hash[128];
+	unsigned char *hashA = hash;
+	unsigned char *hashB = hash + 64;
+
+	memset(hash, 0, sizeof hash);
+
+	sph_shavite512_init(&ctx_shavite);
+	sph_shavite512(&ctx_shavite, input, 80);
+	sph_shavite512_close(&ctx_shavite, hashA);
+
+	sph_simd512_init(&ctx_simd);
+	sph_simd512(&ctx_simd, hashA, 64);
+	sph_simd512_close(&ctx_simd, hashB);
+
+	sph_shavite512_init(&ctx_shavite);
+	sph_shavite512(&ctx_shavite, hashB, 64);
+	sph_shavite512_close(&ctx_shavite, hashA);
+
+	sph_simd512_init(&ctx_simd);
+	sph_simd512(&ctx_simd, hashA, 64);
+	sph_simd512_close(&ctx_simd, hashB);
+
+	sph_echo512_init(&ctx_echo);
+	sph_echo512(&ctx_echo, hashB, 64);
+	sph_echo512_close(&ctx_echo, hashA);
+
+	// the final digest sits in the first slot; only its first 32 bytes are returned
+	memcpy(state, hashA, 32);
+}
+
+#if NULLTEST
+// Debug helper (compiled only when NULLTEST is set): prints the first 32
+// bytes of hash as lowercase hex, four bytes per group, each group followed
+// by a space.
+static void print_hash(unsigned char *hash)
+{
+	const unsigned char *group = hash;
+	const unsigned char *end = hash + 32;
+
+	while (group < end) {
+		printf("%02x%02x%02x%02x ", group[0], group[1], group[2], group[3]);
+		group += 4;
+	}
+}
+#endif
+
+// Scans a nonce range for the Fresh algorithm on the GPU.
+//   thr_id      - miner thread index; selects the device via device_map and the
+//                 per-thread device hash buffer
+//   pdata       - 20-word block header; pdata[19] is the starting nonce and is
+//                 updated to the found nonce or to the scan position on exit
+//   ptarget     - 8-word target; word 7 is used for the quick pre-check
+//   max_nonce   - scanning stops once pdata[19] reaches this value
+//   hashes_done - receives the number of nonces covered by this call
+// Returns 1 when a nonce was found and validated on the CPU, 0 otherwise.
+extern "C" int scanhash_fresh(int thr_id, uint32_t *pdata,
+	const uint32_t *ptarget, uint32_t max_nonce,
+	unsigned long *hashes_done)
+{
+	const uint32_t first_nonce = pdata[19];
+	const int throughput = 256*256*8; // nonces hashed per GPU batch
+	static bool init[8] = {0,0,0,0,0,0,0,0};
+	uint32_t endiandata[20];
+	uint32_t Htarg = ptarget[7];
+
+	// benchmark mode overrides the target's high word with a fixed value
+	if (opt_benchmark)
+		((uint32_t*)ptarget)[7] = Htarg = 0x0000ff;
+
+#if NULLTEST
+	for (int k=0; k < 20; k++)
+		pdata[k] = 0;
+#endif
+
+	// one-time per-thread setup: select the device, allocate 64 bytes of hash
+	// output per nonce and initialise the individual hash stages
+	if (!init[thr_id])
+	{
+		CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
+
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput + 4));
+
+		x11_shavite512_cpu_init(thr_id, throughput);
+		x11_simd512_cpu_init(thr_id, throughput);
+		x11_echo512_cpu_init(thr_id, throughput);
+
+		cuda_check_cpu_init(thr_id, throughput);
+
+		init[thr_id] = true;
+	}
+
+	for (int k=0; k < 20; k++)
+		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
+
+	x11_shavite512_setBlock_80((void*)endiandata);
+	cuda_check_cpu_setTarget(ptarget);
+	do {
+		uint32_t foundNonce;
+		int order = 0;
+
+		// GPU Hash: shavite(80) -> simd -> shavite -> simd -> echo, all in d_hash
+		x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+
+#if NULLTEST
+		uint32_t buf[8]; memset(buf, 0, sizeof buf);
+		CUDA_SAFE_CALL(cudaMemcpy(buf, d_hash[thr_id], sizeof buf, cudaMemcpyDeviceToHost));
+		CUDA_SAFE_CALL(cudaThreadSynchronize());
+		print_hash((unsigned char*)buf); printf("\n");
+#endif
+
+		foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		if (foundNonce != 0xffffffff)
+		{
+			// re-hash the candidate on the CPU before reporting it
+			uint32_t vhash64[8];
+			be32enc(&endiandata[19], foundNonce);
+			fresh_hash(vhash64, endiandata);
+
+			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
+				pdata[19] = foundNonce;
+				*hashes_done = foundNonce - first_nonce + 1;
+				return 1;
+			}
+			else if (vhash64[7] > Htarg) {
+				applog(LOG_INFO, "GPU #%d: result for %08x is not in range: %x > %x", thr_id, foundNonce, vhash64[7], Htarg);
+			}
+			else {
+				applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce);
+			}
+		}
+
+		// Advance to the next batch. The sum is checked in 64 bits first: a
+		// 32-bit wrap-around would restart at a low nonce, keep the loop
+		// condition true and corrupt the hashes_done count. On overflow the
+		// range is treated as exhausted.
+		if ((uint64_t)pdata[19] + (uint64_t)throughput > 0xffffffffULL) {
+			pdata[19] = max_nonce;
+			break;
+		}
+		pdata[19] += throughput;
+
+	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+
+	*hashes_done = pdata[19] - first_nonce + 1;
+	return 0;
+}