Add fresh algo (based on djm34 code)

Cleaned up and adapted to my changes (cputest added). Remove Makefile.in, which should be listed in .gitignore (please regenerate it with ./config.sh to compile on Linux).
2025-08-26 05:42:02 +00:00 · 2014-08-19 15:34:15 +02:00 · 2014-08-19 15:34:15 +02:00 · bc2eb75758
commit bc2eb75758
parent d9ea5f72ce
11 changed files with 301 additions and 1514 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,5 +1,5 @@
-minerd
+ccminer
 *.o
 autom4te.cache
@ -26,6 +26,9 @@ config.sub
 mingw32-config.cache
 */.dirstamp
 .DS_Store
 Desktop.ini
 Thumbs.db
 *.iml
@ -33,5 +36,14 @@ Debug/
 Release/
 x64/Debug/
 x64/Release/
-ccminer.*.suo
+*.suo
 *.user
 .settings/
 .project
 .metadata
 .classpath
 .loadpath
 .cproject
 .buildpath
--- a/Makefile.am
+++ b/Makefile.am
@ -37,9 +37,9 @@ ccminer_SOURCES		= elist.h miner.h compat.h \
 			  sph/cubehash.c sph/echo.c sph/luffa.c sph/shavite.c sph/simd.c \
 			  sph/hamsi.c sph/hamsi_helper.c sph/sph_hamsi.h \
 			  sph/shabal.c sph/whirlpool.c \
-			  x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \
+			  x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
 			  x11/x11.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
 			  x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \
 			  x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \
 			  x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu
 ccminer_LDFLAGS		= $(PTHREAD_FLAGS) @CUDA_LDFLAGS@
--- a/Makefile.in
+++ b/Makefile.in
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -503,6 +503,12 @@ copy "$(CudaToolkitBinDir)\cudart64*.dll" "$(OutDir)"</Command>
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">%(AdditionalOptions)</AdditionalOptions>
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(AdditionalOptions)</AdditionalOptions>
    </CudaCompile>
    <CudaCompile Include="x11\fresh.cu">
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">--ptxas-options=-O3 %(AdditionalOptions)</AdditionalOptions>
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">%(AdditionalOptions)</AdditionalOptions>
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|x64'">--ptxas-options=-O3 %(AdditionalOptions)</AdditionalOptions>
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">%(AdditionalOptions)</AdditionalOptions>
    </CudaCompile>
    <CudaCompile Include="x11\simd_functions.cu">
      <ExcludedFromBuild>true</ExcludedFromBuild>
      <AdditionalOptions Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">%(AdditionalOptions)</AdditionalOptions>
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@ -361,6 +361,9 @@
    <CudaCompile Include="x11\cuda_x11_simd512.cu">
      <Filter>Source Files\CUDA\x11</Filter>
    </CudaCompile>
    <CudaCompile Include="x11\fresh.cu">
      <Filter>Source Files\CUDA\x11</Filter>
    </CudaCompile>
    <CudaCompile Include="x11\x11.cu">
      <Filter>Source Files\CUDA\x11</Filter>
    </CudaCompile>
--- a/cpu-miner.c
+++ b/cpu-miner.c
@ -132,6 +132,7 @@ typedef enum {
 	ALGO_JACKPOT,
 	ALGO_QUARK,
 	ALGO_ANIME,
 	ALGO_FRESH,
 	ALGO_NIST5,
 	ALGO_X11,
 	ALGO_X13,
@ -149,6 +150,7 @@ static const char *algo_names[] = {
 	"jackpot",
 	"quark",
 	"anime",
 	"fresh",
 	"nist5",
 	"x11",
 	"x13",
@ -225,6 +227,7 @@ Options:\n\
                        jackpot   Jackpot hash\n\
                        quark     Quark hash\n\
                        anime     Animecoin hash\n\
                        fresh     Freshcoin hash (shavite 80)\n\
                        nist5     NIST5 (TalkCoin) hash\n\
                        x11       X11 (DarkCoin) hash\n\
                        x13       X13 (MaruCoin) hash\n\
@ -782,7 +785,7 @@ static void stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 	if (opt_algo == ALGO_JACKPOT)
 		diff_to_target(work->target, sctx->job.diff / (65536.0 * opt_difficulty));
-	else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_DMD_GR)
+	else if (opt_algo == ALGO_FUGUE256 || opt_algo == ALGO_GROESTL || opt_algo == ALGO_DMD_GR || opt_algo == ALGO_FRESH)
 		diff_to_target(work->target, sctx->job.diff / (256.0 * opt_difficulty));
 	else
 		diff_to_target(work->target, sctx->job.diff / opt_difficulty);
@ -918,6 +921,11 @@ static void *miner_thread(void *userdata)
 			                      max_nonce, &hashes_done);
 			break;
 		case ALGO_FRESH:
 			rc = scanhash_fresh(thr_id, work.data, work.target,
 			                      max_nonce, &hashes_done);
 			break;
 		case ALGO_NIST5:
 			rc = scanhash_nist5(thr_id, work.data, work.target,
 			                      max_nonce, &hashes_done);
--- a/miner.h
+++ b/miner.h
@ -234,6 +234,10 @@ extern int scanhash_anime(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
 extern int scanhash_fresh(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
 extern int scanhash_nist5(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done);
@ -355,6 +359,7 @@ void fugue256_hash(unsigned char* output, const unsigned char* input, int len);
 void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
 void groestlhash(void *state, const void *input);
 void myriadhash(void *state, const void *input);
 void fresh_hash(void *state, const void *input);
 void nist5hash(void *state, const void *input);
 void quarkhash(void *state, const void *input);
 void x11hash(void *output, const void *input);
--- a/quark/cuda_quark_groestl512.cu
+++ b/quark/cuda_quark_groestl512.cu
@ -5,6 +5,9 @@
 #include "cuda_helper.h"
 #define TPB 256
 #define THF 4
 // aus cpu-miner.c
 extern int device_map[8];
@ -18,7 +21,7 @@ static cudaDeviceProp props[8];
 #include "groestl_functions_quad.cu"
 #include "bitslice_transformations_quad.cu"
-__global__ __launch_bounds__(256, 4)
+__global__ __launch_bounds__(TPB, THF)
 void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
 {
    // durch 4 dividieren, weil jeweils 4 Threads zusammen ein Hash berechnen
@ -60,7 +63,7 @@ void quark_groestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32
    }
 }
-__global__ void __launch_bounds__(256, 4)
+__global__ void __launch_bounds__(TPB, THF)
 quark_doublegroestl512_gpu_hash_64_quad(int threads, uint32_t startNounce, uint32_t *g_hash, uint32_t *g_nonceVector)
 {
    int thread = (blockDim.x * blockIdx.x + threadIdx.x)>>2;
@ -125,11 +128,11 @@ __host__ void quark_groestl512_cpu_init(int thr_id, int threads)
 __host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
-    int threadsperblock = 256;
+    int threadsperblock = TPB;
    // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
    // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
-    const int factor = 4;
+    const int factor = THF;
    // berechne wie viele Thread Blocks wir brauchen
    dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
@ -146,11 +149,11 @@ __host__ void quark_groestl512_cpu_hash_64(int thr_id, int threads, uint32_t sta
 __host__ void quark_doublegroestl512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
-    int threadsperblock = 256;
+    int threadsperblock = TPB;
    // Compute 3.0 benutzt die registeroptimierte Quad Variante mit Warp Shuffle
    // mit den Quad Funktionen brauchen wir jetzt 4 threads pro Hash, daher Faktor 4 bei der Blockzahl
-    const int factor = 4;
+    const int factor = THF;
    // berechne wie viele Thread Blocks wir brauchen
    dim3 grid(factor*((threads + threadsperblock-1)/threadsperblock));
--- a/util.c
+++ b/util.c
@ -1360,6 +1360,10 @@ void print_hash_tests(void)
 	quarkhash(&hash[0], &buf[0]);
 	printf("\nquark:   "); print_hash(hash);
 	memset(hash, 0, sizeof hash);
 	fresh_hash(&hash[0], &buf[0]);
 	printf("\nfresh:   "); print_hash(hash);
 	memset(hash, 0, sizeof hash);
 	x11hash(&hash[0], &buf[0]);
 	printf("\nX11:     "); print_hash(hash);
--- a/x11/cuda_x11_shavite512.cu
+++ b/x11/cuda_x11_shavite512.cu
@ -1,10 +1,11 @@
 #include "cuda_helper.h"
 #define TPB 256
 // aus heavy.cu
 extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id);
-//typedef unsigned char BitSequence;
+__constant__ uint32_t c_PaddedMessage80[32]; // padded message (80 bytes + padding)
 //typedef unsigned long long DataLength;
 __device__ __constant__
 static const uint32_t d_ShaviteInitVector[16] = {
@ -16,7 +17,8 @@ static const uint32_t d_ShaviteInitVector[16] = {
 #include "cuda_x11_aes.cu"
-static __device__ __forceinline__ void AES_ROUND_NOKEY(
+__device__ __forceinline__
 static void AES_ROUND_NOKEY(
 	const uint32_t* __restrict__ sharedMemory,
 	uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3)
 {
@ -31,7 +33,8 @@ static __device__ __forceinline__ void AES_ROUND_NOKEY(
 	x3 = y3;
 }
-static __device__ __forceinline__ void KEY_EXPAND_ELT(
+__device__ __forceinline__
 static void KEY_EXPAND_ELT(
 	const uint32_t* __restrict__ sharedMemory,
 	uint32_t &k0, uint32_t &k1, uint32_t &k2, uint32_t &k3)
 {
@ -46,8 +49,8 @@ static __device__ __forceinline__ void KEY_EXPAND_ELT(
 	k3 = y0;
 }
-static __device__ void
+__device__
-c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg)
+static void c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg, uint32_t count)
 {
 	uint32_t p0, p1, p2, p3, p4, p5, p6, p7;
 	uint32_t p8, p9, pA, pB, pC, pD, pE, pF;
@ -56,7 +59,7 @@ c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg)
 	uint32_t rk08, rk09, rk0A, rk0B, rk0C, rk0D, rk0E, rk0F;
 	uint32_t rk10, rk11, rk12, rk13, rk14, rk15, rk16, rk17;
 	uint32_t rk18, rk19, rk1A, rk1B, rk1C, rk1D, rk1E, rk1F;
-	const uint32_t counter = 512;
+	const uint32_t counter = count;
 	p0 = state[0x0];
 	p1 = state[0x1];
@ -1291,8 +1294,7 @@ c512(const uint32_t* sharedMemory, uint32_t *state, uint32_t *msg)
 	state[0xF] ^= p7;
 }
-
+// GPU Hash
 // Die Hash-Funktion
 __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector)
 {
 	__shared__ uint32_t sharedMemory[1024];
@ -1305,7 +1307,7 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
 		uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread);
 		int hashPosition = nounce - startNounce;
-		uint32_t *Hash = (uint32_t*)&g_hash[8 * hashPosition];
+		uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
 		// kopiere init-state
 		uint32_t state[16];
@ -1320,7 +1322,7 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
 		// fülle die Nachricht mit 64-byte (vorheriger Hash)
 		#pragma unroll 16
 		for(int i=0;i<16;i++)
-			msg[i] = Hash[i];			
+			msg[i] = Hash[i];
 		// Nachrichtenende
 		msg[16] = 0x80;
@ -1334,7 +1336,7 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
 		msg[30] = 0;
 		msg[31] = 0x02000000;
-		c512(sharedMemory, state, msg);
+		c512(sharedMemory, state, msg, 512);
 		#pragma unroll 16
 		for(int i=0;i<16;i++)
@ -1342,8 +1344,46 @@ __global__ void x11_shavite512_gpu_hash_64(int threads, uint32_t startNounce, ui
 	}
 }
 // ShaVite-512 over the 80-byte padded message held in c_PaddedMessage80.
 // Each thread with index < threads hashes the message with nonce
 // (startNounce + thread) and stores its 16-word state at outputHash[16 * thread].
 __global__ void x11_shavite512_gpu_hash_80(int threads, uint32_t startNounce, void *outputHash)
 {
 	// table storage handed to aes_gpu_init() and later to c512()
 	__shared__ uint32_t sharedMemory[1024];
 	aes_gpu_init(sharedMemory);
 	const int thread = blockDim.x * blockIdx.x + threadIdx.x;
 	// threads beyond the requested count have nothing to hash
 	if (thread >= threads)
 		return;
 	// start from the ShaVite initial state
 	uint32_t state[16];
 	#pragma unroll 16
 	for (int s = 0; s < 16; s++)
 		state[s] = d_ShaviteInitVector[s];
 	// private copy of the shared padded message
 	uint32_t msg[32];
 	#pragma unroll 32
 	for (int w = 0; w < 32; w++)
 		msg[w] = c_PaddedMessage80[w];
 	// word 19 (byte offset 76) carries the per-thread nonce, byte-swapped
 	msg[19] = cuda_swab32(startNounce + thread);
 	// end-of-message marker directly after the 80 data bytes
 	msg[20] = 0x80;
 	// NOTE(review): presumably the encoded bit length (0x280 = 640) and digest size (0x200 = 512) - confirm against the ShaVite padding spec
 	msg[27] = 0x2800000;
 	msg[31] = 0x2000000;
 	// 640 = 80 bytes * 8 bits, passed as the counter value
 	c512(sharedMemory, state, msg, 640);
 	uint32_t *outHash = &((uint32_t *)outputHash)[16 * thread];
 	#pragma unroll 16
 	for (int s = 0; s < 16; s++)
 		outHash[s] = state[s];
 }
 // Setup-Funktionen
 __host__ void x11_shavite512_cpu_init(int thr_id, int threads)
 {
 	aes_cpu_init();
@ -1351,15 +1391,40 @@ __host__ void x11_shavite512_cpu_init(int thr_id, int threads)
 // Host launcher for x11_shavite512_gpu_hash_64: starts ceil(threads / threadsperblock)
 // blocks of threadsperblock threads, then calls MyStreamSynchronize(NULL, order, thr_id).
 __host__ void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order)
 {
-	const int threadsperblock = 256;
+	const int threadsperblock = TPB;
 	// compute how many thread blocks we need
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 	// size of the dynamic shared memory region
 	size_t shared_size = 0;
 	x11_shavite512_gpu_hash_64<<<grid, block, shared_size>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
 	MyStreamSynchronize(NULL, order, thr_id);
 }
 // Host launcher for x11_shavite512_gpu_hash_80.
 // Starts ceil(threads / TPB) blocks of TPB threads with no dynamic shared
 // memory, then calls MyStreamSynchronize(NULL, order, thr_id).
 __host__ void x11_shavite512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
 {
 	const int blockSize = TPB;
 	// round up so a partially filled last block still covers the tail
 	const int blockCount = (threads + blockSize - 1) / blockSize;
 	dim3 grid(blockCount);
 	dim3 block(blockSize);
 	const size_t dynamicSharedBytes = 0;
 	x11_shavite512_gpu_hash_80<<<grid, block, dynamicSharedBytes>>>(threads, startNounce, d_outputHash);
 	MyStreamSynchronize(NULL, order, thr_id);
 }
 // Uploads the block header to the c_PaddedMessage80 constant buffer.
 // pdata: at least 80 readable bytes; they are copied to the front of a
 // zero-filled 128-byte buffer. Only the correct nonce still has to be
 // inserted from byte 76 onwards (done per thread on the device).
 __host__ void x11_shavite512_setBlock_80(void *pdata)
 {
 	unsigned char PaddedMessage[32 * sizeof(uint32_t)];
 	// zero everything first, then overlay the 80 header bytes
 	memset(PaddedMessage, 0, sizeof(PaddedMessage));
 	memcpy(PaddedMessage, pdata, 80);
 	cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, sizeof(PaddedMessage), 0, cudaMemcpyHostToDevice);
 }
--- a/x11/fresh.cu
+++ b/x11/fresh.cu
@ -0,0 +1,169 @@
 /**
 * Fresh algorithm
 */
 extern "C" {
 #include "sph/sph_shavite.h"
 #include "sph/sph_simd.h"
 #include "sph/sph_echo.h"
 #include "miner.h"
 #include "cuda_helper.h"
 }
 // to test gpu hash on a null buffer
 #define NULLTEST 0
 static uint32_t *d_hash[8];
 extern int device_map[8];
 extern bool opt_benchmark;
 extern void x11_shavite512_cpu_init(int thr_id, int threads);
 extern void x11_shavite512_setBlock_80(void *pdata);
 extern void x11_shavite512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_hash, int order);
 extern void x11_shavite512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void x11_simd512_cpu_init(int thr_id, int threads);
 extern void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void x11_echo512_cpu_init(int thr_id, int threads);
 extern void x11_echo512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void cuda_check_cpu_init(int thr_id, int threads);
 extern void cuda_check_cpu_setTarget(const void *ptarget);
 extern uint32_t cuda_check_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_inputHash, int order);
 extern void quark_compactTest_cpu_init(int thr_id, int threads);
 extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes,
 											uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse,
 											int order);
 // CPU Hash
 /**
  * CPU implementation of the Fresh hash.
  * Chain: shavite512 (80 input bytes) -> simd512 -> shavite512 -> simd512 -> echo512,
  * every later stage consuming the 64-byte digest of the previous one.
  *
  * @param state  output buffer; receives the first 32 bytes of the final digest
  * @param input  message to hash; exactly 80 bytes are read
  */
 extern "C" void fresh_hash(void *state, const void *input)
 {
 	sph_shavite512_context ctx_shavite;
 	sph_simd512_context ctx_simd;
 	sph_echo512_context ctx_echo;
 	// Two 64-byte digest slots used alternately. Plain local pointers rather than
 	// #define aliases: the names stay scoped to this function and hashB cannot be
 	// mis-parsed inside a larger expression.
 	unsigned char hash[128];
 	unsigned char *const hashA = hash;
 	unsigned char *const hashB = hash + 64;
 	memset(hash, 0, sizeof hash);
 	sph_shavite512_init(&ctx_shavite);
 	sph_shavite512(&ctx_shavite, input, 80);
 	sph_shavite512_close(&ctx_shavite, hashA);
 	sph_simd512_init(&ctx_simd);
 	sph_simd512(&ctx_simd, hashA, 64);
 	sph_simd512_close(&ctx_simd, hashB);
 	sph_shavite512_init(&ctx_shavite);
 	sph_shavite512(&ctx_shavite, hashB, 64);
 	sph_shavite512_close(&ctx_shavite, hashA);
 	sph_simd512_init(&ctx_simd);
 	sph_simd512(&ctx_simd, hashA, 64);
 	sph_simd512_close(&ctx_simd, hashB);
 	sph_echo512_init(&ctx_echo);
 	sph_echo512(&ctx_echo, hashB, 64);
 	sph_echo512_close(&ctx_echo, hashA);
 	// the final digest sits in the first slot; callers only get its first 32 bytes
 	memcpy(state, hash, 32);
 }
 #if NULLTEST
 // Debug helper: prints the first 32 bytes of hash as eight groups of
 // four two-digit hex bytes, each group followed by a space.
 static void print_hash(unsigned char *hash)
 {
 	const unsigned char *const end = hash + 32;
 	for (const unsigned char *p = hash; p < end; p += 4)
 		printf("%02x%02x%02x%02x ", p[0], p[1], p[2], p[3]);
 }
 #endif
 /**
  * Scans nonces for the Fresh algorithm on the GPU mapped to thr_id.
  *
  * Each round hashes `throughput` consecutive nonces starting at pdata[19]
  * (shavite80 -> simd -> shavite -> simd -> echo), lets the device-side check
  * pick a candidate and re-validates that candidate with fresh_hash() on the CPU.
  *
  * @param thr_id       miner thread index; selects device_map[], d_hash[] and init[] slots
  * @param pdata        20-word block header; pdata[19] is the start nonce and is updated
  * @param ptarget      8-word target (word 7 is overwritten when opt_benchmark is set)
  * @param max_nonce    scanning stops once pdata[19] reaches this value
  * @param hashes_done  receives the number of nonces covered by this call
  * @return 1 if a nonce validated on the CPU (stored in pdata[19]), otherwise 0
  */
 extern "C" int scanhash_fresh(int thr_id, uint32_t *pdata,
 	const uint32_t *ptarget, uint32_t max_nonce,
 	unsigned long *hashes_done)
 {
 	const uint32_t first_nonce = pdata[19];
 	const int throughput = 256*256*8;
 	static bool init[8] = {0,0,0,0,0,0,0,0};
 	uint32_t endiandata[20];
 	uint32_t Htarg = ptarget[7];
 	// benchmark mode uses a fixed, easy target
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = Htarg = 0x0000ff;
 #if NULLTEST
 	for (int k=0; k < 20; k++)
 		pdata[k] = 0;
 #endif
 	// one-time per-thread setup: select the device, allocate 64 bytes per hash, init the stages
 	if (!init[thr_id])
 	{
 		CUDA_SAFE_CALL(cudaSetDevice(device_map[thr_id]));
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput + 4));
 		x11_shavite512_cpu_init(thr_id, throughput);
 		x11_simd512_cpu_init(thr_id, throughput);
 		x11_echo512_cpu_init(thr_id, throughput);
 		cuda_check_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
 	}
 	// header words are handed to the hash functions in big-endian encoding
 	for (int k=0; k < 20; k++)
 		be32enc(&endiandata[k], ((uint32_t*)pdata)[k]);
 	x11_shavite512_setBlock_80((void*)endiandata);
 	cuda_check_cpu_setTarget(ptarget);
 	do {
 		uint32_t foundNonce;
 		int order = 0;
 		// GPU Hash
 		x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 #if NULLTEST
 		uint32_t buf[8]; memset(buf, 0, sizeof buf);
 		CUDA_SAFE_CALL(cudaMemcpy(buf, d_hash[thr_id], sizeof buf, cudaMemcpyDeviceToHost));
 		CUDA_SAFE_CALL(cudaThreadSynchronize());
 		print_hash((unsigned char*)buf); printf("\n");
 #endif
 		foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		// 0xffffffff is the "nothing found" marker
 		if (foundNonce != 0xffffffff)
 		{
 			// recompute the candidate on the CPU before reporting it
 			uint32_t vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
 			fresh_hash(vhash64, endiandata);
 			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
 				pdata[19] = foundNonce;
 				*hashes_done = foundNonce - first_nonce + 1;
 				return 1;
 			}
 			else if (vhash64[7] > Htarg) {
 				applog(LOG_INFO, "GPU #%d: result for %08x is not in range: %x > %x", thr_id, foundNonce, vhash64[7], Htarg);
 			}
 			else {
 				applog(LOG_INFO, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce);
 			}
 		}
 		// Advance in 64 bits: a plain 32-bit add wraps past 0xffffffff when the range
 		// ends near the top of the nonce space, which would keep the loop condition
 		// true and rescan nonces that were already covered.
 		const uint64_t next_nonce = (uint64_t)pdata[19] + throughput;
 		if (next_nonce > 0xffffffffULL) {
 			pdata[19] = max_nonce;
 			break;
 		}
 		pdata[19] = (uint32_t)next_nonce;
 	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }