From 03c3b7d341d4b5e63c59650427a47548ad7659c9 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Sun, 10 May 2015 18:14:06 +0200
Subject: [PATCH] Various algos cleanup + lyra2 sec nonce fix

---
 Algo256/cuda_fugue256.cu   | 19 ++++++++-----------
 Algo256/cuda_groestl256.cu |  7 +++++--
 Algo256/keccak256.cu       | 18 ++++++++----------
 cuda_groestlcoin.cu        | 22 +++++++++++-----------
 cuda_myriadgroestl.cu      | 15 ++++++++-------
 fuguecoin.cpp              |  9 ++++++---
 groestlcoin.cpp            | 21 ++++++++++-----------
 lyra2/lyra2RE.cu           | 10 ++++------
 myriadgroestl.cpp          | 26 +++++++++++---------------
 9 files changed, 71 insertions(+), 76 deletions(-)

diff --git a/Algo256/cuda_fugue256.cu b/Algo256/cuda_fugue256.cu
index bb72752..7bcbfcf 100644
--- a/Algo256/cuda_fugue256.cu
+++ b/Algo256/cuda_fugue256.cu
@@ -9,7 +9,7 @@
 #define USE_SHARED 1
 
 uint32_t *d_fugue256_hashoutput[MAX_GPUS];
-uint32_t *d_resultNonce[MAX_GPUS];
+static uint32_t *d_resultNonce[MAX_GPUS];
 
 __constant__ uint32_t GPUstate[30]; // Single GPU
 __constant__ uint32_t pTarget[8]; // Single GPU
@@ -718,10 +718,9 @@ fugue256_gpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outp
 	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); }
 
 
+__host__
 void fugue256_cpu_init(int thr_id, uint32_t threads)
 {
-	cudaSetDevice(device_map[thr_id]);
-
 	// Kopiere die Hash-Tabellen in den GPU-Speicher
 	texDef(mixTab0Tex, mixTab0m, mixtab0_cpu, sizeof(uint32_t)*256);
 	texDef(mixTab1Tex, mixTab1m, mixtab1_cpu, sizeof(uint32_t)*256);
@@ -733,25 +732,23 @@ void fugue256_cpu_init(int thr_id, uint32_t threads)
 	cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t));
 }
 
-__host__ void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
+__host__
+void fugue256_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
 {
 	// CPU-Vorbereitungen treffen
 	sph_fugue256_context ctx_fugue_const;
 	sph_fugue256_init(&ctx_fugue_const);
 	sph_fugue256 (&ctx_fugue_const, data, 80);	// State speichern
 
-	cudaMemcpyToSymbol(	GPUstate,
-						ctx_fugue_const.S,
-						sizeof(uint32_t) * 30 );
+	cudaMemcpyToSymbol(GPUstate, ctx_fugue_const.S, sizeof(uint32_t) * 30);
 
-	cudaMemcpyToSymbol(	pTarget,
-						pTargetIn,
-						sizeof(uint32_t) * 8 );
+	cudaMemcpyToSymbol(pTarget, pTargetIn, sizeof(uint32_t) * 8);
 
 	cudaMemset(d_resultNonce[thr_id], 0xFF, sizeof(uint32_t));
 }
 
-__host__ void fugue256_cpu_hash(int thr_id, uint32_t threads, int startNounce, void *outputHashes, uint32_t *nounce)
+__host__
+void fugue256_cpu_hash(int thr_id, uint32_t threads, int startNounce, void *outputHashes, uint32_t *nounce)
 {
 #if USE_SHARED
 	const uint32_t threadsperblock = 256; // Alignment mit mixtab Grösse. NICHT ÄNDERN
diff --git a/Algo256/cuda_groestl256.cu b/Algo256/cuda_groestl256.cu
index 865767e..1e8bf61 100644
--- a/Algo256/cuda_groestl256.cu
+++ b/Algo256/cuda_groestl256.cu
@@ -280,8 +280,8 @@ void groestl256_cpu_init(int thr_id, uint32_t threads)
 __host__
 uint32_t groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
 {
-	uint32_t result = 0xffffffff;
-	cudaMemset(d_GNonces[thr_id], 0xff, sizeof(uint32_t));
+	uint32_t result = UINT32_MAX;
+	cudaMemset(d_GNonces[thr_id], 0xff, 2*sizeof(uint32_t));
 	const uint32_t threadsperblock = 256;
 
 	// berechne wie viele Thread Blocks wir brauchen
@@ -308,7 +308,10 @@ __host__
 uint32_t groestl256_getSecNonce(int thr_id, int num)
 {
 	uint32_t results[2];
+	memset(results, 0xFF, sizeof(results));
 	cudaMemcpy(results, d_GNonces[thr_id], sizeof(results), cudaMemcpyDeviceToHost);
+	if (results[1] == results[0])
+		return UINT32_MAX;
 	return results[num];
 }
 
diff --git a/Algo256/keccak256.cu b/Algo256/keccak256.cu
index 17c57c9..ecb181e 100644
--- a/Algo256/keccak256.cu
+++ b/Algo256/keccak256.cu
@@ -23,10 +23,9 @@ extern uint32_t keccak256_cpu_hash_80(int thr_id, uint32_t threads, uint32_t sta
 // CPU Hash
 extern "C" void keccak256_hash(void *state, const void *input)
 {
+	uint32_t _ALIGN(64) hash[16];
 	sph_keccak_context ctx_keccak;
 
-	uint32_t hash[16];
-
 	sph_keccak256_init(&ctx_keccak);
 	sph_keccak256 (&ctx_keccak, input, 80);
 	sph_keccak256_close(&ctx_keccak, (void*) hash);
@@ -50,8 +49,8 @@ extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata,
 	if (!init[thr_id]) {
 		cudaSetDevice(device_map[thr_id]);
 
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput));
-		keccak256_cpu_init(thr_id, (int) throughput);
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], throughput * 64));
+		keccak256_cpu_init(thr_id, throughput);
 
 		init[thr_id] = true;
 	}
@@ -65,16 +64,16 @@ extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata,
 	do {
 		int order = 0;
 
-		uint32_t foundNonce = keccak256_cpu_hash_80(thr_id, (int) throughput, pdata[19], d_hash[thr_id], order++);
+		*hashes_done = pdata[19] - first_nonce + throughput;
+
+		uint32_t foundNonce = keccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		if (foundNonce != UINT32_MAX)
 		{
-			uint32_t Htarg = ptarget[7];
-			uint32_t vhash64[8];
+			uint32_t _ALIGN(64) vhash64[8];
 			be32enc(&endiandata[19], foundNonce);
 			keccak256_hash(vhash64, endiandata);
 
-			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
-				*hashes_done = foundNonce - first_nonce + 1;
+			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
 				pdata[19] = foundNonce;
 				return 1;
 			}
@@ -91,6 +90,5 @@ extern "C" int scanhash_keccak256(int thr_id, uint32_t *pdata,
 
 	} while (!work_restart[thr_id].restart);
 
-	*hashes_done = pdata[19] - first_nonce;
 	return 0;
 }
diff --git a/cuda_groestlcoin.cu b/cuda_groestlcoin.cu
index e580f55..6d57fe8 100644
--- a/cuda_groestlcoin.cu
+++ b/cuda_groestlcoin.cu
@@ -8,10 +8,10 @@
 
 // globaler Speicher für alle HeftyHashes aller Threads
 __constant__ uint32_t pTarget[8]; // Single GPU
-extern uint32_t *d_resultNonce[MAX_GPUS];
-
 __constant__ uint32_t groestlcoin_gpu_msg[32];
 
+static uint32_t *d_resultNonce[MAX_GPUS];
+
 #if __CUDA_ARCH__ >= 300
 // 64 Registers Variant for Compute 3.0+
 #include "quark/groestl_functions_quad.h"
@@ -30,7 +30,8 @@ void groestlcoin_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t
     {
         // GROESTL
         uint32_t paddedInput[8];
-#pragma unroll 8
+
+        #pragma unroll 8
         for(int k=0;k<8;k++) paddedInput[k] = groestlcoin_gpu_msg[4*k+threadIdx.x%4];
 
         uint32_t nounce = startNounce + thread;
@@ -68,7 +69,7 @@ void groestlcoin_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t
             int i, position = -1;
             bool rc = true;
 
-    #pragma unroll 8
+            #pragma unroll 8
             for (i = 7; i >= 0; i--) {
                 if (out_state[i] > pTarget[i]) {
                     if(position < i) {
@@ -92,16 +93,14 @@ void groestlcoin_gpu_hash_quad(uint32_t threads, uint32_t startNounce, uint32_t
 #endif
 }
 
-// Setup-Funktionen
-__host__ void groestlcoin_cpu_init(int thr_id, uint32_t threads)
+__host__
+void groestlcoin_cpu_init(int thr_id, uint32_t threads)
 {
-    cudaSetDevice(device_map[thr_id]);
-
-    // Speicher für Gewinner-Nonce belegen
     cudaMalloc(&d_resultNonce[thr_id], sizeof(uint32_t)); 
 }
 
-__host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
+__host__
+void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
 {
     // Nachricht expandieren und setzen
     uint32_t msgBlock[32];
@@ -128,7 +127,8 @@ __host__ void groestlcoin_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
                         sizeof(uint32_t) * 8 );
 }
 
-__host__ void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
+__host__
+void groestlcoin_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
 {
     uint32_t threadsperblock = 256;
 
diff --git a/cuda_myriadgroestl.cu b/cuda_myriadgroestl.cu
index 0c9acd7..28ea94c 100644
--- a/cuda_myriadgroestl.cu
+++ b/cuda_myriadgroestl.cu
@@ -14,7 +14,7 @@
 // globaler Speicher für alle HeftyHashes aller Threads
 __constant__ uint32_t pTarget[8]; // Single GPU
 uint32_t *d_outputHashes[MAX_GPUS];
-extern uint32_t *d_resultNonce[MAX_GPUS];
+static uint32_t *d_resultNonce[MAX_GPUS];
 
 __constant__ uint32_t myriadgroestl_gpu_msg[32];
 
@@ -299,11 +299,10 @@ __global__ void
 #endif
 }
 
-// Setup-Funktionen
-__host__ void myriadgroestl_cpu_init(int thr_id, uint32_t threads)
+// Setup Function
+__host__
+void myriadgroestl_cpu_init(int thr_id, uint32_t threads)
 {
-    cudaSetDevice(device_map[thr_id]);
-
     cudaMemcpyToSymbol( myr_sha256_gpu_hashTable,
                         myr_sha256_cpu_hashTable,
                         sizeof(uint32_t) * 8 );
@@ -328,7 +327,8 @@ __host__ void myriadgroestl_cpu_init(int thr_id, uint32_t threads)
     cudaMalloc(&d_outputHashes[thr_id], 16*sizeof(uint32_t)*threads);
 }
 
-__host__ void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
+__host__
+void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn)
 {
     // Nachricht expandieren und setzen
     uint32_t msgBlock[32];
@@ -355,7 +355,8 @@ __host__ void myriadgroestl_cpu_setBlock(int thr_id, void *data, void *pTargetIn
                         sizeof(uint32_t) * 8 );
 }
 
-__host__ void myriadgroestl_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
+__host__
+void myriadgroestl_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce, void *outputHashes, uint32_t *nounce)
 {
     uint32_t threadsperblock = 256;
 
diff --git a/fuguecoin.cpp b/fuguecoin.cpp
index fdd28c7..d2f8dd9 100644
--- a/fuguecoin.cpp
+++ b/fuguecoin.cpp
@@ -1,5 +1,6 @@
 #include <string.h>
 #include <stdint.h>
+#include <cuda_runtime.h>
 
 #include "uint256.h"
 #include "sph/sph_fugue.h"
@@ -22,7 +23,7 @@ extern "C" void my_fugue256_addbits_and_close(void *cc, unsigned ub, unsigned n,
 
 static bool init[MAX_GPUS] = { 0 };
 
-extern "C" int scanhash_fugue256(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
+int scanhash_fugue256(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
 	uint32_t max_nonce, unsigned long *hashes_done)
 {
 	uint32_t start_nonce = pdata[19]++;
@@ -36,6 +37,8 @@ extern "C" int scanhash_fugue256(int thr_id, uint32_t *pdata, const uint32_t *pt
 	// init
 	if(!init[thr_id])
 	{
+		cudaSetDevice(device_map[thr_id]);
+
 		fugue256_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
 	}
@@ -50,10 +53,10 @@ extern "C" int scanhash_fugue256(int thr_id, uint32_t *pdata, const uint32_t *pt
 
 	do {
 		// GPU
-		uint32_t foundNounce = 0xFFFFFFFF;
+		uint32_t foundNounce = UINT32_MAX;
 		fugue256_cpu_hash(thr_id, throughput, pdata[19], NULL, &foundNounce);
 
-		if(foundNounce < 0xffffffff)
+		if (foundNounce < UINT32_MAX)
 		{
 			uint32_t hash[8];
 			const uint32_t Htarg = ptarget[7];
diff --git a/groestlcoin.cpp b/groestlcoin.cpp
index db74549..b501284 100644
--- a/groestlcoin.cpp
+++ b/groestlcoin.cpp
@@ -1,5 +1,6 @@
 #include <string.h>
 #include <stdint.h>
+#include <cuda_runtime.h>
 #include <openssl/sha.h>
 
 #include "uint256.h"
@@ -36,11 +37,11 @@ static bool init[MAX_GPUS] = { 0 };
 extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t *ptarget,
     uint32_t max_nonce, unsigned long *hashes_done)
 {
-    uint32_t start_nonce = pdata[19]++;
+    uint32_t start_nonce = pdata[19];
     uint32_t throughput = device_intensity(thr_id, __func__, 1 << 19); // 256*256*8
     throughput = min(throughput, max_nonce - start_nonce);
 
-    uint32_t *outputHash = (uint32_t*)malloc(throughput * 16 * sizeof(uint32_t));
+    uint32_t *outputHash = (uint32_t*)malloc(throughput * 64);
 
     if (opt_benchmark)
         ((uint32_t*)ptarget)[7] = 0x000000ff;
@@ -48,6 +49,7 @@ extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t
     // init
     if(!init[thr_id])
     {
+        cudaSetDevice(device_map[thr_id]);
         groestlcoin_cpu_init(thr_id, throughput);
         init[thr_id] = true;
     }
@@ -62,27 +64,25 @@ extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t
 
     do {
         // GPU
-        uint32_t foundNounce = 0xFFFFFFFF;
-        const uint32_t Htarg = ptarget[7];
+        uint32_t foundNounce = UINT32_MAX;
+
+        *hashes_done = pdata[19] - start_nonce + throughput;
 
         groestlcoin_cpu_hash(thr_id, throughput, pdata[19], outputHash, &foundNounce);
 
-        if(foundNounce < 0xffffffff)
+        if(foundNounce < UINT32_MAX)
         {
-            uint32_t tmpHash[8];
+            uint32_t _ALIGN(64) tmpHash[8];
             endiandata[19] = SWAP32(foundNounce);
             groestlhash(tmpHash, endiandata);
 
-            if (tmpHash[7] <= Htarg && fulltest(tmpHash, ptarget)) {
+            if (tmpHash[7] <= ptarget[7] && fulltest(tmpHash, ptarget)) {
                 pdata[19] = foundNounce;
-                *hashes_done = foundNounce - start_nonce + 1;
                 free(outputHash);
                 return true;
             } else {
                 applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", device_map[thr_id], foundNounce);
             }
-
-            foundNounce = 0xffffffff;
         }
 
         if (pdata[19] + throughput < pdata[19])
@@ -91,7 +91,6 @@ extern "C" int scanhash_groestlcoin(int thr_id, uint32_t *pdata, const uint32_t
 
     } while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
 
-    *hashes_done = pdata[19] - start_nonce + 1;
     free(outputHash);
     return 0;
 }
diff --git a/lyra2/lyra2RE.cu b/lyra2/lyra2RE.cu
index bbc5131..e5fe1fe 100644
--- a/lyra2/lyra2RE.cu
+++ b/lyra2/lyra2RE.cu
@@ -79,7 +79,7 @@ extern "C" int scanhash_lyra2(int thr_id, uint32_t *pdata,
 		skein256_cpu_init(thr_id, throughput);
 		groestl256_cpu_init(thr_id, throughput);
 
-		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput));
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], throughput * 64));
 
 		init[thr_id] = true;
 	}
@@ -95,23 +95,22 @@ extern "C" int scanhash_lyra2(int thr_id, uint32_t *pdata,
 		int order = 0;
 		uint32_t foundNonce;
 
+		*hashes_done = pdata[19] - first_nonce + throughput;
+
 		blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 
-		*hashes_done = pdata[19] - first_nonce + throughput;
-
 		foundNonce = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		if (foundNonce != UINT32_MAX)
 		{
 			uint32_t _ALIGN(64) vhash64[8];
-			const uint32_t Htarg = ptarget[7];
 
 			be32enc(&endiandata[19], foundNonce);
 			lyra2_hash(vhash64, endiandata);
 
-			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
+			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
 				int res = 1;
 				uint32_t secNonce = groestl256_getSecNonce(thr_id, 1);
 				if (secNonce != UINT32_MAX)
@@ -136,6 +135,5 @@ extern "C" int scanhash_lyra2(int thr_id, uint32_t *pdata,
 
 	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
 
-	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
 }
diff --git a/myriadgroestl.cpp b/myriadgroestl.cpp
index 41b3d83..a3e0340 100644
--- a/myriadgroestl.cpp
+++ b/myriadgroestl.cpp
@@ -1,5 +1,6 @@
 #include <string.h>
 #include <stdint.h>
+#include <cuda_runtime.h>
 #include <openssl/sha.h>
 
 #include "uint256.h"
@@ -41,7 +42,7 @@ extern "C" int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptar
 	uint32_t throughput = device_intensity(thr_id, __func__, 1 << 17);
 	throughput = min(throughput, max_nonce - start_nonce);
 
-	uint32_t *outputHash = (uint32_t*)malloc(throughput * 16 * sizeof(uint32_t));
+	uint32_t *outputHash = (uint32_t*)malloc(throughput * 64);
 
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x0000ff;
@@ -49,14 +50,13 @@ extern "C" int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptar
 	// init
 	if(!init[thr_id])
 	{
-#if BIG_DEBUG
-#else
+		cudaSetDevice(device_map[thr_id]);
+
 		myriadgroestl_cpu_init(thr_id, throughput);
-#endif
 		init[thr_id] = true;
 	}
 
-	uint32_t endiandata[32];
+	uint32_t _ALIGN(64) endiandata[32];
 	for (int kk=0; kk < 32; kk++)
 		be32enc(&endiandata[kk], pdata[kk]);
 
@@ -66,26 +66,23 @@ extern "C" int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptar
 	do {
 		// GPU
 		uint32_t foundNounce = UINT32_MAX;
-		const uint32_t Htarg = ptarget[7];
+
+		*hashes_done = pdata[19] - start_nonce + throughput;
 
 		myriadgroestl_cpu_hash(thr_id, throughput, pdata[19], outputHash, &foundNounce);
 
 		if (foundNounce < UINT32_MAX)
 		{
-			uint32_t tmpHash[8];
+			uint32_t _ALIGN(64) tmpHash[8];
 			endiandata[19] = SWAP32(foundNounce);
 			myriadhash(tmpHash, endiandata);
-			if (tmpHash[7] <= Htarg &&
-					fulltest(tmpHash, ptarget)) {
-						pdata[19] = foundNounce;
-						*hashes_done = foundNounce - start_nonce + 1;
-						free(outputHash);
+			if (tmpHash[7] <= ptarget[7] && fulltest(tmpHash, ptarget)) {
+				pdata[19] = foundNounce;
+				free(outputHash);
 				return true;
 			} else {
 				applog(LOG_WARNING, "GPU #%d: result for nonce %08x does not validate on CPU!", device_map[thr_id], foundNounce);
 			}
-
-			foundNounce = 0xffffffff;
 		}
 
 		if ((uint64_t) pdata[19] + throughput > (uint64_t) max_nonce) {
@@ -96,7 +93,6 @@ extern "C" int scanhash_myriad(int thr_id, uint32_t *pdata, const uint32_t *ptar
 
 	} while (!work_restart[thr_id].restart);
 
-	*hashes_done = pdata[19] - start_nonce + 1;
 	free(outputHash);
 	return 0;
 }