From 34fd40844094e2e333a83152ab39a4a001b3ad30 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Sun, 10 May 2015 03:18:14 +0200
Subject: [PATCH] lyra2: get a second nonce per gpu scan

---
 Algo256/cuda_groestl256.cu | 34 ++++++++++++++++++++++------------
 lyra2/lyra2RE.cu           | 26 +++++++++++++++++++++-----
 2 files changed, 43 insertions(+), 17 deletions(-)

diff --git a/Algo256/cuda_groestl256.cu b/Algo256/cuda_groestl256.cu
index 888d29c..865767e 100644
--- a/Algo256/cuda_groestl256.cu
+++ b/Algo256/cuda_groestl256.cu
@@ -5,8 +5,8 @@
 
 #include "cuda_helper.h"
 
-uint32_t *d_gnounce[MAX_GPUS];
-uint32_t *d_GNonce[MAX_GPUS];
+static uint32_t *h_GNonces[MAX_GPUS];
+static uint32_t *d_GNonces[MAX_GPUS];
 
 __constant__ uint32_t pTarget[8];
 
@@ -175,7 +175,7 @@ void groestl256_perm_Q(uint32_t thread, uint32_t *a, char *mixtabs)
 }
 
 __global__ __launch_bounds__(256,1)
-void groestl256_gpu_hash32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash, uint32_t *nonceVector)
+void groestl256_gpu_hash32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash, uint32_t *resNonces)
 {
 #if USE_SHARED
 	extern __shared__ char mixtabs[];
@@ -243,7 +243,8 @@ void groestl256_gpu_hash32(uint32_t threads, uint32_t startNounce, uint64_t *out
 
 		uint32_t nonce = startNounce + thread;
 		if (state[15] <= pTarget[7]) {
-			nonceVector[0] = nonce;
+			atomicMin(&resNonces[1], resNonces[0]);
+			atomicMin(&resNonces[0], nonce);
 		}
 	}
 }
@@ -272,15 +273,15 @@ void groestl256_cpu_init(int thr_id, uint32_t threads)
 	texDef(t3up2, d_T3up, T3up_cpu, sizeof(uint32_t) * 256);
 	texDef(t3dn2, d_T3dn, T3dn_cpu, sizeof(uint32_t) * 256);
 
-	cudaMalloc(&d_GNonce[thr_id], sizeof(uint32_t));
-	cudaMallocHost(&d_gnounce[thr_id], 1*sizeof(uint32_t));
+	cudaMalloc(&d_GNonces[thr_id], 2*sizeof(uint32_t));
+	cudaMallocHost(&h_GNonces[thr_id], 2*sizeof(uint32_t));
 }
 
 __host__
 uint32_t groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order)
 {
 	uint32_t result = 0xffffffff;
-	cudaMemset(d_GNonce[thr_id], 0xff, sizeof(uint32_t));
+	cudaMemset(d_GNonces[thr_id], 0xff, sizeof(uint32_t));
 	const uint32_t threadsperblock = 256;
 
 	// berechne wie viele Thread Blocks wir brauchen
@@ -292,18 +293,27 @@ uint32_t groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNoun
 #else
 	size_t shared_size = 0;
 #endif
-	groestl256_gpu_hash32<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash, d_GNonce[thr_id]);
+	groestl256_gpu_hash32<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash, d_GNonces[thr_id]);
 
 	MyStreamSynchronize(NULL, order, thr_id);
-	cudaMemcpy(d_gnounce[thr_id], d_GNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
-	cudaThreadSynchronize();
-	result = *d_gnounce[thr_id];
+
+	// get first found nonce
+	cudaMemcpy(h_GNonces[thr_id], d_GNonces[thr_id], 1*sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	result = *h_GNonces[thr_id];
 
 	return result;
 }
 
+__host__
+uint32_t groestl256_getSecNonce(int thr_id, int num)
+{
+	uint32_t results[2];
+	cudaMemcpy(results, d_GNonces[thr_id], sizeof(results), cudaMemcpyDeviceToHost);
+	return results[num];
+}
+
 __host__
 void groestl256_setTarget(const void *pTargetIn)
 {
-	cudaMemcpyToSymbol(pTarget, pTargetIn, 8 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(pTarget, pTargetIn, 32, 0, cudaMemcpyHostToDevice);
 }
diff --git a/lyra2/lyra2RE.cu b/lyra2/lyra2RE.cu
index dbeab7a..bbc5131 100644
--- a/lyra2/lyra2RE.cu
+++ b/lyra2/lyra2RE.cu
@@ -9,7 +9,7 @@ extern "C" {
 #include "miner.h"
 #include "cuda_helper.h"
 
-static _ALIGN(64) uint64_t *d_hash[MAX_GPUS];
+static uint64_t* d_hash[MAX_GPUS];
 
 extern void blake256_cpu_init(int thr_id, uint32_t threads);
 extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
@@ -21,9 +21,10 @@ extern void skein256_cpu_init(int thr_id, uint32_t threads);
 
 extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
 
+extern void groestl256_cpu_init(int thr_id, uint32_t threads);
 extern void groestl256_setTarget(const void *ptarget);
 extern uint32_t groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order);
-extern void groestl256_cpu_init(int thr_id, uint32_t threads);
+extern uint32_t groestl256_getSecNonce(int thr_id, int num);
 
 extern "C" void lyra2_hash(void *state, const void *input)
 {
@@ -99,18 +100,33 @@ extern "C" int scanhash_lyra2(int thr_id, uint32_t *pdata,
 		lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 
+		*hashes_done = pdata[19] - first_nonce + throughput;
+
 		foundNonce = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 		if (foundNonce != UINT32_MAX)
 		{
+			uint32_t _ALIGN(64) vhash64[8];
 			const uint32_t Htarg = ptarget[7];
-			uint32_t vhash64[8];
+
 			be32enc(&endiandata[19], foundNonce);
 			lyra2_hash(vhash64, endiandata);
 
 			if (vhash64[7] <= Htarg && fulltest(vhash64, ptarget)) {
-				*hashes_done = pdata[19] - first_nonce + throughput;
+				int res = 1;
+				uint32_t secNonce = groestl256_getSecNonce(thr_id, 1);
+				if (secNonce != UINT32_MAX)
+				{
+					be32enc(&endiandata[19], secNonce);
+					lyra2_hash(vhash64, endiandata);
+					if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
+						if (opt_debug)
+							applog(LOG_BLUE, "GPU #%d: found second nonce %08x", device_map[thr_id], secNonce);
+						pdata[21] = secNonce;
+						res++;
+					}
+				}
 				pdata[19] = foundNonce;
-				return 1;
+				return res;
 			} else {
 				applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce);
 			}