From 23f0cee61fd976285524e6d20a096cdd4d3b8240 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Thu, 11 Sep 2014 01:59:43 +0200
Subject: [PATCH] Add cuda error checks on qubit algos

And rename doom to luffa, like djm34
---
 README.txt              |  2 +-
 cpu-miner.c             |  6 ++-
 qubit/doom.cu           |  9 +++-
 qubit/qubit_luffa512.cu | 96 +++++++++++++++++++++++++----------------
 util.c                  |  8 ++--
 5 files changed, 76 insertions(+), 45 deletions(-)

diff --git a/README.txt b/README.txt
index 61ae2c6..cee450a 100644
--- a/README.txt
+++ b/README.txt
@@ -58,12 +58,12 @@ its command line interface and options.
                           heavy       use to mine Heavycoin
                           mjollnir    use to mine Mjollnircoin
                           deep        use to mine Deepcoin
-                          doom        use to mine Doomcoin
                           fugue256    use to mine Fuguecoin
                           groestl     use to mine Groestlcoin
                           dmd-gr      use to mine Diamond-Groestl
                           myr-gr      use to mine Myriad-Groest
                           jackpot     use to mine Jackpotcoin
+                          luffa       use to mine Doomcoin
                           quark       use to mine Quarkcoin
                           qubit       use to mine Qubit Algo
                           anime       use to mine Animecoin
diff --git a/cpu-miner.c b/cpu-miner.c
index 287fdc9..830b68a 100644
--- a/cpu-miner.c
+++ b/cpu-miner.c
@@ -136,6 +136,7 @@ typedef enum {
 	ALGO_GROESTL,
 	ALGO_HEAVY,		/* Heavycoin hash */
 	ALGO_JACKPOT,
+	ALGO_LUFFA_DOOM,
 	ALGO_MJOLLNIR,		/* Mjollnir hash */
 	ALGO_MYR_GR,
 	ALGO_NIST5,
@@ -156,12 +157,13 @@ static const char *algo_names[] = {
 	"blake",
 	"blakecoin",
 	"deep",
-	"doom",
+	"doom", /* is luffa */
 	"fresh",
 	"fugue256",
 	"groestl",
 	"heavy",
 	"jackpot",
+	"luffa",
 	"mjollnir",
 	"myr-gr",
 	"nist5",
@@ -242,12 +244,12 @@ Options:\n\
                         blake     Blake 256 (like NEOS blake)\n\
                         blakecoin Old Blake 256 (8 rounds)\n\
                         deep      Deepcoin hash\n\
-                        doom      Doomcoin hash\n\
                         fresh     Freshcoin hash (shavite 80)\n\
                         fugue256  Fuguecoin hash\n\
                         groestl   Groestlcoin hash\n\
                         heavy     Heavycoin hash\n\
                         jackpot   Jackpot hash\n\
+                        luffa     Doomcoin hash\n\
                         mjollnir  Mjollnircoin hash\n\
                         myr-gr    Myriad-Groestl hash\n\
                         nist5     NIST5 (TalkCoin) hash\n\
diff --git a/qubit/doom.cu b/qubit/doom.cu
index 30719c7..8f93ac4 100644
--- a/qubit/doom.cu
+++ b/qubit/doom.cu
@@ -51,7 +51,7 @@ extern "C" int scanhash_doom(int thr_id, uint32_t *pdata,
 	{
 		cudaSetDevice(device_map[thr_id]);
 
-		cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput);
+		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput));
 
 		qubit_luffa512_cpu_init(thr_id, throughput);
 
@@ -86,7 +86,12 @@ extern "C" int scanhash_doom(int thr_id, uint32_t *pdata,
 
 		pdata[19] += throughput;
 
-	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+		if ((uint64_t) pdata[19] + throughput > max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
+
+	} while (!work_restart[thr_id].restart);
 
 	*hashes_done = pdata[19] - first_nonce + 1;
 	return 0;
diff --git a/qubit/qubit_luffa512.cu b/qubit/qubit_luffa512.cu
index 1d827d5..23d95bb 100644
--- a/qubit/qubit_luffa512.cu
+++ b/qubit/qubit_luffa512.cu
@@ -23,13 +23,20 @@
 
 #include "cuda_helper.h"
 
+#define MAXU 0xffffffffU
+
 typedef unsigned char BitSequence;
 
 __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding)
-__constant__ uint32_t pTarget[8];
+__constant__ uint32_t c_Target[8];
+
+static uint32_t *h_resNounce[8];
+static uint32_t *d_resNounce[8];
 
-uint32_t *d_lnounce[8];
-uint32_t *d_LNonce[8];
+#define NBN 1 /* max results, could be 2, see blake32.cu */
+#if NBN > 1
+static uint32_t extra_results[2] = { MAXU, MAXU };
+#endif
 
 typedef struct {
 	uint32_t buffer[8]; /* Buffer to be hashed */
@@ -380,64 +387,75 @@ void qubit_luffa512_gpu_finalhash_80(int threads, uint32_t startNounce, void *ou
 	{
 		uint32_t nounce = startNounce + thread;
 		union {
-		uint64_t buf64[16];
-		uint32_t buf32[32];
+			uint64_t buf64[16];
+			uint32_t buf32[32];
 		} buff;
 		uint32_t Hash[16];
 
-#pragma unroll 16
+		#pragma unroll 16
 		for (int i=0; i < 16; ++i) buff.buf64[i] = c_PaddedMessage80[i];
 
-		// die Nounce durch die thread-spezifische ersetzen
+		// Tested nonce
 		buff.buf64[9] = REPLACE_HIWORD(buff.buf64[9], cuda_swab32(nounce));
 
-
 		hashState state;
-#pragma unroll 40
+		#pragma unroll 40
 		for(int i=0;i<40;i++) state.chainv[i] = c_IV[i];
-#pragma unroll 8
+
+		#pragma unroll 8
 		for(int i=0;i<8;i++) state.buffer[i] = 0;
+
 		Update512(&state, (BitSequence*)buff.buf32);
 		finalization512(&state, Hash);
 
-		bool rc = true;
+		/* dont ask me why not a simple if (Hash[i] > c_Target[i]) return;
+		 * we lose 20% in perfs without the position test */
 		int position = -1;
-#pragma unroll 8
+		#pragma unroll 8
 		for (int i = 7; i >= 0; i--) {
-			if (Hash[i] > pTarget[i]) {
-				if(position < i) {
-					position = i;
-					rc = false;
+			if (Hash[i] > c_Target[i]) {
+				if (position < i) {
+					return;
 				}
-
 			}
-			if (Hash[i] < pTarget[i]) {
-				if(position < i) {
+			if (Hash[i] < c_Target[i]) {
+				if (position < i) {
 					position = i;
-					rc = true;
+					//break; /* impact perfs, unroll ? */
 				}
 			}
 		}
 
-		if(rc && resNounce[0] > nounce)
+#if NBN == 1
+		if (resNounce[0] > nounce) {
+			resNounce[0] = nounce;
+		}
+#else
+		/* keep the smallest nounce, + extra one if found */
+		if (resNounce[0] > nounce) {
+			resNounce[1] = resNounce[0];
 			resNounce[0] = nounce;
+		} else {
+			resNounce[1] = nounce;
+		}
+#endif
 	}
 }
 
 __host__
 void qubit_luffa512_cpu_init(int thr_id, int threads)
 {
-	cudaMemcpyToSymbol( c_IV, h2_IV, sizeof(h2_IV), 0, cudaMemcpyHostToDevice );
-	cudaMemcpyToSymbol( c_CNS, h2_CNS, sizeof(h2_CNS), 0, cudaMemcpyHostToDevice );
-	cudaMalloc(&d_LNonce[thr_id], sizeof(uint32_t));
-	cudaMallocHost(&d_lnounce[thr_id], 1*sizeof(uint32_t));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_IV, h2_IV, sizeof(h2_IV), 0, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_CNS, h2_CNS, sizeof(h2_CNS), 0, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaMalloc(&d_resNounce[thr_id], NBN * sizeof(uint32_t)));
+	CUDA_SAFE_CALL(cudaMallocHost(&h_resNounce[thr_id], NBN * sizeof(uint32_t)));
 }
 
 __host__
 uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, int threads, uint32_t startNounce, uint32_t *d_outputHash,int order)
 {
-	uint32_t result = 0xffffffff;
-	cudaMemset(d_LNonce[thr_id], 0xffffffff, sizeof(uint32_t));
+	uint32_t result = MAXU;
+	cudaMemset(d_resNounce[thr_id], 0xff, NBN * sizeof(uint32_t));
 	const int threadsperblock = 256;
 
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
@@ -445,11 +463,15 @@ uint32_t qubit_luffa512_cpu_finalhash_80(int thr_id, int threads, uint32_t start
 
 	size_t shared_size = 0;
 
-	qubit_luffa512_gpu_finalhash_80<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash, d_LNonce[thr_id]);
-	MyStreamSynchronize(NULL, order, thr_id);
-	cudaMemcpy(d_lnounce[thr_id], d_LNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
-	//cudaThreadSynchronize();
-	result = *d_lnounce[thr_id];
+	qubit_luffa512_gpu_finalhash_80 <<<grid, block, shared_size>>> (threads, startNounce, d_outputHash, d_resNounce[thr_id]);
+	cudaDeviceSynchronize();
+	if (cudaSuccess == cudaMemcpy(h_resNounce[thr_id], d_resNounce[thr_id], NBN * sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
+		//cudaThreadSynchronize();
+		result = h_resNounce[thr_id][0];
+#if NBN > 1
+		extra_results[0] = h_resNounce[thr_id][1];
+#endif
+	}
 	return result;
 }
 
@@ -462,7 +484,7 @@ void qubit_luffa512_cpu_hash_80(int thr_id, int threads, uint32_t startNounce, u
 	dim3 block(threadsperblock);
 	size_t shared_size = 0;
 
-	qubit_luffa512_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash);
+	qubit_luffa512_gpu_hash_80 <<<grid, block, shared_size>>> (threads, startNounce, d_outputHash);
 	MyStreamSynchronize(NULL, order, thr_id);
 }
 
@@ -470,6 +492,7 @@ __host__
 void qubit_luffa512_cpu_setBlock_80(void *pdata)
 {
 	unsigned char PaddedMessage[128];
+
 	memcpy(PaddedMessage, pdata, 80);
 	memset(PaddedMessage+80, 0, 48);
 	PaddedMessage[80] = 0x80;
@@ -477,20 +500,21 @@ void qubit_luffa512_cpu_setBlock_80(void *pdata)
 	PaddedMessage[126] = 0x02;
 	PaddedMessage[127] = 0x80;
 
-	cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
 }
 
 __host__
 void qubit_luffa512_cpufinal_setBlock_80(void *pdata, const void *ptarget)
 {
 	unsigned char PaddedMessage[128];
+
 	memcpy(PaddedMessage, pdata, 80);
 	memset(PaddedMessage+80, 0, 48);
 	PaddedMessage[80] = 0x80;
 	PaddedMessage[111] = 1;
 	PaddedMessage[126] = 0x02;
 	PaddedMessage[127] = 0x80;
-	cudaMemcpyToSymbol( pTarget, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice);
 
-	cudaMemcpyToSymbol( c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_Target, ptarget, 8*sizeof(uint32_t), 0, cudaMemcpyHostToDevice));
+	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 16*sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
 }
\ No newline at end of file
diff --git a/util.c b/util.c
index 374afca..1c8cdb8 100644
--- a/util.c
+++ b/util.c
@@ -1474,10 +1474,6 @@ void print_hash_tests(void)
 	deephash(&hash[0], &buf[0]);
 	printpfx("deep", hash);
 
-	memset(hash, 0, sizeof hash);
-	doomhash(&hash[0], &buf[0]);
-	printpfx("doom", hash);
-
 	memset(hash, 0, sizeof hash);
 	fresh_hash(&hash[0], &buf[0]);
 	printpfx("fresh", hash);
@@ -1498,6 +1494,10 @@ void print_hash_tests(void)
 	jackpothash(&hash[0], &buf[0]);
 	printpfx("jackpot", hash);
 
+	memset(hash, 0, sizeof hash);
+	doomhash(&hash[0], &buf[0]);
+	printpfx("luffa", hash);
+
 	memset(hash, 0, sizeof hash);
 	myriadhash(&hash[0], &buf[0]);
 	printpfx("myriad", hash);