From dcdafd8732a16f913fbf955e76ce8c2efee1d811 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot
Date: Fri, 3 Apr 2015 03:00:44 +0200
Subject: [PATCH] zr5: enhance pok hash system

And store some vars on the device to reduce proc params
---
 JHA/cuda_jha_keccak512.cu |   9 +-
 skein.cu                  |  11 ++-
 zr5.cu                    | 177 +++++++++++++++++++++++++++-----------
 3 files changed, 139 insertions(+), 58 deletions(-)

diff --git a/JHA/cuda_jha_keccak512.cu b/JHA/cuda_jha_keccak512.cu
index 2fc26b2..83d5757 100644
--- a/JHA/cuda_jha_keccak512.cu
+++ b/JHA/cuda_jha_keccak512.cu
@@ -599,21 +599,18 @@ void zr5_keccak512_cpu_hash(int thr_id, uint32_t threads, uint32_t startNounce,
 
 /* required for the second hash part of zr5 */
 __global__
-void zr5_keccak512_gpu_hash_pok(uint32_t threads, uint32_t startNounce, uint32_t *g_hash, uint16_t *d_pokh, uint32_t version)
+void zr5_keccak512_gpu_hash_pok(uint32_t threads, uint32_t startNounce, uint32_t *g_hash, uint16_t *d_poks, uint32_t version)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint32_t nounce = startNounce + thread;
-		//uint32_t hashPosition = thread * 16;
-		uint32_t *prevHash = &g_hash[thread * 16]; // thread * 64 / sizeof(uint32_t)
 		uint32_t message[18]; /* 72 bytes */
 
 		// pok - hash[0] from prev hash
-		message[0] = version | (prevHash[0] & POK_DATA_MASK);
-		// save pok
-		d_pokh[thread] = (uint16_t) (message[0] / 0x10000);
+		message[0] = version | (0x10000UL * d_poks[thread]);
+
 		#pragma unroll
 		for (int i=1; i<18; i++) {
 			message[i]=d_OriginalData[i];
 		}
diff --git a/skein.cu b/skein.cu
index ad8e4ca..6018265 100644
--- a/skein.cu
+++ b/skein.cu
@@ -467,11 +467,14 @@ extern "C" int scanhash_skeincoin(int thr_id, uint32_t *pdata,
 				return res;
 			} else {
-				applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", thr_id, foundNonce);
-				pdata[19]++;
+				applog(LOG_INFO, "GPU #%d: result for nonce $%08X does not validate on CPU!", device_map[thr_id], foundNonce);
+
+				// reinit card
+				cudaDeviceReset();
+				init[thr_id] = false;
 			}
-		} else
-			pdata[19] += throughput;
+		}
+		pdata[19] += throughput;
 
 	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
 
diff --git a/zr5.cu b/zr5.cu
index 91532d7..af37d70 100644
--- a/zr5.cu
+++ b/zr5.cu
@@ -23,15 +23,17 @@ extern "C" {
 #define POK_DATA_MASK 0xFFFF0000
 
 static uint32_t* d_hash[MAX_GPUS];
-static uint16_t* d_pokh[MAX_GPUS];
-static uint16_t* h_poks[MAX_GPUS];
+static uint16_t* d_poks[MAX_GPUS];
+
+static uint32_t**d_buffers[MAX_GPUS];
+static uint8_t* d_permut[MAX_GPUS];
 
 static uint32_t* d_blake[MAX_GPUS];
 static uint32_t* d_groes[MAX_GPUS];
 static uint32_t* d_jh512[MAX_GPUS];
 static uint32_t* d_skein[MAX_GPUS];
 
-__constant__ uint8_t d_permut[24][4];
+__constant__ uint8_t c_permut[24][4];
 static const uint8_t permut[24][4] = {
 	{0, 1, 2, 3},
 	{0, 1, 3, 2},
@@ -123,28 +125,70 @@ extern "C" void zr5hash_pok(void *output, uint32_t *pdata)
 	memcpy(output, hash, 32);
 }
 
-__global__
-void zr5_copy_round_data_gpu(uint32_t threads, uint32_t *d_hash, uint32_t* d_blake, uint32_t* d_groes, uint32_t* d_jh512, uint32_t* d_skein, int rnd)
+// ------------------------------------------------------------------------------------------------
+
+__global__ __launch_bounds__(128, 8)
+void zr5_init_vars_gpu(uint32_t threads, uint32_t* d_hash, uint8_t* d_permut, uint32_t** d_buffers,
+	uint32_t* d_blake, uint32_t* d_groes, uint32_t* d_jh512, uint32_t* d_skein)
 {
-	// copy 64 bytes hash in the right algo buffer
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		const uint64_t offset = thread * 64 / 4;
+		uint32_t offset = thread * 16U; // 64U / sizeof(uint32_t);
 		uint32_t *phash = &d_hash[offset];
-		// algos hash order
-		uint32_t norder = phash[0] % ARRAY_SIZE(permut);
-		uint32_t algo = d_permut[norder][rnd];
-		uint32_t* buffers[4] = { d_blake, d_groes, d_jh512, d_skein };
-
-		if (rnd > 0) {
-			int algosrc = d_permut[norder][rnd - 1];
-			phash = buffers[algosrc] + offset;
-		}
-		// uint4 = 4x4 uint32_t = 16 bytes
+
+		// store the algos order for other procs
+		const uint8_t norder = (phash[0] % ARRAY_SIZE(permut));
+		const uint8_t algo = c_permut[norder][0];
+		d_permut[thread] = norder;
+
+		// init array for other procs
+		d_buffers[0] = d_blake;
+		d_buffers[1] = d_groes;
+		d_buffers[2] = d_jh512;
+		d_buffers[3] = d_skein;
+
+		// Copy From d_hash to the first algo buffer
+		// uint4 = 4x uint32_t = 16 bytes
 		uint4 *psrc = (uint4*) phash;
-		uint4 *pdst = (uint4*) (buffers[algo] + offset);
+		uint4 *pdst = (uint4*) (d_buffers[algo] + offset);
+		pdst[0] = psrc[0];
+		pdst[1] = psrc[1];
+		pdst[2] = psrc[2];
+		pdst[3] = psrc[3];
+	}
+}
+
+__host__
+void zr5_init_vars(int thr_id, uint32_t threads)
+{
+	const uint32_t threadsperblock = 128;
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+
+	zr5_init_vars_gpu <<<grid, block>>> (
+		threads, d_hash[thr_id], d_permut[thr_id], d_buffers[thr_id],
+		d_blake[thr_id], d_groes[thr_id], d_jh512[thr_id], d_skein[thr_id]
+	);
+}
+
+
+__global__ __launch_bounds__(128, 8)
+void zr5_move_data_to_hash_gpu(const uint32_t threads, const int rnd, uint32_t** const d_buffers, uint8_t *d_permut, uint32_t *d_hash)
+{
+	// copy 64 bytes hash from/to the right algo buffers
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		const uint8_t norder = d_permut[thread];
+		const uint8_t algodst = c_permut[norder][rnd];
+		const uint8_t algosrc = c_permut[norder][rnd-1];
+
+		const uint32_t offset = thread * (64 / 4);
+
+		// uint4 = 4x uint32_t = 16 bytes
+		uint4 *psrc = (uint4*) (d_buffers[algosrc] + offset);
+		uint4 *pdst = (uint4*) (d_buffers[algodst] + offset);
 		pdst[0] = psrc[0];
 		pdst[1] = psrc[1];
 		pdst[2] = psrc[2];
 		pdst[3] = psrc[3];
@@ -159,31 +203,54 @@ void zr5_move_data_to_hash(int thr_id, uint32_t threads, int rnd)
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 
-	zr5_copy_round_data_gpu <<<grid, block>>> (threads, d_hash[thr_id], d_blake[thr_id], d_groes[thr_id], d_jh512[thr_id], d_skein[thr_id], rnd);
+	zr5_move_data_to_hash_gpu <<<grid, block>>> (threads, rnd, d_buffers[thr_id], d_permut[thr_id], d_hash[thr_id]);
 }
 
-__global__
-void zr5_final_round_data_gpu(uint32_t threads, uint32_t* d_blake, uint32_t* d_groes, uint32_t* d_jh512, uint32_t* d_skein, uint32_t *d_hash, uint16_t *d_pokh)
+
+__global__ __launch_bounds__(128, 8)
+void zr5_get_poks_gpu(uint32_t threads, uint32_t** const d_buffers, uint8_t* const d_permut, uint32_t *d_hash, uint16_t *d_poks)
 {
-	// after the 4 algos rounds, copy back hash to d_hash
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
-		const uint64_t offset = thread * 16; // 64 / 4;
-		uint32_t *phash = &d_hash[offset];
-		uint16_t norder = phash[0] % ARRAY_SIZE(permut);
-		uint16_t algosrc = d_permut[norder][3];
+		const uint8_t norder = d_permut[thread];
+		const uint8_t algosrc = c_permut[norder][3];
 
-		uint32_t* buffers[4] = { d_blake, d_groes, d_jh512, d_skein };
+		// copy only pok
+		const uint32_t offset = thread * 16U; // 64 / 4;
+		uint16_t* hash0 = (uint16_t*) (d_buffers[algosrc] + offset);
+		d_poks[thread] = hash0[1];
+	}
+}
 
-		// copy only hash[0] + hash[6..7]
-		uint2 *psrc = (uint2*) (buffers[algosrc] + offset);
-		uint2 *pdst = (uint2*) phash;
+__host__
+void zr5_get_poks(int thr_id, uint32_t threads, uint16_t* d_poks)
+{
+	const uint32_t threadsperblock = 128;
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+
+	zr5_get_poks_gpu <<<grid, block>>> (threads, d_buffers[thr_id], d_permut[thr_id], d_hash[thr_id], d_poks);
+}
 
-		pdst[0].x = psrc[0].x;
-		pdst[3] = psrc[3];
-		//phash[7] = *(buffers[algosrc] + offset + 7);
+__global__ __launch_bounds__(128, 8)
+void zr5_final_round_data_gpu(uint32_t threads, uint32_t** const d_buffers, uint8_t* const d_permut, uint32_t *d_hash, uint16_t *d_poks)
+{
+	// after the 4 algos rounds, copy back hash to d_hash
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		const uint8_t norder = d_permut[thread];
+		const uint8_t algosrc = c_permut[norder][3];
+		const uint32_t offset = thread * 16U; // 64 / 4;
+
+		// copy only hash[4..7]
+		uint2 *psrc = (uint2*) (d_buffers[algosrc] + offset);
+		uint2 *phash = (uint2*) (&d_hash[offset]);
+
+		phash[2] = psrc[2];
+		phash[3] = psrc[3];
 	}
 }
@@ -194,9 +261,10 @@ void zr5_final_round(int thr_id, uint32_t threads)
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 
-	zr5_final_round_data_gpu <<<grid, block>>> (threads, d_blake[thr_id], d_groes[thr_id], d_jh512[thr_id], d_skein[thr_id], d_hash[thr_id], d_pokh[thr_id]);
+	zr5_final_round_data_gpu <<<grid, block>>> (threads, d_buffers[thr_id], d_permut[thr_id], d_hash[thr_id], d_poks[thr_id]);
 }
 
+
 extern void jackpot_keccak512_cpu_init(int thr_id, uint32_t threads);
 extern void jackpot_keccak512_cpu_setBlock(void *pdata, size_t inlen);
 
@@ -236,12 +304,14 @@ extern "C" int scanhash_zr5(int thr_id, uint32_t *pdata, const uint32_t *ptarget
 	{
 		cudaSetDevice(device_map[thr_id]);
 
+		// constants
+		cudaMemcpyToSymbol(c_permut, permut, 24*4, 0, cudaMemcpyHostToDevice);
+
 		// hash buffer = keccak hash 64 required
 		cudaMalloc(&d_hash[thr_id], 64 * throughput);
-		cudaMalloc(&d_pokh[thr_id], 2 * throughput);
-
-		cudaMemcpyToSymbol(d_permut, permut, 24*4, 0, cudaMemcpyHostToDevice);
-		cudaMallocHost(&h_poks[thr_id], 2 * throughput);
+		cudaMalloc(&d_poks[thr_id], sizeof(uint16_t) * throughput);
+		cudaMalloc(&d_permut[thr_id], sizeof(uint8_t) * throughput);
+		cudaMalloc(&d_buffers[thr_id], 4 * sizeof(uint32_t*));
 
 		// data buffers for the 4 rounds
 		cudaMalloc(&d_blake[thr_id], 64 * throughput);
@@ -272,23 +342,27 @@ extern "C" int scanhash_zr5(int thr_id, uint32_t *pdata, const uint32_t *ptarget
 		// Keccak512 Hash with CUDA
 		zr5_keccak512_cpu_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 
+		zr5_init_vars(thr_id, throughput);
+
 		for (int rnd=0; rnd<4; rnd++) {
-			zr5_move_data_to_hash(thr_id, throughput, rnd);
+			if (rnd > 0)
+				zr5_move_data_to_hash(thr_id, throughput, rnd);
 			quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_blake[thr_id], order++);
 			quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_groes[thr_id], order++);
 			quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_jh512[thr_id], order++);
 			quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_skein[thr_id], order++);
 		}
-		// This generates all pok prefixes
-		zr5_final_round(thr_id, throughput);
+		// store on device d_poks all hash[0] prefixes
+		zr5_get_poks(thr_id, throughput, d_poks[thr_id]);
 
-		// Keccak512 pok
-		zr5_keccak512_cpu_hash_pok(thr_id, throughput, pdata[19], pdata, d_hash[thr_id], d_pokh[thr_id]);
+		// Keccak512 with pok
+		zr5_keccak512_cpu_hash_pok(thr_id, throughput, pdata[19], pdata, d_hash[thr_id], d_poks[thr_id]);
+		zr5_init_vars(thr_id, throughput);
+
 		for (int rnd=0; rnd<4; rnd++) {
-			zr5_move_data_to_hash(thr_id, throughput, rnd);
+			if (rnd > 0)
+				zr5_move_data_to_hash(thr_id, throughput, rnd);
 			quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_blake[thr_id], order++);
 			quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_groes[thr_id], order++);
 			quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_jh512[thr_id], order++);
 			quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_skein[thr_id], order++);
 		}
@@ -304,11 +378,12 @@ extern "C" int scanhash_zr5(int thr_id, uint32_t *pdata, const uint32_t *ptarget
 			uint32_t oldp19 = pdata[19];
 			uint32_t offset = foundNonce - pdata[19];
 			uint32_t pok = 0;
+			uint16_t h_pok;
 
 			*hashes_done = pdata[19] - first_nonce + throughput;
 
-			cudaMemcpy(h_poks[thr_id], d_pokh[thr_id], 2 * throughput, cudaMemcpyDeviceToHost);
-			pok = version | (0x10000UL * h_poks[thr_id][offset]);
+			cudaMemcpy(&h_pok, d_poks[thr_id] + offset, sizeof(uint16_t), cudaMemcpyDeviceToHost);
+			pok = version | (0x10000UL * h_pok);
 			pdata[0] = pok;
 			pdata[19] = foundNonce;
 			zr5hash(vhash64, pdata);
 
			if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) {
@@ -316,7 +391,8 @@ extern "C" int scanhash_zr5(int thr_id, uint32_t *pdata, const uint32_t *ptarget
 				uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, oldp19, d_hash[thr_id], 1);
 				if (secNonce != 0) {
 					offset = secNonce - oldp19;
-					pok = version | (0x10000UL * h_poks[thr_id][offset]);
+					cudaMemcpy(&h_pok, d_poks[thr_id] + offset, sizeof(uint16_t), cudaMemcpyDeviceToHost);
+					pok = version | (0x10000UL * h_pok);
 					memcpy(tmpdata, pdata, 80);
 					tmpdata[0] = pok; tmpdata[19] = secNonce;
 					zr5hash(vhash64, tmpdata);
@@ -328,7 +404,12 @@ extern "C" int scanhash_zr5(int thr_id, uint32_t *pdata, const uint32_t *ptarget
 				}
 				return res;
 			} else {
-				applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", thr_id, foundNonce);
+				applog(LOG_WARNING, "GPU #%d: result for %08x does not validate on CPU!", device_map[thr_id], foundNonce);
+
+				// reinit card..
+				cudaDeviceReset();
+				init[thr_id] = false;
+
 				pdata[19]++;
 				pdata[0] = oldp0;
 			}
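
Note on the pok handling changed above (illustrative sketch, not part of the patch): the second keccak pass rebuilds hash word 0 from the block version plus the 16-bit pok taken from the first pass. Because the pok is simply the high half of hash[0], the new expression version | (0x10000UL * pok16) produces the same word as the old version | (hash0 & POK_DATA_MASK), which is why storing only a uint16_t per thread in d_poks is enough. A minimal host-side C sketch with hypothetical version/hash values:

/* Illustrative only -- not part of the patch.
 * Shows that keeping the 16-bit pok (as d_poks does) is equivalent to
 * re-reading and masking the 32-bit hash word, as the old code did. */
#include <stdint.h>
#include <stdio.h>
#include <assert.h>

#define POK_DATA_MASK 0xFFFF0000

int main(void)
{
	uint32_t version = 0x00000004;   /* hypothetical block version */
	uint32_t hash0   = 0xDEAD0123;   /* hypothetical hash[0] from the first keccak pass */

	/* what zr5_get_poks_gpu stores per thread: the high 16 bits of hash[0] */
	uint16_t pok16 = (uint16_t)(hash0 >> 16);

	/* old kernel: message[0] = version | (prevHash[0] & POK_DATA_MASK); */
	uint32_t old_way = version | (hash0 & POK_DATA_MASK);

	/* new kernel / host path: message[0] = version | (0x10000UL * d_poks[thread]); */
	uint32_t new_way = version | (0x10000UL * pok16);

	assert(old_way == new_way);
	printf("pok word: 0x%08X\n", (unsigned) new_way);
	return 0;
}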