From 73dd6aac5cc0219e9c10545954310a59b449e0e9 Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot <tanguy.pruvot@gmail.com>
Date: Thu, 4 Jan 2018 15:46:39 +0100
Subject: [PATCH] keccak: avoid to use twice cuda_default_throughput

and drop useless gpu hash alloc...
---
 Algo256/cuda_keccak256_sm3.cu | 20 ++++++--------------
 Algo256/keccak256.cu          | 14 +++++---------
 2 files changed, 11 insertions(+), 23 deletions(-)

diff --git a/Algo256/cuda_keccak256_sm3.cu b/Algo256/cuda_keccak256_sm3.cu
index ff46932..7100ad6 100644
--- a/Algo256/cuda_keccak256_sm3.cu
+++ b/Algo256/cuda_keccak256_sm3.cu
@@ -22,7 +22,6 @@ static const uint64_t host_keccak_round_constants[24] = {
 	0x0000000080000001ull, 0x8000000080008008ull
 };
 
-static uint32_t *d_nounce[MAX_GPUS];
 static uint32_t *d_KNonce[MAX_GPUS];
 
 __constant__ uint32_t pTarget[8];
@@ -170,7 +169,7 @@ static void keccak_blockv30(uint64_t *s, const uint64_t *keccak_round_constants)
 #endif
 
 __global__ __launch_bounds__(128,5)
-void keccak256_sm3_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce)
+void keccak256_sm3_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@@ -210,10 +209,9 @@ void keccak256_sm3_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *out
 }
 
 __host__
-uint32_t keccak256_sm3_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order)
+void keccak256_sm3_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, int order)
 {
-	uint32_t result = UINT32_MAX;
-	cudaMemset(d_KNonce[thr_id], 0xff, sizeof(uint32_t));
+	cudaMemset(d_KNonce[thr_id], 0xff, 2*sizeof(uint32_t));
 	const uint32_t threadsperblock = 128;
 
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
@@ -221,14 +219,10 @@ uint32_t keccak256_sm3_hash_80(int thr_id, uint32_t threads, uint32_t startNounc
 
 	size_t shared_size = 0;
 
-	keccak256_sm3_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_outputHash, d_KNonce[thr_id]);
+	keccak256_sm3_gpu_hash_80<<<grid, block, shared_size>>>(threads, startNounce, d_KNonce[thr_id]);
 
-	MyStreamSynchronize(NULL, order, thr_id);
-	cudaMemcpy(d_nounce[thr_id], d_KNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost);
+	cudaMemcpy(resNonces, d_KNonce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost);
 	cudaThreadSynchronize();
-	result = *d_nounce[thr_id];
-
-	return result;
 }
 
 #if 0
@@ -299,13 +293,11 @@ void keccak256_sm3_init(int thr_id, uint32_t threads)
 {
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(keccak_round_constants, host_keccak_round_constants,
 				sizeof(host_keccak_round_constants), 0, cudaMemcpyHostToDevice));
-	CUDA_SAFE_CALL(cudaMalloc(&d_KNonce[thr_id], sizeof(uint32_t)));
-	CUDA_SAFE_CALL(cudaMallocHost(&d_nounce[thr_id], 1*sizeof(uint32_t)));
+	CUDA_SAFE_CALL(cudaMalloc(&d_KNonce[thr_id], 2*sizeof(uint32_t)));
 }
 
 __host__
 void keccak256_sm3_free(int thr_id)
 {
 	cudaFree(d_KNonce[thr_id]);
-	cudaFreeHost(d_nounce[thr_id]);
 }
diff --git a/Algo256/keccak256.cu b/Algo256/keccak256.cu
index 4de505f..b6ed947 100644
--- a/Algo256/keccak256.cu
+++ b/Algo256/keccak256.cu
@@ -14,8 +14,6 @@ extern "C"
 
 #include "cuda_helper.h"
 
-static uint32_t *d_hash[MAX_GPUS];
-
 // SM5+ cuda
 extern void keccak256_cpu_init(int thr_id);
 extern void keccak256_cpu_free(int thr_id);
@@ -27,7 +25,7 @@ extern void keccak256_setOutput(int thr_id);
 extern void keccak256_sm3_init(int thr_id, uint32_t threads);
 extern void keccak256_sm3_free(int thr_id);
 extern void keccak256_sm3_setBlock_80(void *pdata, const void *ptarget);
-extern uint32_t keccak256_sm3_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order);
+extern uint32_t keccak256_sm3_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t* resNonces, int order);
 
 // CPU Hash
 extern "C" void keccak256_hash(void *state, const void *input)
@@ -52,13 +50,13 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
 	const int dev_id = device_map[thr_id];
-	uint32_t throughput = cuda_default_throughput(thr_id, 1U << 21); // 256*256*8*4
+	uint32_t throughput;
+	uint32_t intensity = 23;
 	if(!use_compat_kernels[thr_id]) {
-		uint32_t intensity = 23;
 		if (strstr(device_name[dev_id], "GTX 1070")) intensity = 25;
 		if (strstr(device_name[dev_id], "GTX 1080")) intensity = 26;
-		throughput = cuda_default_throughput(thr_id, 1U << intensity);
 	}
+	throughput = cuda_default_throughput(thr_id, 1U << intensity);
 	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
 
 	if (opt_benchmark)
@@ -80,7 +78,6 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no
 			keccak256_cpu_init(thr_id);
 		} else {
 			// really useful ?
-			CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], throughput * 64));
 			keccak256_sm3_init(thr_id, throughput);
 		}
 
@@ -107,7 +104,7 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no
 		*hashes_done = pdata[19] - first_nonce + throughput;
 
 		if(use_compat_kernels[thr_id])
-			work->nonces[0] = keccak256_sm3_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+			keccak256_sm3_hash_80(thr_id, throughput, pdata[19], work->nonces, order++);
 		else {
 			keccak256_cpu_hash_80(thr_id, throughput, pdata[19], work->nonces, highTarget);
 		}
@@ -170,7 +167,6 @@ extern "C" void free_keccak256(int thr_id)
 	if(!use_compat_kernels[thr_id])
 		keccak256_cpu_free(thr_id);
 	else {
-		cudaFree(d_hash[thr_id]);
 		keccak256_sm3_free(thr_id);
 	}