From 73dd6aac5cc0219e9c10545954310a59b449e0e9 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Thu, 4 Jan 2018 15:46:39 +0100 Subject: [PATCH] keccak: avoid to use twice cuda_default_throughput and drop useless gpu hash alloc... --- Algo256/cuda_keccak256_sm3.cu | 20 ++++++-------------- Algo256/keccak256.cu | 14 +++++--------- 2 files changed, 11 insertions(+), 23 deletions(-) diff --git a/Algo256/cuda_keccak256_sm3.cu b/Algo256/cuda_keccak256_sm3.cu index ff46932..7100ad6 100644 --- a/Algo256/cuda_keccak256_sm3.cu +++ b/Algo256/cuda_keccak256_sm3.cu @@ -22,7 +22,6 @@ static const uint64_t host_keccak_round_constants[24] = { 0x0000000080000001ull, 0x8000000080008008ull }; -static uint32_t *d_nounce[MAX_GPUS]; static uint32_t *d_KNonce[MAX_GPUS]; __constant__ uint32_t pTarget[8]; @@ -170,7 +169,7 @@ static void keccak_blockv30(uint64_t *s, const uint64_t *keccak_round_constants) #endif __global__ __launch_bounds__(128,5) -void keccak256_sm3_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash, uint32_t *resNounce) +void keccak256_sm3_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint32_t *resNounce) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -210,10 +209,9 @@ void keccak256_sm3_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *out } __host__ -uint32_t keccak256_sm3_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash, int order) +void keccak256_sm3_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, int order) { - uint32_t result = UINT32_MAX; - cudaMemset(d_KNonce[thr_id], 0xff, sizeof(uint32_t)); + cudaMemset(d_KNonce[thr_id], 0xff, 2*sizeof(uint32_t)); const uint32_t threadsperblock = 128; dim3 grid((threads + threadsperblock-1)/threadsperblock); @@ -221,14 +219,10 @@ uint32_t keccak256_sm3_hash_80(int thr_id, uint32_t threads, uint32_t startNounc size_t shared_size = 0; - keccak256_sm3_gpu_hash_80<<>>(threads, startNounce, d_outputHash, d_KNonce[thr_id]); + keccak256_sm3_gpu_hash_80<<>>(threads, startNounce, d_KNonce[thr_id]); - MyStreamSynchronize(NULL, order, thr_id); - cudaMemcpy(d_nounce[thr_id], d_KNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + cudaMemcpy(resNonces, d_KNonce[thr_id], 2*sizeof(uint32_t), cudaMemcpyDeviceToHost); cudaThreadSynchronize(); - result = *d_nounce[thr_id]; - - return result; } #if 0 @@ -299,13 +293,11 @@ void keccak256_sm3_init(int thr_id, uint32_t threads) { CUDA_SAFE_CALL(cudaMemcpyToSymbol(keccak_round_constants, host_keccak_round_constants, sizeof(host_keccak_round_constants), 0, cudaMemcpyHostToDevice)); - CUDA_SAFE_CALL(cudaMalloc(&d_KNonce[thr_id], sizeof(uint32_t))); - CUDA_SAFE_CALL(cudaMallocHost(&d_nounce[thr_id], 1*sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMalloc(&d_KNonce[thr_id], 2*sizeof(uint32_t))); } __host__ void keccak256_sm3_free(int thr_id) { cudaFree(d_KNonce[thr_id]); - cudaFreeHost(d_nounce[thr_id]); } diff --git a/Algo256/keccak256.cu b/Algo256/keccak256.cu index 4de505f..b6ed947 100644 --- a/Algo256/keccak256.cu +++ b/Algo256/keccak256.cu @@ -14,8 +14,6 @@ extern "C" #include "cuda_helper.h" -static uint32_t *d_hash[MAX_GPUS]; - // SM5+ cuda extern void keccak256_cpu_init(int thr_id); extern void keccak256_cpu_free(int thr_id); @@ -27,7 +25,7 @@ extern void keccak256_setOutput(int thr_id); extern void keccak256_sm3_init(int thr_id, uint32_t threads); extern void keccak256_sm3_free(int thr_id); extern void keccak256_sm3_setBlock_80(void *pdata, const void *ptarget); -extern uint32_t keccak256_sm3_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order); +extern uint32_t keccak256_sm3_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t* resNonces, int order); // CPU Hash extern "C" void keccak256_hash(void *state, const void *input) @@ -52,13 +50,13 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; const int dev_id = device_map[thr_id]; - uint32_t throughput = cuda_default_throughput(thr_id, 1U << 21); // 256*256*8*4 + uint32_t throughput; + uint32_t intensity = 23; if(!use_compat_kernels[thr_id]) { - uint32_t intensity = 23; if (strstr(device_name[dev_id], "GTX 1070")) intensity = 25; if (strstr(device_name[dev_id], "GTX 1080")) intensity = 26; - throughput = cuda_default_throughput(thr_id, 1U << intensity); } + throughput = cuda_default_throughput(thr_id, 1U << intensity); if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) @@ -80,7 +78,6 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no keccak256_cpu_init(thr_id); } else { // really useful ? - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], throughput * 64)); keccak256_sm3_init(thr_id, throughput); } @@ -107,7 +104,7 @@ extern "C" int scanhash_keccak256(int thr_id, struct work* work, uint32_t max_no *hashes_done = pdata[19] - first_nonce + throughput; if(use_compat_kernels[thr_id]) - work->nonces[0] = keccak256_sm3_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + keccak256_sm3_hash_80(thr_id, throughput, pdata[19], work->nonces, order++); else { keccak256_cpu_hash_80(thr_id, throughput, pdata[19], work->nonces, highTarget); } @@ -170,7 +167,6 @@ extern "C" void free_keccak256(int thr_id) if(!use_compat_kernels[thr_id]) keccak256_cpu_free(thr_id); else { - cudaFree(d_hash[thr_id]); keccak256_sm3_free(thr_id); }