From c88750332c9a66ffc1ff9a73118c5ddb178cecba Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 23 Nov 2014 19:05:27 +0100 Subject: [PATCH] simd512: restore SM3/3.5 perfs Simple change which affect all algos based on SIMD512 fresh, qubit, s3, x11 to x17... --- x11/cuda_x11_echo.cu | 3 --- x11/cuda_x11_simd512.cu | 17 +++++++++++------ x11/x11.cu | 14 +++++++++++--- x13/x13.cu | 2 +- 4 files changed, 23 insertions(+), 13 deletions(-) diff --git a/x11/cuda_x11_echo.cu b/x11/cuda_x11_echo.cu index cd8a83b..0866f8c 100644 --- a/x11/cuda_x11_echo.cu +++ b/x11/cuda_x11_echo.cu @@ -3,9 +3,6 @@ #include "cuda_helper.h" -// aus heavy.cu -extern cudaError_t MyStreamSynchronize(cudaStream_t stream, int situation, int thr_id); - #include "cuda_x11_aes.cu" __device__ __forceinline__ void AES_2ROUND( diff --git a/x11/cuda_x11_simd512.cu b/x11/cuda_x11_simd512.cu index 4c1e537..fb63c15 100644 --- a/x11/cuda_x11_simd512.cu +++ b/x11/cuda_x11_simd512.cu @@ -576,7 +576,7 @@ x11_simd512_gpu_expand_64(int threads, uint32_t startNounce, uint64_t *g_hash, u } } -/* + __global__ void __launch_bounds__(TPB, 4) x11_simd512_gpu_compress1_64(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state) { @@ -604,7 +604,7 @@ x11_simd512_gpu_compress2_64(int threads, uint32_t startNounce, uint64_t *g_hash Compression2(hashPosition, g_fft4, g_state); } } -*/ + __global__ void __launch_bounds__(TPB, 4) x11_simd512_gpu_compress_64_maxwell(int threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector, uint4 *g_fft4, uint32_t *g_state) @@ -670,13 +670,18 @@ void x11_simd512_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint const int threadsperblock = TPB; dim3 block(threadsperblock); - dim3 grid8(((threads + threadsperblock-1)/threadsperblock)*8); + dim3 grid((threads + threadsperblock-1) / threadsperblock); - x11_simd512_gpu_expand_64 <<>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id]); + dim3 grid8(((threads + threadsperblock - 1) / threadsperblock) * 8); - dim3 grid((threads + threadsperblock-1)/threadsperblock); + x11_simd512_gpu_expand_64 <<>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id]); + if (device_sm[device_map[thr_id]] >= 500) { + x11_simd512_gpu_compress_64_maxwell <<< grid8, block >>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]); + } else { + x11_simd512_gpu_compress1_64 <<< grid, block >>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]); + x11_simd512_gpu_compress2_64 <<< grid, block >>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]); + } - x11_simd512_gpu_compress_64_maxwell << > > (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]); x11_simd512_gpu_final_64 <<>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector, d_temp4[thr_id], d_state[thr_id]); MyStreamSynchronize(NULL, order, thr_id); diff --git a/x11/x11.cu b/x11/x11.cu index 4043be4..55d4dc0 100644 --- a/x11/x11.cu +++ b/x11/x11.cu @@ -58,7 +58,11 @@ extern void quark_compactTest_cpu_init(int thr_id, int threads); extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order); -// X11 Hashfunktion +// to check... new sp method +//extern void x11_echo512_cpu_setTarget(const void *ptarget); +//extern uint32_t x11_echo512_cpu_hash_64_final(int thr_id, int threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +// X11 CPU Hash extern "C" void x11hash(void *output, const void *input) { // blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11 @@ -146,7 +150,7 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, throughput = min(throughput, (int)(max_nonce - first_nonce)); if (opt_benchmark) - ((uint32_t*)ptarget)[7] = 0x000f; + ((uint32_t*)ptarget)[7] = 0x5; if (!init[thr_id]) { @@ -176,6 +180,8 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, be32enc(&endiandata[k], ((uint32_t*)pdata)[k]); quark_blake512_cpu_setBlock_80((void*)endiandata); + + //x11_echo512_cpu_setTarget(ptarget); cuda_check_cpu_setTarget(ptarget); do { @@ -196,8 +202,10 @@ extern "C" int scanhash_x11(int thr_id, uint32_t *pdata, x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - // Scan nach Gewinner Hashes auf der GPU + // todo... + //foundNonce = x11_echo512_cpu_hash_64_final(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); //foundNonce = cuda_check_hash_fast(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + foundNonce = cuda_check_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); if (foundNonce != 0xffffffff) { diff --git a/x13/x13.cu b/x13/x13.cu index 36c6cd7..e9e3cd4 100644 --- a/x13/x13.cu +++ b/x13/x13.cu @@ -67,7 +67,7 @@ extern void quark_compactTest_cpu_init(int thr_id, int threads); extern void quark_compactTest_cpu_hash_64(int thr_id, int threads, uint32_t startNounce, uint32_t *inpHashes, uint32_t *d_noncesTrue, size_t *nrmTrue, uint32_t *d_noncesFalse, size_t *nrmFalse, int order); -// X13 Hashfunktion +// X13 CPU Hash extern "C" void x13hash(void *output, const void *input) { // blake1-bmw2-grs3-skein4-jh5-keccak6-luffa7-cubehash8-shavite9-simd10-echo11-hamsi12-fugue13