From 41543b5c0b95d2517d934d318ae5e195003ec58e Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot
Date: Fri, 6 Nov 2015 19:20:44 +0100
Subject: [PATCH] lyra2v2: add support for SM 2.1 devices and slightly improve SM 3 perf

---
 bench.cpp                  |   1 -
 lyra2/cuda_lyra2_vectors.h |  12 +++++
 lyra2/cuda_lyra2v2.cu      |   9 ++--
 lyra2/cuda_lyra2v2_sm3.cuh | 104 +++++++++++++++++++++++++++++++++----
 lyra2/lyra2REv2.cu         |  29 ++++++++---
 5 files changed, 133 insertions(+), 22 deletions(-)

diff --git a/bench.cpp b/bench.cpp
index bd71fd3..e3f9856 100644
--- a/bench.cpp
+++ b/bench.cpp
@@ -103,7 +103,6 @@ bool bench_algo_switch_next(int thr_id)
 		if (algo == ALGO_GROESTL) algo++;
 		if (algo == ALGO_MYR_GR) algo++;
 		if (algo == ALGO_JACKPOT) algo++; // compact shuffle
-		if (algo == ALGO_LYRA2v2) algo++;
 		if (algo == ALGO_NEOSCRYPT) algo++;
 		if (algo == ALGO_WHIRLPOOLX) algo++;
 	}
diff --git a/lyra2/cuda_lyra2_vectors.h b/lyra2/cuda_lyra2_vectors.h
index 568acb1..1733b96 100644
--- a/lyra2/cuda_lyra2_vectors.h
+++ b/lyra2/cuda_lyra2_vectors.h
@@ -12,6 +12,10 @@
 
 #include "cuda_helper.h"
 
+#if __CUDA_ARCH__ < 300
+#define __shfl(x, y) (x)
+#endif
+
 #if __CUDA_ARCH__ < 320 && !defined(__ldg4)
 #define __ldg4(x) (*(x))
 #endif
@@ -545,6 +549,7 @@ static __forceinline__ __device__ uint16 swapvec(const uint16 &buf)
 
 static __device__ __forceinline__ uint28 shuffle4(const uint28 &var, int lane)
 {
+#if __CUDA_ARCH__ >= 300
 	uint28 res;
 	res.x.x = __shfl(var.x.x, lane);
 	res.x.y = __shfl(var.x.y, lane);
@@ -555,10 +560,14 @@ static __device__ __forceinline__ uint28 shuffle4(const uint28 &var, int lane)
 	res.w.x = __shfl(var.w.x, lane);
 	res.w.y = __shfl(var.w.y, lane);
 	return res;
+#else
+	return var;
+#endif
 }
 
 static __device__ __forceinline__ ulonglong4 shuffle4(ulonglong4 var, int lane)
 {
+#if __CUDA_ARCH__ >= 300
 	ulonglong4 res;
 	uint2 temp;
 	temp = vectorize(var.x);
@@ -578,6 +587,9 @@ static __device__ __forceinline__ ulonglong4 shuffle4(ulonglong4 var, int lane)
 	temp.y = __shfl(temp.y, lane);
 	res.w = devectorize(temp);
 	return res;
+#else
+	return var;
+#endif
 }
 
 #endif // #ifndef CUDA_LYRA_VECTOR_H
diff --git a/lyra2/cuda_lyra2v2.cu b/lyra2/cuda_lyra2v2.cu
index 952839f..a1e5dc0 100644
--- a/lyra2/cuda_lyra2v2.cu
+++ b/lyra2/cuda_lyra2v2.cu
@@ -342,7 +342,7 @@ void lyra2v2_gpu_hash_32(const uint32_t threads, uint32_t startNounce, uint2 *g_
 }
 #else
 #include "cuda_helper.h"
-#if __CUDA_ARCH__ < 300
+#if __CUDA_ARCH__ < 200
 __device__ void* DMatrix;
 #endif
 __global__ void lyra2v2_gpu_hash_32(const uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
@@ -362,9 +362,10 @@ void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uin
 	int dev_id = device_map[thr_id % MAX_GPUS];
 
 	uint32_t tpb = TPB52;
-	if (device_sm[dev_id] == 500 || cuda_arch[dev_id] == 500) tpb = TPB50;
-	else if (device_sm[dev_id] == 350 || cuda_arch[dev_id] == 350) tpb = TPB35;
-	else if (device_sm[dev_id] < 350 || cuda_arch[dev_id] < 350) tpb = TPB30;
+	if (cuda_arch[dev_id] == 500) tpb = TPB50;
+	else if (cuda_arch[dev_id] >= 350) tpb = TPB35;
+	else if (cuda_arch[dev_id] >= 300) tpb = TPB30;
+	else if (cuda_arch[dev_id] >= 200) tpb = TPB20;
 
 	dim3 grid((threads + tpb - 1) / tpb);
 	dim3 block(tpb);
diff --git a/lyra2/cuda_lyra2v2_sm3.cuh b/lyra2/cuda_lyra2v2_sm3.cuh
index 0a150ad..1b20485 100644
--- a/lyra2/cuda_lyra2v2_sm3.cuh
+++ b/lyra2/cuda_lyra2v2_sm3.cuh
@@ -1,4 +1,4 @@
-/* SM 3/3.5 Variant for lyra2REv2 */
+/* SM 2/3/3.5 Variant for lyra2REv2 */
 
 #ifdef __INTELLISENSE__
 /* just for vstudio code colors */
 #define __CUDA_ARCH__ 350
 #endif
 
-#define TPB30 16
+#define TPB20 64
+#define TPB30 64
 #define TPB35 64
 
-#if __CUDA_ARCH__ >= 300 && __CUDA_ARCH__ < 500
+#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ < 500
 
 #include "cuda_lyra2_vectors.h"
@@ -165,6 +166,7 @@ void reduceDuplexRowtV3(const int rowIn, const int rowInOut, const int rowOut, v
 	}
 }
 
+#if __CUDA_ARCH__ >= 300
 __global__ __launch_bounds__(TPB35, 1)
 void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash)
 {
@@ -177,14 +179,14 @@ void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outpu
 	if (threadIdx.x == 0) {
 
 		((uint16*)blake2b_IV)[0] = make_uint16(
-			0xf3bcc908, 0x6a09e667 , 0x84caa73b, 0xbb67ae85 ,
-			0xfe94f82b, 0x3c6ef372 , 0x5f1d36f1, 0xa54ff53a ,
-			0xade682d1, 0x510e527f , 0x2b3e6c1f, 0x9b05688c ,
+			0xf3bcc908, 0x6a09e667 , 0x84caa73b, 0xbb67ae85,
+			0xfe94f82b, 0x3c6ef372 , 0x5f1d36f1, 0xa54ff53a,
+			0xade682d1, 0x510e527f , 0x2b3e6c1f, 0x9b05688c,
 			0xfb41bd6b, 0x1f83d9ab , 0x137e2179, 0x5be0cd19
 		);
 
 		((uint16*)padding)[0] = make_uint16(
-			0x20, 0x0 , 0x20, 0x0 , 0x20, 0x0 , 0x01, 0x0 ,
-			0x04, 0x0 , 0x04, 0x0 , 0x80, 0x0 , 0x0, 0x01000000
+			0x20, 0x0 , 0x20, 0x0 , 0x20, 0x0 , 0x01, 0x0,
+			0x04, 0x0 , 0x04, 0x0 , 0x80, 0x0 , 0x0, 0x01000000
 		);
 	}
@@ -194,6 +196,7 @@ void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outpu
 	((uint2*)state)[1] = __ldg(&outputHash[thread + threads]);
 	((uint2*)state)[2] = __ldg(&outputHash[thread + 2 * threads]);
 	((uint2*)state)[3] = __ldg(&outputHash[thread + 3 * threads]);
 
+	state[1] = state[0];
 	state[2] = shuffle4(((vectype*)blake2b_IV)[0], 0);
 	state[3] = shuffle4(((vectype*)blake2b_IV)[1], 0);
@@ -246,9 +249,90 @@
 
 	} //thread
 }
+#elif __CUDA_ARCH__ >= 200
+__global__ __launch_bounds__(TPB20, 1)
+void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash)
+{
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+
+	vectype state[4];
+	vectype blake2b_IV[2];
+	vectype padding[2];
+
+	((uint16*)blake2b_IV)[0] = make_uint16(
+		0xf3bcc908, 0x6a09e667, 0x84caa73b, 0xbb67ae85,
+		0xfe94f82b, 0x3c6ef372, 0x5f1d36f1, 0xa54ff53a,
+		0xade682d1, 0x510e527f, 0x2b3e6c1f, 0x9b05688c,
+		0xfb41bd6b, 0x1f83d9ab, 0x137e2179, 0x5be0cd19
+	);
+	((uint16*)padding)[0] = make_uint16(
+		0x20, 0x0, 0x20, 0x0, 0x20, 0x0, 0x01, 0x0,
+		0x04, 0x0, 0x04, 0x0, 0x80, 0x0, 0x0, 0x01000000
+	);
+
+	if (thread < threads)
+	{
+
+		((uint2*)state)[0] = outputHash[thread];
+		((uint2*)state)[1] = outputHash[thread + threads];
+		((uint2*)state)[2] = outputHash[thread + 2 * threads];
+		((uint2*)state)[3] = outputHash[thread + 3 * threads];
+
+		state[1] = state[0];
+		state[2] = ((vectype*)blake2b_IV)[0];
+		state[3] = ((vectype*)blake2b_IV)[1];
+
+		for (int i = 0; i<12; i++)
+			round_lyra_v35(state);
+
+		state[0] ^= ((vectype*)padding)[0];
+		state[1] ^= ((vectype*)padding)[1];
+
+		for (int i = 0; i<12; i++)
+			round_lyra_v35(state);
+
+		uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread);
+
+		//#pragma unroll 4
+		for (int i = 0; i < 4; i++)
+		{
+			uint32_t s1 = ps1 - 4 * memshift * i;
+			for (int j = 0; j < 3; j++)
+				(DMatrix + s1)[j] = (state)[j];
+
+			round_lyra_v35(state);
+		}
+
+		reduceDuplexV3(state, thread);
+		reduceDuplexRowSetupV3(1, 0, 2, state, thread);
+		reduceDuplexRowSetupV3(2, 1, 3, state, thread);
+
+		uint32_t rowa;
+		int prev = 3;
+		for (int i = 0; i < 4; i++)
+		{
+			rowa = ((uint2*)state)[0].x & 3; reduceDuplexRowtV3(prev, rowa, i, state, thread);
+			prev = i;
+		}
+
+		uint32_t shift = (memshift * rowa + 16 * memshift * thread);
+
+		for (int j = 0; j < 3; j++)
+			state[j] ^= __ldg4(&(DMatrix + shift)[j]);
+
+		for (int i = 0; i < 12; i++)
+			round_lyra_v35(state);
+
+		outputHash[thread] = ((uint2*)state)[0];
+		outputHash[thread + threads] = ((uint2*)state)[1];
+		outputHash[thread + 2 * threads] = ((uint2*)state)[2];
+		outputHash[thread + 3 * threads] = ((uint2*)state)[3];
+
+	} //thread
+}
+#endif
 
 #else
-/* if __CUDA_ARCH__ < 300 .. */
+/* host & sm5+ */
 __global__ void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) {}
 #endif
-
diff --git a/lyra2/lyra2REv2.cu b/lyra2/lyra2REv2.cu
index 418c000..2308d0c 100644
--- a/lyra2/lyra2REv2.cu
+++ b/lyra2/lyra2REv2.cu
@@ -73,6 +73,21 @@ void lyra2v2_hash(void *state, const void *input)
 	memcpy(state, hashA, 32);
 }
 
+#ifdef _DEBUG
+#define TRACE(algo) { \
+	if (max_nonce == 1 && pdata[19] <= 1) { \
+		uint32_t* debugbuf = NULL; \
+		cudaMallocHost(&debugbuf, 32); \
+		cudaMemcpy(debugbuf, d_hash[thr_id], 32, cudaMemcpyDeviceToHost); \
+		printf("lyra2 %s %08x %08x %08x %08x...%08x... \n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \
+			swab32(debugbuf[2]), swab32(debugbuf[3]), swab32(debugbuf[7])); \
+		cudaFreeHost(debugbuf); \
+	} \
+}
+#else
+#define TRACE(algo) {}
+#endif
+
 static bool init[MAX_GPUS] = { 0 };
 
 extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
@@ -113,12 +128,6 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
 
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));
 
-		if (device_sm[dev_id] < 300) {
-			gpulog(LOG_ERR, thr_id, "Device SM 3.0 or more recent required!");
-			proper_exit(1);
-			return -1;
-		}
-
 		api_set_throughput(thr_id, throughput);
 		init[thr_id] = true;
 	}
@@ -135,11 +144,17 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
 		uint32_t foundNonces[2] = { 0, 0 };
 
 		blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		TRACE("blake :");
 		keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		TRACE("keccak :");
 		cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		TRACE("cube :");
 		lyra2v2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		TRACE("lyra2 :");
 		skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+		TRACE("skein :");
 		cubehash256_cpu_hash_32(thr_id, throughput,pdata[19], d_hash[thr_id], order++);
+		TRACE("cube :");
 
 		bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonces);
 
@@ -181,7 +196,7 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc
 		}
 		pdata[19] += throughput;
 
-	} while (!work_restart[thr_id].restart);
+	} while (!work_restart[thr_id].restart && !abort_flag);
 
 	*hashes_done = pdata[19] - first_nonce;
 	return 0;
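
Why the `#define __shfl(x, y) (x)` fallback added to cuda_lyra2_vectors.h is safe: below SM 3.0 there is no warp-shuffle instruction, so `shuffle4()` collapses to an identity copy. That only works because the new `__CUDA_ARCH__ >= 200` kernel keeps the whole Lyra2 state in each thread and fills `blake2b_IV`/`padding` from thread-local arrays, whereas the SM 3.x kernel broadcasts them from lane 0 via `shuffle4(..., 0)`. A minimal standalone sketch of the same guard pattern follows; `demo_kernel` is a hypothetical name, and the sketch assumes a pre-CUDA 9 toolchain (as ccminer targeted in 2015) where the non-`_sync` `__shfl` intrinsic and SM 2.x code generation still exist:

#include <cstdio>

// Fermi (SM 2.x) has no warp shuffle; fall back to an identity macro.
// This is only correct when the kernel no longer depends on the
// cross-lane broadcast, which is why the patch adds a dedicated SM 2.x path.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 300
#define __shfl(x, y) (x)
#endif

__global__ void demo_kernel(int *out)
{
	int lane = threadIdx.x & 31;
	int value = lane * 10;
	// SM 3.0+: every lane receives lane 0's value (0 here).
	// SM 2.x: the macro makes this a no-op, so each lane keeps lane * 10.
	value = __shfl(value, 0);
	out[threadIdx.x] = value;
}

int main()
{
	int h_out[32], *d_out = NULL;
	cudaMalloc(&d_out, sizeof(h_out));
	demo_kernel<<<1, 32>>>(d_out);
	cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
	printf("lane 5 = %d\n", h_out[5]); // 0 on SM 3.0+, 50 with the fallback
	cudaFree(d_out);
	return 0;
}

Built with, say, `nvcc -gencode arch=compute_20,code=sm_21 -gencode arch=compute_30,code=sm_30`, the Fermi binary gets the no-op and the Kepler binary gets a real broadcast. The host side of this commit mirrors that split: lyra2v2_cpu_hash_32 now selects TPB20 for compute 2.x instead of aborting with the old "Device SM 3.0 or more recent required!" error.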