From 2520f9a3889b70b29c81796d48abf2a6d9f33678 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Wed, 10 Aug 2016 14:23:18 +0200 Subject: [PATCH] lyra2: import latest nanashi code (v1) --- README.txt | 1 + lyra2/cuda_lyra2.cu | 603 +++++++++++++++++++++++--------- lyra2/cuda_lyra2_sm2.cuh | 7 +- lyra2/cuda_lyra2_sm5.cuh | 693 +++++++++++++++++++++++++++++++++++++ lyra2/cuda_lyra2_vectors.h | 2 +- lyra2/lyra2RE.cu | 63 ++-- 6 files changed, 1188 insertions(+), 181 deletions(-) create mode 100644 lyra2/cuda_lyra2_sm5.cuh diff --git a/README.txt b/README.txt index 20c4b7c..8e64c20 100644 --- a/README.txt +++ b/README.txt @@ -247,6 +247,7 @@ features. Aug. 10th 2016 v1.8.1 SIA Blake2-B Algo (getwork over stratum for Suprnova) SIA Nanopool RPC (getwork over http) + Update also the older lyra2 with Nanashi version July 20th 2016 v1.8.0 Pascal support with cuda 8 diff --git a/lyra2/cuda_lyra2.cu b/lyra2/cuda_lyra2.cu index 88d4bce..a8fbb3c 100644 --- a/lyra2/cuda_lyra2.cu +++ b/lyra2/cuda_lyra2.cu @@ -1,40 +1,214 @@ /** - * Lyra2 (v1) cuda implementation based on djm34 work - SM 5/5.2 - * tpruvot@github 2015 + * Lyra2 (v1) cuda implementation based on djm34 work + * tpruvot@github 2015, Nanashi 08/2016 (from 1.8-r2) */ #include #include -#define TPB50 16 -#define TPB52 8 +#define TPB52 32 #include "cuda_lyra2_sm2.cuh" +#include "cuda_lyra2_sm5.cuh" #ifdef __INTELLISENSE__ /* just for vstudio code colors */ -#define __CUDA_ARCH__ 500 +#define __CUDA_ARCH__ 520 #endif -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 500 +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ > 500 -#include "cuda_vector_uint2x4.h" +#include "cuda_lyra2_vectors.h" -#define memshift 3 +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +__device__ uint32_t __shfl(uint32_t a, uint32_t b, uint32_t c); +#endif +#define Nrow 8 #define Ncol 8 -#define NcolMask 0x7 +#define memshift 3 + +#define BUF_COUNT 0 + +__device__ uint2 *DMatrix; + +__device__ __forceinline__ void LD4S(uint2 res[3], const int row, const int col, const int thread, const int threads) +{ +#if BUF_COUNT != 8 + extern __shared__ uint2 shared_mem[]; + const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift; +#endif +#if BUF_COUNT != 0 + const int d0 = (memshift *(Ncol * row + col) * threads + thread)*blockDim.x + threadIdx.x; +#endif + +#if BUF_COUNT == 8 + #pragma unroll + for (int j = 0; j < 3; j++) + res[j] = *(DMatrix + d0 + j * threads * blockDim.x); +#elif BUF_COUNT == 0 + #pragma unroll + for (int j = 0; j < 3; j++) + res[j] = shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x]; +#else + if (row < BUF_COUNT) + { + #pragma unroll + for (int j = 0; j < 3; j++) + res[j] = *(DMatrix + d0 + j * threads * blockDim.x); + } + else + { + #pragma unroll + for (int j = 0; j < 3; j++) + res[j] = shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x]; + } +#endif +} + +__device__ __forceinline__ void ST4S(const int row, const int col, const uint2 data[3], const int thread, const int threads) +{ +#if BUF_COUNT != 8 + extern __shared__ uint2 shared_mem[]; + const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift; +#endif +#if BUF_COUNT != 0 + const int d0 = (memshift *(Ncol * row + col) * threads + thread)*blockDim.x + threadIdx.x; +#endif + +#if BUF_COUNT == 8 + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + d0 + j * threads * blockDim.x) = data[j]; + +#elif BUF_COUNT == 0 + #pragma unroll + for (int j = 0; j < 3; j++) + shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data[j]; + +#else + if (row < BUF_COUNT) + { + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + d0 + j * threads * blockDim.x) = data[j]; + } + else + { + #pragma unroll + for (int j = 0; j < 3; j++) + shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data[j]; + } +#endif +} + +#if __CUDA_ARCH__ >= 300 +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + return __shfl(a, b, c); +} -__device__ uint2x4* DMatrix; +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c)); +} + +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + a1 = WarpShuffle(a1, b1, c); + a2 = WarpShuffle(a2, b2, c); + a3 = WarpShuffle(a3, b3, c); +} + +#else +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + uint32_t *_ptr = (uint32_t*)shared_mem; + + __threadfence_block(); + uint32_t buf = _ptr[thread]; + + _ptr[thread] = a; + __threadfence_block(); + uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + _ptr[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a; + __threadfence_block(); + uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + + __threadfence_block(); + return result; +} +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a1; + __threadfence_block(); + a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a2; + __threadfence_block(); + a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a3; + __threadfence_block(); + a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + __threadfence_block(); +} + +#endif + +#if __CUDA_ARCH__ > 500 || !defined(__CUDA_ARCH) static __device__ __forceinline__ void Gfunc(uint2 &a, uint2 &b, uint2 &c, uint2 &d) { - a += b; d ^= a; d = SWAPUINT2(d); - c += d; b ^= c; b = ROR2(b, 24); - a += b; d ^= a; d = ROR2(d, 16); + a += b; uint2 tmp = d; d.y = a.x ^ tmp.x; d.x = a.y ^ tmp.y; + c += d; b ^= c; b = ROR24(b); + a += b; d ^= a; d = ROR16(d); c += d; b ^= c; b = ROR2(b, 63); } +#endif + +__device__ __forceinline__ void round_lyra(uint2 s[4]) +{ + Gfunc(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3, 4); + Gfunc(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1, 4); +} static __device__ __forceinline__ void round_lyra(uint2x4* s) @@ -50,21 +224,24 @@ void round_lyra(uint2x4* s) } static __device__ __forceinline__ -void reduceDuplex(uint2x4 state[4], uint32_t thread) +void reduceDuplex(uint2 state[4], uint32_t thread, const uint32_t threads) { - uint2x4 state1[3]; + uint2 state1[3]; - const uint32_t ps1 = (256 * thread); - const uint32_t ps2 = (memshift * 7 + memshift * 8 + 256 * thread); +#if __CUDA_ARCH__ > 500 +#pragma unroll +#endif + for (int i = 0; i < Nrow; i++) + { + ST4S(0, Ncol - i - 1, state, thread, threads); + + round_lyra(state); + } #pragma unroll 4 - for (int i = 0; i < 8; i++) + for (int i = 0; i < Nrow; i++) { - const uint32_t s1 = ps1 + i*memshift; - const uint32_t s2 = ps2 - i*memshift; - - for (int j = 0; j < 3; j++) - state1[j] = __ldg4(&(DMatrix+s1)[j]); + LD4S(state1, 0, i, thread, threads); for (int j = 0; j < 3; j++) state[j] ^= state1[j]; @@ -72,208 +249,324 @@ void reduceDuplex(uint2x4 state[4], uint32_t thread) for (int j = 0; j < 3; j++) state1[j] ^= state[j]; - for (int j = 0; j < 3; j++) - (DMatrix + s2)[j] = state1[j]; + ST4S(1, Ncol - i - 1, state1, thread, threads); } } static __device__ __forceinline__ -void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2x4 state[4], uint32_t thread) +void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], uint32_t thread, const uint32_t threads) { - uint2x4 state1[3], state2[3]; - - const uint32_t ps1 = ( memshift*8 * rowIn + 256 * thread); - const uint32_t ps2 = ( memshift*8 * rowInOut + 256 * thread); - const uint32_t ps3 = (memshift*7 + memshift*8 * rowOut + 256 * thread); + uint2 state1[3], state2[3]; #pragma unroll 1 - for (int i = 0; i < 8; i++) + for (int i = 0; i < Nrow; i++) { - const uint32_t s1 = ps1 + i*memshift; - const uint32_t s2 = ps2 + i*memshift; + LD4S(state1, rowIn, i, thread, threads); + LD4S(state2, rowInOut, i, thread, threads); for (int j = 0; j < 3; j++) - state1[j]= __ldg4(&(DMatrix + s1)[j]); + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll for (int j = 0; j < 3; j++) - state2[j]= __ldg4(&(DMatrix + s2)[j]); - for (int j = 0; j < 3; j++) { - uint2x4 tmp = state1[j] + state2[j]; - state[j] ^= tmp; + state1[j] ^= state[j]; + + ST4S(rowOut, Ncol - i - 1, state1, thread, threads); + + //一個手前のスレッドからデータを貰う(同時に一個先のスレッドにデータを送る) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } else { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; } + ST4S(rowInOut, i, state2, thread, threads); + } +} + +static __device__ __forceinline__ +void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads) +{ + for (int i = 0; i < Nrow; i++) + { + uint2 state1[3], state2[3]; + + LD4S(state1, rowIn, i, thread, threads); + LD4S(state2, rowInOut, i, thread, threads); + +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + round_lyra(state); - for (int j = 0; j < 3; j++) { - const uint32_t s3 = ps3 - i*memshift; - state1[j] ^= state[j]; - (DMatrix + s3)[j] = state1[j]; + //一個手前のスレッドからデータを貰う(同時に一個先のスレッドにデータを送る) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; } - ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + ST4S(rowInOut, i, state2, thread, threads); - for (int j = 0; j < 11; j++) - ((uint2*)state2)[j+1] ^= ((uint2*)state)[j]; + LD4S(state1, rowOut, i, thread, threads); +#pragma unroll for (int j = 0; j < 3; j++) - (DMatrix + s2)[j] = state2[j]; + state1[j] ^= state[j]; + + ST4S(rowOut, i, state1, thread, threads); } } static __device__ __forceinline__ -void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uint2x4* state, const uint32_t thread) +void reduceDuplexRowt_8(const int rowInOut, uint2* state, const uint32_t thread, const uint32_t threads) { - const uint32_t ps1 = (memshift * 8 * rowIn + 256 * thread); - const uint32_t ps2 = (memshift * 8 * rowInOut + 256 * thread); - const uint32_t ps3 = (memshift * 8 * rowOut + 256 * thread); + uint2 state1[3], state2[3], last[3]; - #pragma unroll 1 - for (int i = 0; i < 8; i++) - { - uint2x4 state1[3], state2[3]; + LD4S(state1, 2, 0, thread, threads); + LD4S(last, rowInOut, 0, thread, threads); - const uint32_t s1 = ps1 + i*memshift; - const uint32_t s2 = ps2 + i*memshift; + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + last[j]; - for (int j = 0; j < 3; j++) { - state1[j] = __ldg4(&(DMatrix + s1)[j]); - state2[j] = __ldg4(&(DMatrix + s2)[j]); - } + round_lyra(state); - #pragma unroll - for (int j = 0; j < 3; j++) { - state1[j] += state2[j]; - state[j] ^= state1[j]; - } + //一個手前のスレッドからデータを貰う(同時に一個先のスレッドにデータを送る) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); - round_lyra(state); + if (threadIdx.x == 0) + { + last[0] ^= Data2; + last[1] ^= Data0; + last[2] ^= Data1; + } else { + last[0] ^= Data0; + last[1] ^= Data1; + last[2] ^= Data2; + } - ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + if (rowInOut == 5) + { + #pragma unroll + for (int j = 0; j < 3; j++) + last[j] ^= state[j]; + } - for (int j = 0; j < 11; j++) - ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; + for (int i = 1; i < Nrow; i++) + { + LD4S(state1, 2, i, thread, threads); + LD4S(state2, rowInOut, i, thread, threads); - if (rowInOut == rowOut) { - for (int j = 0; j < 3; j++) { - state2[j] ^= state[j]; - (DMatrix + s2)[j]=state2[j]; - } - } else { - const uint32_t s3 = ps3 + i*memshift; - for (int j = 0; j < 3; j++) { - (DMatrix + s2)[j] = state2[j]; - (DMatrix + s3)[j] ^= state[j]; - } - } + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); } + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= last[j]; } -#if __CUDA_ARCH__ == 500 -__global__ __launch_bounds__(TPB50, 1) -#else -__global__ __launch_bounds__(TPB52, 2) -#endif -void lyra2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +__constant__ uint2x4 blake2b_IV[2] = { + 0xf3bcc908lu, 0x6a09e667lu, + 0x84caa73blu, 0xbb67ae85lu, + 0xfe94f82blu, 0x3c6ef372lu, + 0x5f1d36f1lu, 0xa54ff53alu, + 0xade682d1lu, 0x510e527flu, + 0x2b3e6c1flu, 0x9b05688clu, + 0xfb41bd6blu, 0x1f83d9ablu, + 0x137e2179lu, 0x5be0cd19lu +}; + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - const uint2x4 blake2b_IV[2] = { - {{ 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a }}, - {{ 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 }} - }; - if (thread < threads) { uint2x4 state[4]; - ((uint2*)state)[0] = __ldg(&g_hash[thread]); - ((uint2*)state)[1] = __ldg(&g_hash[thread + threads]); - ((uint2*)state)[2] = __ldg(&g_hash[thread + threads*2]); - ((uint2*)state)[3] = __ldg(&g_hash[thread + threads*3]); - - state[1] = state[0]; + state[0].x = state[1].x = __ldg(&g_hash[thread + threads * 0]); + state[0].y = state[1].y = __ldg(&g_hash[thread + threads * 1]); + state[0].z = state[1].z = __ldg(&g_hash[thread + threads * 2]); + state[0].w = state[1].w = __ldg(&g_hash[thread + threads * 3]); state[2] = blake2b_IV[0]; state[3] = blake2b_IV[1]; for (int i = 0; i<24; i++) round_lyra(state); //because 12 is not enough - const uint32_t ps1 = (memshift * 7 + 256 * thread); - for (int i = 0; i < 8; i++) - { - const uint32_t s1 = ps1 - memshift * i; - for (int j = 0; j < 3; j++) - (DMatrix + s1)[j] = (state)[j]; - round_lyra(state); - } + ((uint2x4*)DMatrix)[threads * 0 + thread] = state[0]; + ((uint2x4*)DMatrix)[threads * 1 + thread] = state[1]; + ((uint2x4*)DMatrix)[threads * 2 + thread] = state[2]; + ((uint2x4*)DMatrix)[threads * 3 + thread] = state[3]; + } +} - reduceDuplex(state, thread); - - reduceDuplexRowSetup(1, 0, 2, state, thread); - reduceDuplexRowSetup(2, 1, 3, state, thread); - reduceDuplexRowSetup(3, 0, 4, state, thread); - reduceDuplexRowSetup(4, 3, 5, state, thread); - reduceDuplexRowSetup(5, 2, 6, state, thread); - reduceDuplexRowSetup(6, 1, 7, state, thread); - - uint32_t rowa = state[0].x.x & 7; - reduceDuplexRowt(7, rowa, 0, state, thread); - rowa = state[0].x.x & 7; - reduceDuplexRowt(0, rowa, 3, state, thread); - rowa = state[0].x.x & 7; - reduceDuplexRowt(3, rowa, 6, state, thread); - rowa = state[0].x.x & 7; - reduceDuplexRowt(6, rowa, 1, state, thread); - rowa = state[0].x.x & 7; - reduceDuplexRowt(1, rowa, 4, state, thread); - rowa = state[0].x.x & 7; - reduceDuplexRowt(4, rowa, 7, state, thread); - rowa = state[0].x.x & 7; - reduceDuplexRowt(7, rowa, 2, state, thread); - rowa = state[0].x.x & 7; - reduceDuplexRowt(2, rowa, 5, state, thread); - - const int32_t shift = (memshift * 8 * rowa + 256 * thread); +__global__ +__launch_bounds__(TPB52, 1) +void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) +{ + const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y; - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= __ldg4(&(DMatrix + shift)[j]); + if (thread < threads) + { + uint2 state[4]; + state[0] = __ldg(&DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x]); + state[1] = __ldg(&DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x]); + state[2] = __ldg(&DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x]); + state[3] = __ldg(&DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x]); + + reduceDuplex(state, thread, threads); + reduceDuplexRowSetup(1, 0, 2, state, thread, threads); + reduceDuplexRowSetup(2, 1, 3, state, thread, threads); + reduceDuplexRowSetup(3, 0, 4, state, thread, threads); + reduceDuplexRowSetup(4, 3, 5, state, thread, threads); + reduceDuplexRowSetup(5, 2, 6, state, thread, threads); + reduceDuplexRowSetup(6, 1, 7, state, thread, threads); + + uint32_t rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(7, rowa, 0, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(0, rowa, 3, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(3, rowa, 6, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(6, rowa, 1, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(1, rowa, 4, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(4, rowa, 7, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(7, rowa, 2, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt_8(rowa, state, thread, threads); + + DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x] = state[0]; + DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x] = state[1]; + DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x] = state[2]; + DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x] = state[3]; + } +} + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +{ + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + + uint28 state[4]; + + if (thread < threads) + { + state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]); + state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]); + state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]); + state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]); for (int i = 0; i < 12; i++) round_lyra(state); - g_hash[thread] = ((uint2*)state)[0]; - g_hash[thread + threads] = ((uint2*)state)[1]; - g_hash[thread + threads*2] = ((uint2*)state)[2]; - g_hash[thread + threads*3] = ((uint2*)state)[3]; - } + g_hash[thread + threads * 0] = state[0].x; + g_hash[thread + threads * 1] = state[0].y; + g_hash[thread + threads * 2] = state[0].z; + g_hash[thread + threads * 3] = state[0].w; + + } //thread } #else +#if __CUDA_ARCH__ < 500 + /* for unsupported SM arch */ __device__ void* DMatrix; -__global__ void lyra2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} +#endif +__global__ void lyra2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {} +__global__ void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} #endif __host__ -void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t* d_matrix) +void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix) { - cuda_get_arch(thr_id); + // just assign the device pointer allocated in main loop cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice); } __host__ -void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order) +void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, bool gtx750ti) { int dev_id = device_map[thr_id % MAX_GPUS]; + uint32_t tpb = TPB52; - if (device_sm[dev_id] == 500) tpb = TPB50; - if (device_sm[dev_id] == 350) tpb = TPB30; // to enhance (or not) - if (device_sm[dev_id] <= 300) tpb = TPB30; - dim3 grid((threads + tpb - 1) / tpb); - dim3 block(tpb); + if (cuda_arch[dev_id] >= 520) tpb = TPB52; + else if (cuda_arch[dev_id] >= 500) tpb = TPB50; + else if (cuda_arch[dev_id] >= 200) tpb = TPB20; - if (device_sm[dev_id] >= 500) - lyra2_gpu_hash_32 <<< grid, block >>> (threads, startNounce, (uint2*)d_hash); - else - lyra2_gpu_hash_32_sm2 <<< grid, block >>> (threads, startNounce, d_hash); + dim3 grid1((threads * 4 + tpb - 1) / tpb); + dim3 block1(4, tpb >> 2); + + dim3 grid2((threads + 64 - 1) / 64); + dim3 block2(64); + dim3 grid3((threads + tpb - 1) / tpb); + dim3 block3(tpb); + + if (cuda_arch[dev_id] >= 520) + { + lyra2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash); + + lyra2_gpu_hash_32_2 <<< grid1, block1, 24 * (8 - 0) * sizeof(uint2) * tpb >>> (threads, startNounce, d_hash); + + lyra2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash); + } + else if (cuda_arch[dev_id] >= 500) + { + size_t shared_mem = 0; + + if (gtx750ti) + // 8Warpに調整のため、8192バイト確保する + shared_mem = 8192; + else + // 10Warpに調整のため、6144バイト確保する + shared_mem = 6144; + + lyra2_gpu_hash_32_1_sm5 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash); + + lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, startNounce, (uint2*)d_hash); + + lyra2_gpu_hash_32_3_sm5 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash); + } + else + lyra2_gpu_hash_32_sm2 <<< grid3, block3 >>> (threads, startNounce, d_hash); } diff --git a/lyra2/cuda_lyra2_sm2.cuh b/lyra2/cuda_lyra2_sm2.cuh index 7998d17..94e8756 100644 --- a/lyra2/cuda_lyra2_sm2.cuh +++ b/lyra2/cuda_lyra2_sm2.cuh @@ -3,15 +3,16 @@ #ifdef __INTELLISENSE__ /* just for vstudio code colors */ #undef __CUDA_ARCH__ -#define __CUDA_ARCH__ 300 +#define __CUDA_ARCH__ 500 #endif #include "cuda_helper.h" #define TPB30 160 +#define TPB20 160 #if (__CUDA_ARCH__ >= 200 && __CUDA_ARCH__ <= 350) || !defined(__CUDA_ARCH__) -__constant__ static uint2 blake2b_IV[8] = { +__constant__ static uint2 blake2b_IV_sm2[8] = { { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, @@ -149,7 +150,7 @@ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_h #pragma unroll for (int i = 0; i<8; i++) { - state[i + 8] = blake2b_IV[i]; + state[i + 8] = blake2b_IV_sm2[i]; } // blake2blyra x2 diff --git a/lyra2/cuda_lyra2_sm5.cuh b/lyra2/cuda_lyra2_sm5.cuh new file mode 100644 index 0000000..82fd380 --- /dev/null +++ b/lyra2/cuda_lyra2_sm5.cuh @@ -0,0 +1,693 @@ +#include + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#undef __CUDA_ARCH__ +#define __CUDA_ARCH__ 500 +#endif + +#include "cuda_helper.h" + +#define TPB50 32 + +#if __CUDA_ARCH__ == 500 +#include "cuda_lyra2_vectors.h" + +#define Nrow 8 +#define Ncol 8 +#define memshift 3 + +__device__ uint2 *DMatrix; + +__device__ __forceinline__ uint2 LD4S(const int index) +{ + extern __shared__ uint2 shared_mem[]; + + return shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x]; +} + +__device__ __forceinline__ void ST4S(const int index, const uint2 data) +{ + extern __shared__ uint2 shared_mem[]; + + shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data; +} + +#if __CUDA_ARCH__ == 300 +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + return __shfl(a, b, c); +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c)); +} + +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + a1 = WarpShuffle(a1, b1, c); + a2 = WarpShuffle(a2, b2, c); + a3 = WarpShuffle(a3, b3, c); +} + +#else +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + uint32_t *_ptr = (uint32_t*)shared_mem; + + __threadfence_block(); + uint32_t buf = _ptr[thread]; + + _ptr[thread] = a; + __threadfence_block(); + uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + _ptr[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a; + __threadfence_block(); + uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a1; + __threadfence_block(); + a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a2; + __threadfence_block(); + a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a3; + __threadfence_block(); + a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + __threadfence_block(); +} + +#endif + +#if __CUDA_ARCH__ >= 300 +static __device__ __forceinline__ +void Gfunc(uint2 &a, uint2 &b, uint2 &c, uint2 &d) +{ + a += b; d ^= a; d = SWAPUINT2(d); + c += d; b ^= c; b = ROR24(b); //ROR2(b, 24); + a += b; d ^= a; d = ROR16(d); + c += d; b ^= c; b = ROR2(b, 63); +} +#endif + +__device__ __forceinline__ void round_lyra(uint2 s[4]) +{ + Gfunc(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3, 4); + Gfunc(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1, 4); +} + +static __device__ __forceinline__ +void round_lyra(uint2x4* s) +{ + Gfunc(s[0].x, s[1].x, s[2].x, s[3].x); + Gfunc(s[0].y, s[1].y, s[2].y, s[3].y); + Gfunc(s[0].z, s[1].z, s[2].z, s[3].z); + Gfunc(s[0].w, s[1].w, s[2].w, s[3].w); + Gfunc(s[0].x, s[1].y, s[2].z, s[3].w); + Gfunc(s[0].y, s[1].z, s[2].w, s[3].x); + Gfunc(s[0].z, s[1].w, s[2].x, s[3].y); + Gfunc(s[0].w, s[1].x, s[2].y, s[3].z); +} + +static __device__ __forceinline__ +void reduceDuplexV5(uint2 state[4], const uint32_t thread, const uint32_t threads) +{ + uint2 state1[3], state2[3]; + + const uint32_t ps0 = (memshift * Ncol * 0 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps1 = (memshift * Ncol * 1 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps2 = (memshift * Ncol * 2 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps3 = (memshift * Ncol * 3 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps4 = (memshift * Ncol * 4 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps5 = (memshift * Ncol * 5 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps6 = (memshift * Ncol * 6 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps7 = (memshift * Ncol * 7 * threads + thread)*blockDim.x + threadIdx.x; + + for (int i = 0; i < 8; i++) + { + const uint32_t s0 = memshift * Ncol * 0 + (Ncol - 1 - i) * memshift; + #pragma unroll + for (int j = 0; j < 3; j++) + ST4S(s0 + j, state[j]); + round_lyra(state); + } + + for (int i = 0; i < 8; i++) + { + const uint32_t s0 = memshift * Ncol * 0 + i * memshift; + const uint32_t s1 = ps1 + (7 - i)*memshift* threads*blockDim.x; + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = LD4S(s0 + j); + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s1 + j*threads*blockDim.x) = state1[j] ^ state[j]; + } + + // 1, 0, 2 + for (int i = 0; i < 8; i++) + { + const uint32_t s0 = memshift * Ncol * 0 + i * memshift; + const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x; + const uint32_t s2 = ps2 + (7 - i)*memshift* threads*blockDim.x; + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s1 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = LD4S(s0 + j); + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s2 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + //一個手前のスレッドからデータを貰う(同時に一個先のスレッドにデータを送る) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + ST4S(s0 + j, state2[j]); + } + + // 2, 1, 3 + for (int i = 0; i < 8; i++) + { + const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x; + const uint32_t s2 = ps2 + i * memshift* threads*blockDim.x; + const uint32_t s3 = ps3 + (7 - i)*memshift* threads*blockDim.x; + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s2 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = *(DMatrix + s1 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s3 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + //一個手前のスレッドからデータを貰う(同時に一個先のスレッドにデータを送る) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } else { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s1 + j*threads*blockDim.x) = state2[j]; + } + + // 3, 0, 4 + for (int i = 0; i < 8; i++) + { + const uint32_t ls0 = memshift * Ncol * 0 + i * memshift; + const uint32_t s0 = ps0 + i * memshift* threads*blockDim.x; + const uint32_t s3 = ps3 + i * memshift* threads*blockDim.x; + const uint32_t s4 = ps4 + (7 - i)*memshift* threads*blockDim.x; + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s3 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = LD4S(ls0 + j); + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s4 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + //一個手前のスレッドからデータを貰う(同時に一個先のスレッドにデータを送る) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } else { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s0 + j*threads*blockDim.x) = state2[j]; + } + + // 4, 3, 5 + for (int i = 0; i < 8; i++) + { + const uint32_t s3 = ps3 + i * memshift* threads*blockDim.x; + const uint32_t s4 = ps4 + i * memshift* threads*blockDim.x; + const uint32_t s5 = ps5 + (7 - i)*memshift* threads*blockDim.x; + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s4 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = *(DMatrix + s3 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s5 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + //一個手前のスレッドからデータを貰う(同時に一個先のスレッドにデータを送る) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s3 + j*threads*blockDim.x) = state2[j]; + } + + // 5, 2, 6 + for (int i = 0; i < 8; i++) + { + const uint32_t s2 = ps2 + i * memshift* threads*blockDim.x; + const uint32_t s5 = ps5 + i * memshift* threads*blockDim.x; + const uint32_t s6 = ps6 + (7 - i)*memshift* threads*blockDim.x; + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s5 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = *(DMatrix + s2 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s6 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + //一個手前のスレッドからデータを貰う(同時に一個先のスレッドにデータを送る) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s2 + j*threads*blockDim.x) = state2[j]; + } + + // 6, 1, 7 + for (int i = 0; i < 8; i++) + { + const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x; + const uint32_t s6 = ps6 + i * memshift* threads*blockDim.x; + const uint32_t s7 = ps7 + (7 - i)*memshift* threads*blockDim.x; + #pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s6 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = *(DMatrix + s1 + j*threads*blockDim.x); + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s7 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + //一個手前のスレッドからデータを貰う(同時に一個先のスレッドにデータを送る) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } else { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s1 + j*threads*blockDim.x) = state2[j]; + } +} + +static __device__ __forceinline__ +void reduceDuplexRowV50(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads) +{ + const uint32_t ps1 = (memshift * Ncol * rowIn*threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps2 = (memshift * Ncol * rowInOut *threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps3 = (memshift * Ncol * rowOut*threads + thread)*blockDim.x + threadIdx.x; + + #pragma unroll 1 + for (int i = 0; i < 8; i++) + { + uint2 state1[3], state2[3]; + + const uint32_t s1 = ps1 + i*memshift*threads *blockDim.x; + const uint32_t s2 = ps2 + i*memshift*threads *blockDim.x; + const uint32_t s3 = ps3 + i*memshift*threads *blockDim.x; + + #pragma unroll + for (int j = 0; j < 3; j++) { + state1[j] = *(DMatrix + s1 + j*threads*blockDim.x); + state2[j] = *(DMatrix + s2 + j*threads*blockDim.x); + } + + #pragma unroll + for (int j = 0; j < 3; j++) { + state1[j] += state2[j]; + state[j] ^= state1[j]; + } + + round_lyra(state); + + //一個手前のスレッドからデータを貰う(同時に一個先のスレッドにデータを送る) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } else { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + #pragma unroll + for (int j = 0; j < 3; j++) + { + *(DMatrix + s2 + j*threads*blockDim.x) = state2[j]; + *(DMatrix + s3 + j*threads*blockDim.x) ^= state[j]; + } + } +} + +static __device__ __forceinline__ +void reduceDuplexRowV50_8(const int rowInOut, uint2 state[4], const uint32_t thread, const uint32_t threads) +{ + const uint32_t ps1 = (memshift * Ncol * 2*threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps2 = (memshift * Ncol * rowInOut *threads + thread)*blockDim.x + threadIdx.x; + // const uint32_t ps3 = (memshift * Ncol * 5*threads + thread)*blockDim.x + threadIdx.x; + + uint2 state1[3], last[3]; + + #pragma unroll + for (int j = 0; j < 3; j++) { + state1[j] = *(DMatrix + ps1 + j*threads*blockDim.x); + last[j] = *(DMatrix + ps2 + j*threads*blockDim.x); + } + + #pragma unroll + for (int j = 0; j < 3; j++) { + state1[j] += last[j]; + state[j] ^= state1[j]; + } + + round_lyra(state); + + //一個手前のスレッドからデータを貰う(同時に一個先のスレッドにデータを送る) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + last[0] ^= Data2; + last[1] ^= Data0; + last[2] ^= Data1; + } else { + last[0] ^= Data0; + last[1] ^= Data1; + last[2] ^= Data2; + } + + if (rowInOut == 5) + { + #pragma unroll + for (int j = 0; j < 3; j++) + last[j] ^= state[j]; + } + + for (int i = 1; i < 8; i++) + { + const uint32_t s1 = ps1 + i*memshift*threads *blockDim.x; + const uint32_t s2 = ps2 + i*memshift*threads *blockDim.x; + + #pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= *(DMatrix + s1 + j*threads*blockDim.x) + *(DMatrix + s2 + j*threads*blockDim.x); + + round_lyra(state); + } + + +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= last[j]; + +} + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + const uint2x4 blake2b_IV[2] = { + { { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } }, + { { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } } + }; + + if (thread < threads) + { + uint2x4 state[4]; + + ((uint2*)state)[0] = __ldg(&g_hash[thread]); + ((uint2*)state)[1] = __ldg(&g_hash[thread + threads]); + ((uint2*)state)[2] = __ldg(&g_hash[thread + threads * 2]); + ((uint2*)state)[3] = __ldg(&g_hash[thread + threads * 3]); + + state[1] = state[0]; + state[2] = blake2b_IV[0]; + state[3] = blake2b_IV[1]; + + for (int i = 0; i < 24; i++) + round_lyra(state); //because 12 is not enough + + ((uint2x4*)DMatrix)[0 * threads + thread] = state[0]; + ((uint2x4*)DMatrix)[1 * threads + thread] = state[1]; + ((uint2x4*)DMatrix)[2 * threads + thread] = state[2]; + ((uint2x4*)DMatrix)[3 * threads + thread] = state[3]; + } +} + +__global__ __launch_bounds__(TPB50, 1) +void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +{ + const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y); + + if (thread < threads) + { + uint2 state[4]; + + state[0] = __ldg(&DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x]); + state[1] = __ldg(&DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x]); + state[2] = __ldg(&DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x]); + state[3] = __ldg(&DMatrix[(3 * threads + thread)*blockDim.x + threadIdx.x]); + + reduceDuplexV5(state, thread, threads); + + uint32_t rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(7, rowa, 0, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(0, rowa, 3, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(3, rowa, 6, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(6, rowa, 1, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(1, rowa, 4, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(4, rowa, 7, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(7, rowa, 2, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50_8(rowa, state, thread, threads); + + DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x] = state[0]; + DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x] = state[1]; + DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x] = state[2]; + DMatrix[(3 * threads + thread)*blockDim.x + threadIdx.x] = state[3]; + } +} + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + if (thread < threads) + { + uint2x4 state[4]; + + state[0] = __ldg4(&((uint2x4*)DMatrix)[0 * threads + thread]); + state[1] = __ldg4(&((uint2x4*)DMatrix)[1 * threads + thread]); + state[2] = __ldg4(&((uint2x4*)DMatrix)[2 * threads + thread]); + state[3] = __ldg4(&((uint2x4*)DMatrix)[3 * threads + thread]); + + for (int i = 0; i < 12; i++) + round_lyra(state); + + g_hash[thread] = ((uint2*)state)[0]; + g_hash[thread + threads] = ((uint2*)state)[1]; + g_hash[thread + threads * 2] = ((uint2*)state)[2]; + g_hash[thread + threads * 3] = ((uint2*)state)[3]; + } +} + +#else +/* if __CUDA_ARCH__ != 500 .. host */ +__global__ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} +#endif diff --git a/lyra2/cuda_lyra2_vectors.h b/lyra2/cuda_lyra2_vectors.h index 1733b96..d69efa2 100644 --- a/lyra2/cuda_lyra2_vectors.h +++ b/lyra2/cuda_lyra2_vectors.h @@ -13,7 +13,7 @@ #include "cuda_helper.h" #if __CUDA_ARCH__ < 300 -#define __shfl(x, y) (x) +#define __shfl(x, y, z) (x) #endif #if __CUDA_ARCH__ < 320 && !defined(__ldg4) diff --git a/lyra2/lyra2RE.cu b/lyra2/lyra2RE.cu index d74bb16..9cd9ccb 100644 --- a/lyra2/lyra2RE.cu +++ b/lyra2/lyra2RE.cu @@ -23,7 +23,7 @@ extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNon extern void skein256_cpu_init(int thr_id, uint32_t threads); extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix); -extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); +extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, bool gtx750ti); extern void groestl256_cpu_init(int thr_id, uint32_t threads); extern void groestl256_cpu_free(int thr_id); @@ -79,36 +79,55 @@ extern "C" void lyra2re_hash(void *state, const void *input) } static bool init[MAX_GPUS] = { 0 }; +static uint32_t throughput[MAX_GPUS] = { 0 }; extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) { uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; - int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 17 : 16; - uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4; - if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) - ptarget[7] = 0x000f; + ptarget[7] = 0x00ff; + static __thread bool gtx750ti; if (!init[thr_id]) { - cudaSetDevice(device_map[thr_id]); + int dev_id = device_map[thr_id]; + cudaSetDevice(dev_id); CUDA_LOG_ERROR(); - blake256_cpu_init(thr_id, throughput); - keccak256_cpu_init(thr_id,throughput); - skein256_cpu_init(thr_id, throughput); - groestl256_cpu_init(thr_id, throughput); + int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 17 : 16; + if (device_sm[device_map[thr_id]] == 500) intensity = 15; + int temp = intensity; + throughput[thr_id] = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4; + if (init[thr_id]) throughput[thr_id] = min(throughput[thr_id], max_nonce - first_nonce); - // DMatrix - cudaMalloc(&d_matrix[thr_id], (size_t)16 * 8 * 8 * sizeof(uint64_t) * throughput); - lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]); + cudaDeviceProp props; + cudaGetDeviceProperties(&props, dev_id); - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput)); + if (strstr(props.name, "750 Ti")) gtx750ti = true; + else gtx750ti = false; + + blake256_cpu_init(thr_id, throughput[thr_id]); + keccak256_cpu_init(thr_id, throughput[thr_id]); + skein256_cpu_init(thr_id, throughput[thr_id]); + groestl256_cpu_init(thr_id, throughput[thr_id]); + + if (device_sm[dev_id] >= 500) + { + size_t matrix_sz = device_sm[dev_id] > 500 ? sizeof(uint64_t) * 4 * 4 : sizeof(uint64_t) * 8 * 8 * 3 * 4; + CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput[thr_id])); + lyra2_cpu_init(thr_id, throughput[thr_id], d_matrix[thr_id]); + } + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput[thr_id])); init[thr_id] = true; + if (temp != intensity){ + gpulog(LOG_INFO, thr_id, "Intensity set to %u, %u cuda threads", + intensity, throughput[thr_id]); + } } uint32_t _ALIGN(128) endiandata[20]; @@ -122,15 +141,15 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, int order = 0; uint32_t foundNonce; - blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + blake256_cpu_hash_80(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++); + keccak256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++); + lyra2_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], gtx750ti); + skein256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++); TRACE("S") - *hashes_done = pdata[19] - first_nonce + throughput; + *hashes_done = pdata[19] - first_nonce + throughput[thr_id]; - foundNonce = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + foundNonce = groestl256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++); if (foundNonce != UINT32_MAX) { uint32_t _ALIGN(64) vhash64[8]; @@ -162,11 +181,11 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, } } - if ((uint64_t)throughput + pdata[19] >= max_nonce) { + if ((uint64_t)throughput[thr_id] + pdata[19] >= max_nonce) { pdata[19] = max_nonce; break; } - pdata[19] += throughput; + pdata[19] += throughput[thr_id]; } while (!work_restart[thr_id].restart);