From b3adebdf2abd51b1496b520e619d056fa2a7abdb Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot
Date: Sun, 4 Oct 2015 19:25:03 +0200
Subject: [PATCH] lyra2v2: improve speed on SM 5.2 (CUDA 6.5) with sp unrolls

Slightly reduces the 750 Ti speed, but greatly improves the 9xx speed.

Compatibility for SM 3/3.5 is kept in a second file.

Note: with this code and CUDA 7.5, the speed gain is reversed... may be
"reverted" soon.
---
 ccminer.vcxproj            |   1 +
 ccminer.vcxproj.filters    |   5 +-
 lyra2/cuda_lyra2v2.cu      | 429 ++++++++++++-------------------------
 lyra2/cuda_lyra2v2_sm3.cuh | 253 ++++++++++++++++++++++
 4 files changed, 397 insertions(+), 291 deletions(-)
 create mode 100644 lyra2/cuda_lyra2v2_sm3.cuh

diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index 2baca0e..809a631 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -334,6 +334,7 @@
+
diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
index d4b6802..8e77e8d 100644
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@@ -610,6 +610,9 @@
       Source Files\CUDA
+
+      Source Files\CUDA
+
       Source Files\CUDA
@@ -674,4 +677,4 @@
       Ressources
-
+
diff --git a/lyra2/cuda_lyra2v2.cu b/lyra2/cuda_lyra2v2.cu
index ac48277..67ca417 100644
--- a/lyra2/cuda_lyra2v2.cu
+++ b/lyra2/cuda_lyra2v2.cu
@@ -6,66 +6,51 @@
 #define __CUDA_ARCH__ 500
 #endif
 
+#define TPB52 10
+#define TPB50 16
+
 #include "cuda_lyra2_vectors.h"
 
-#define TPB 16
+#include "cuda_lyra2v2_sm3.cuh"
+
+#ifndef __CUDA_ARCH__
+__device__ void *DMatrix;
+#endif
+
+#if __CUDA_ARCH__ >= 500
 
 #define Nrow 4
 #define Ncol 4
-
-#if __CUDA_ARCH__ < 500
-#define vectype ulonglong4
-#define u64type uint64_t
-#define memshift 4
-#elif __CUDA_ARCH__ == 500
 #define u64type uint2
 #define vectype uint28
 #define memshift 3
-#else
-#define u64type uint2
-#define vectype uint28
-#define memshift 3
-#endif
 
 __device__ vectype *DMatrix;
 
-#if __CUDA_ARCH__ >= 300
-
-#if __CUDA_ARCH__ >= 500
-static __device__ __forceinline__
-void Gfunc_v35(uint2 &a, uint2 &b, uint2 &c, uint2 &d)
+__device__ __forceinline__
+void Gfunc_v5(uint2 &a, uint2 &b, uint2 &c, uint2 &d)
 {
 	a += b; d ^= a; d = SWAPUINT2(d);
 	c += d; b ^= c; b = ROR24(b);
 	a += b; d ^= a; d = ROR16(d);
 	c += d; b ^= c; b = ROR2(b, 63);
 }
-#else
-static __device__ __forceinline__
-void Gfunc_v35(unsigned long long &a, unsigned long long &b, unsigned long long &c, unsigned long long &d)
-{
-	a += b; d ^= a; d = ROTR64(d, 32);
-	c += d; b ^= c; b = ROTR64(b, 24);
-	a += b; d ^= a; d = ROTR64(d, 16);
-	c += d; b ^= c; b = ROTR64(b, 63);
-}
-#endif
 
-static __device__ __forceinline__
-void round_lyra_v35(vectype* s)
+__device__ __forceinline__
+void round_lyra_v5(vectype* s)
 {
-	Gfunc_v35(s[0].x, s[1].x, s[2].x, s[3].x);
-	Gfunc_v35(s[0].y, s[1].y, s[2].y, s[3].y);
-	Gfunc_v35(s[0].z, s[1].z, s[2].z, s[3].z);
-	Gfunc_v35(s[0].w, s[1].w, s[2].w, s[3].w);
-
-	Gfunc_v35(s[0].x, s[1].y, s[2].z, s[3].w);
-	Gfunc_v35(s[0].y, s[1].z, s[2].w, s[3].x);
-	Gfunc_v35(s[0].z, s[1].w, s[2].x, s[3].y);
-	Gfunc_v35(s[0].w, s[1].x, s[2].y, s[3].z);
+	Gfunc_v5(s[0].x, s[1].x, s[2].x, s[3].x);
+	Gfunc_v5(s[0].y, s[1].y, s[2].y, s[3].y);
+	Gfunc_v5(s[0].z, s[1].z, s[2].z, s[3].z);
+	Gfunc_v5(s[0].w, s[1].w, s[2].w, s[3].w);
+
+	Gfunc_v5(s[0].x, s[1].y, s[2].z, s[3].w);
+	Gfunc_v5(s[0].y, s[1].z, s[2].w, s[3].x);
+	Gfunc_v5(s[0].z, s[1].w, s[2].x, s[3].y);
+	Gfunc_v5(s[0].w, s[1].x, s[2].y, s[3].z);
 }
 
-static __device__ __forceinline__
+__device__ __forceinline__
 void reduceDuplex(vectype state[4], uint32_t thread)
 {
 	vectype state1[3];
@@ -82,360 +67,217 @@ void reduceDuplex(vectype state[4], uint32_t
thread) for (int j = 0; j < 3; j++) state1[j] = __ldg4(&(DMatrix+s1)[j]); + #pragma unroll for (int j = 0; j < 3; j++) state[j] ^= state1[j]; - round_lyra_v35(state); + round_lyra_v5(state); #pragma unroll for (int j = 0; j < 3; j++) state1[j] ^= state[j]; + #pragma unroll for (int j = 0; j < 3; j++) (DMatrix + s2)[j] = state1[j]; } } -static __device__ __forceinline__ -void reduceDuplexV3(vectype state[4], uint32_t thread) +__device__ __forceinline__ +void reduceDuplex50(vectype state[4], uint32_t thread) { - vectype state1[3]; uint32_t ps1 = (Nrow * Ncol * memshift * thread); - uint32_t ps2 = (memshift * (Ncol - 1) * Nrow + memshift * 1 + Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * (Ncol - 1) + memshift * Ncol + Nrow * Ncol * memshift * thread); #pragma unroll 4 for (int i = 0; i < Ncol; i++) { - uint32_t s1 = ps1 + Nrow * i *memshift; - uint32_t s2 = ps2 - Nrow * i *memshift; - - for (int j = 0; j < 3; j++) - state1[j] = __ldg4(&(DMatrix + s1)[j]); + uint32_t s1 = ps1 + i*memshift; + uint32_t s2 = ps2 - i*memshift; + #pragma unroll for (int j = 0; j < 3; j++) - state[j] ^= state1[j]; - round_lyra_v35(state); + state[j] ^= __ldg4(&(DMatrix + s1)[j]); - for (int j = 0; j < 3; j++) - state1[j] ^= state[j]; + round_lyra_v5(state); + #pragma unroll for (int j = 0; j < 3; j++) - (DMatrix + s2)[j] = state1[j]; + (DMatrix + s2)[j] = __ldg4(&(DMatrix + s1)[j]) ^ state[j]; } } -static __device__ __forceinline__ +__device__ __forceinline__ void reduceDuplexRowSetupV2(const int rowIn, const int rowInOut, const int rowOut, vectype state[4], uint32_t thread) { - vectype state2[3],state1[3]; + vectype state2[3], state1[3]; uint32_t ps1 = (memshift * Ncol * rowIn + Nrow * Ncol * memshift * thread); uint32_t ps2 = (memshift * Ncol * rowInOut + Nrow * Ncol * memshift * thread); uint32_t ps3 = (memshift * (Ncol-1) + memshift * Ncol * rowOut + Nrow * Ncol * memshift * thread); - //#pragma unroll 1 for (int i = 0; i < Ncol; i++) { uint32_t s1 = ps1 + i*memshift; uint32_t s2 = ps2 + i*memshift; uint32_t s3 = ps3 - i*memshift; +#if __CUDA_ARCH__ == 500 + + #pragma unroll for (int j = 0; j < 3; j++) - state1[j]= __ldg4(&(DMatrix + s1)[j]); + state[j] = state[j] ^ (__ldg4(&(DMatrix + s1)[j]) + __ldg4(&(DMatrix + s2)[j])); + + round_lyra_v5(state); + #pragma unroll for (int j = 0; j < 3; j++) - state2[j]= __ldg4(&(DMatrix + s2)[j]); - for (int j = 0; j < 3; j++) { - vectype tmp = state1[j] + state2[j]; - state[j] ^= tmp; - } + state1[j] = __ldg4(&(DMatrix + s1)[j]); - round_lyra_v35(state); + #pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = __ldg4(&(DMatrix + s2)[j]); - for (int j = 0; j < 3; j++) { + #pragma unroll + for (int j = 0; j < 3; j++) + { state1[j] ^= state[j]; (DMatrix + s3)[j] = state1[j]; } - ((uint2*)state2)[0] ^= ((uint2*)state)[11]; - - for (int j = 0; j < 11; j++) - ((uint2*)state2)[j+1] ^= ((uint2*)state)[j]; +#else /* 5.2 */ + #pragma unroll for (int j = 0; j < 3; j++) - (DMatrix + s2)[j] = state2[j]; - } -} - -static __device__ __forceinline__ -void reduceDuplexRowSetupV3(const int rowIn, const int rowInOut, const int rowOut, vectype state[4], uint32_t thread) -{ - vectype state2[3], state1[3]; - - uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread); - uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread); - uint32_t ps3 = (Nrow * memshift * (Ncol - 1) + memshift * rowOut + Nrow * Ncol * memshift * thread); - - for (int i = 0; i < Ncol; i++) - { - uint32_t s1 = ps1 + Nrow*i*memshift; - uint32_t s2 = ps2 + Nrow*i*memshift; - 
uint32_t s3 = ps3 - Nrow*i*memshift; - + state1[j] = __ldg4(&(DMatrix + s1)[j]); + #pragma unroll for (int j = 0; j < 3; j++) - state1[j] = __ldg4(&(DMatrix + s1 )[j]); + state2[j] = __ldg4(&(DMatrix + s2)[j]); + #pragma unroll for (int j = 0; j < 3; j++) - state2[j] = __ldg4(&(DMatrix + s2 )[j]); - for (int j = 0; j < 3; j++) { + { vectype tmp = state1[j] + state2[j]; state[j] ^= tmp; } - round_lyra_v35(state); + round_lyra_v5(state); - for (int j = 0; j < 3; j++) { + #pragma unroll + for (int j = 0; j < 3; j++) + { state1[j] ^= state[j]; (DMatrix + s3)[j] = state1[j]; } +#endif ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + + #pragma unroll for (int j = 0; j < 11; j++) - ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; + ((uint2*)state2)[j+1] ^= ((uint2*)state)[j]; + #pragma unroll for (int j = 0; j < 3; j++) (DMatrix + s2)[j] = state2[j]; } } -static __device__ __forceinline__ +__device__ __forceinline__ void reduceDuplexRowtV2(const int rowIn, const int rowInOut, const int rowOut, vectype* state, uint32_t thread) { vectype state1[3],state2[3]; - uint32_t ps1 = (memshift * Ncol * rowIn + Nrow * Ncol * memshift * thread); + uint32_t ps1 = (memshift * Ncol * rowIn + Nrow * Ncol * memshift * thread); uint32_t ps2 = (memshift * Ncol * rowInOut + Nrow * Ncol * memshift * thread); - uint32_t ps3 = (memshift * Ncol * rowOut + Nrow * Ncol * memshift * thread); + uint32_t ps3 = (memshift * Ncol * rowOut + Nrow * Ncol * memshift * thread); - //#pragma unroll 1 for (int i = 0; i < Ncol; i++) { uint32_t s1 = ps1 + i*memshift; uint32_t s2 = ps2 + i*memshift; uint32_t s3 = ps3 + i*memshift; + #pragma unroll for (int j = 0; j < 3; j++) state1[j] = __ldg4(&(DMatrix + s1)[j]); + + #pragma unroll for (int j = 0; j < 3; j++) state2[j] = __ldg4(&(DMatrix + s2)[j]); + #pragma unroll for (int j = 0; j < 3; j++) state1[j] += state2[j]; + #pragma unroll for (int j = 0; j < 3; j++) state[j] ^= state1[j]; - round_lyra_v35(state); + round_lyra_v5(state); ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + + #pragma unroll for (int j = 0; j < 11; j++) ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; - if (rowInOut != rowOut) { - - for (int j = 0; j < 3; j++) - (DMatrix + s2)[j] = state2[j]; - +#if __CUDA_ARCH__ == 500 + if (rowInOut != rowOut) + { + #pragma unroll for (int j = 0; j < 3; j++) (DMatrix + s3)[j] ^= state[j]; - } else { - + } + if (rowInOut == rowOut) + { + #pragma unroll for (int j = 0; j < 3; j++) state2[j] ^= state[j]; - - for (int j = 0; j < 3; j++) - (DMatrix + s2)[j]=state2[j]; } - - } -} - -static __device__ __forceinline__ -void reduceDuplexRowtV3(const int rowIn, const int rowInOut, const int rowOut, vectype* state, uint32_t thread) -{ - vectype state1[3], state2[3]; - uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread); - uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread); - uint32_t ps3 = (memshift * rowOut + Nrow * Ncol * memshift * thread); - - #pragma nounroll - for (int i = 0; i < Ncol; i++) - { - uint32_t s1 = ps1 + Nrow * i*memshift; - uint32_t s2 = ps2 + Nrow * i*memshift; - uint32_t s3 = ps3 + Nrow * i*memshift; - - for (int j = 0; j < 3; j++) - state1[j] = __ldg4(&(DMatrix + s1)[j]); - - for (int j = 0; j < 3; j++) - state2[j] = __ldg4(&(DMatrix + s2)[j]); - - for (int j = 0; j < 3; j++) - state1[j] += state2[j]; - - for (int j = 0; j < 3; j++) - state[j] ^= state1[j]; - - round_lyra_v35(state); - - ((uint2*)state2)[0] ^= ((uint2*)state)[11]; - - for (int j = 0; j < 11; j++) - ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; - - if (rowInOut != rowOut) { - - 
for (int j = 0; j < 3; j++) - (DMatrix + s2)[j] = state2[j]; - +#else + if (rowInOut != rowOut) + { + #pragma unroll for (int j = 0; j < 3; j++) (DMatrix + s3)[j] ^= state[j]; - } else { - + #pragma unroll for (int j = 0; j < 3; j++) state2[j] ^= state[j]; - - for (int j = 0; j < 3; j++) - (DMatrix + s2)[j] = state2[j]; } +#endif + #pragma unroll + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; } } -#if __CUDA_ARCH__ < 500 -__global__ __launch_bounds__(128, 1) -#elif __CUDA_ARCH__ == 500 -__global__ __launch_bounds__(16, 1) +#if __CUDA_ARCH__ == 500 +__global__ __launch_bounds__(TPB50, 1) #else -__global__ __launch_bounds__(TPB, 1) +__global__ __launch_bounds__(TPB52, 1) #endif -void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) +void lyra2v2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHash) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); vectype state[4]; - uint28 blake2b_IV[2]; - uint28 padding[2]; - - if (threadIdx.x == 0) { - - ((uint16*)blake2b_IV)[0] = make_uint16( - 0xf3bcc908, 0x6a09e667 , 0x84caa73b, 0xbb67ae85 , - 0xfe94f82b, 0x3c6ef372 , 0x5f1d36f1, 0xa54ff53a , - 0xade682d1, 0x510e527f , 0x2b3e6c1f, 0x9b05688c , - 0xfb41bd6b, 0x1f83d9ab , 0x137e2179, 0x5be0cd19 - ); - ((uint16*)padding)[0] = make_uint16( - 0x20, 0x0 , 0x20, 0x0 , 0x20, 0x0 , 0x01, 0x0 , - 0x04, 0x0 , 0x04, 0x0 , 0x80, 0x0 , 0x0, 0x01000000 - ); - } -#if __CUDA_ARCH__ <= 350 - if (thread < threads) -#endif - { - ((uint2*)state)[0] = __ldg(&outputHash[thread]); - ((uint2*)state)[1] = __ldg(&outputHash[thread + threads]); - ((uint2*)state)[2] = __ldg(&outputHash[thread + 2 * threads]); - ((uint2*)state)[3] = __ldg(&outputHash[thread + 3 * threads]); - state[1] = state[0]; - state[2] = shuffle4(((vectype*)blake2b_IV)[0], 0); - state[3] = shuffle4(((vectype*)blake2b_IV)[1], 0); - - for (int i = 0; i<12; i++) - round_lyra_v35(state); - - state[0] ^= shuffle4(((vectype*)padding)[0], 0); - state[1] ^= shuffle4(((vectype*)padding)[1], 0); - - for (int i = 0; i<12; i++) - round_lyra_v35(state); - - uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread); - - //#pragma unroll 4 - for (int i = 0; i < 4; i++) - { - uint32_t s1 = ps1 - 4 * memshift * i; - for (int j = 0; j < 3; j++) - (DMatrix + s1)[j] = (state)[j]; - - round_lyra_v35(state); - } - - reduceDuplexV3(state, thread); - reduceDuplexRowSetupV3(1, 0, 2, state, thread); - reduceDuplexRowSetupV3(2, 1, 3, state, thread); - - uint32_t rowa; - int prev = 3; - for (int i = 0; i < 4; i++) - { - rowa = ((uint2*)state)[0].x & 3; reduceDuplexRowtV3(prev, rowa, i, state, thread); - prev = i; - } - - uint32_t shift = (memshift * rowa + 16 * memshift * thread); - - for (int j = 0; j < 3; j++) - state[j] ^= __ldg4(&(DMatrix + shift)[j]); - - for (int i = 0; i < 12; i++) - round_lyra_v35(state); - - outputHash[thread] = ((uint2*)state)[0]; - outputHash[thread + threads] = ((uint2*)state)[1]; - outputHash[thread + 2 * threads] = ((uint2*)state)[2]; - outputHash[thread + 3 * threads] = ((uint2*)state)[3]; - //((vectype*)outputHash)[thread] = state[0]; - - } //thread -} - -#if __CUDA_ARCH__ < 500 -__global__ __launch_bounds__(64, 1) -#elif __CUDA_ARCH__ == 500 -__global__ __launch_bounds__(32, 1) -#else -__global__ __launch_bounds__(TPB, 1) -#endif -void lyra2v2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHash) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - vectype state[4]; uint28 
blake2b_IV[2]; - uint28 padding[2]; if (threadIdx.x == 0) { ((uint16*)blake2b_IV)[0] = make_uint16( - 0xf3bcc908, 0x6a09e667, 0x84caa73b, 0xbb67ae85, - 0xfe94f82b, 0x3c6ef372, 0x5f1d36f1, 0xa54ff53a, - 0xade682d1, 0x510e527f, 0x2b3e6c1f, 0x9b05688c, - 0xfb41bd6b, 0x1f83d9ab, 0x137e2179, 0x5be0cd19 - ); - ((uint16*)padding)[0] = make_uint16( - 0x20, 0x0, 0x20, 0x0, 0x20, 0x0, 0x01, 0x0, - 0x04, 0x0, 0x04, 0x0, 0x80, 0x0, 0x0, 0x01000000 + 0xf3bcc908, 0x6a09e667, 0x84caa73b, 0xbb67ae85, + 0xfe94f82b, 0x3c6ef372, 0x5f1d36f1, 0xa54ff53a, + 0xade682d1, 0x510e527f, 0x2b3e6c1f, 0x9b05688c, + 0xfb41bd6b, 0x1f83d9ab, 0x137e2179, 0x5be0cd19 ); } -#if __CUDA_ARCH__ <= 350 if (thread < threads) -#endif { ((uint2*)state)[0] = __ldg(&outputHash[thread]); ((uint2*)state)[1] = __ldg(&outputHash[thread + threads]); @@ -444,61 +286,67 @@ void lyra2v2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHa state[1] = state[0]; - state[2] = shuffle4(((vectype*)blake2b_IV)[0], 0); - state[3] = shuffle4(((vectype*)blake2b_IV)[1], 0); + state[2] = ((blake2b_IV)[0]); + state[3] = ((blake2b_IV)[1]); for (int i = 0; i<12; i++) - round_lyra_v35(state); + round_lyra_v5(state); - state[0] ^= shuffle4(((vectype*)padding)[0], 0); - state[1] ^= shuffle4(((vectype*)padding)[1], 0); + ((uint2*)state)[0].x ^= 0x20; + ((uint2*)state)[1].x ^= 0x20; + ((uint2*)state)[2].x ^= 0x20; + ((uint2*)state)[3].x ^= 0x01; + ((uint2*)state)[4].x ^= 0x04; + ((uint2*)state)[5].x ^= 0x04; + ((uint2*)state)[6].x ^= 0x80; + ((uint2*)state)[7].y ^= 0x01000000; for (int i = 0; i<12; i++) - round_lyra_v35(state); + round_lyra_v5(state); uint32_t ps1 = (memshift * (Ncol - 1) + Nrow * Ncol * memshift * thread); for (int i = 0; i < Ncol; i++) { - uint32_t s1 = ps1 - memshift * i; - for (int j = 0; j < 3; j++) - (DMatrix + s1)[j] = (state)[j]; - - round_lyra_v35(state); + const uint32_t s1 = ps1 - memshift * i; + DMatrix[s1] = state[0]; + DMatrix[s1+1] = state[1]; + DMatrix[s1+2] = state[2]; + round_lyra_v5(state); } - reduceDuplex(state, thread); + reduceDuplex50(state, thread); - reduceDuplexRowSetupV2(1, 0, 2, state, thread); - reduceDuplexRowSetupV2(2, 1, 3, state, thread); + reduceDuplexRowSetupV2(1, 0, 2, state, thread); + reduceDuplexRowSetupV2(2, 1, 3, state, thread); uint32_t rowa; int prev=3; - for (int i = 0; i < 4; i++) { + for (int i = 0; i < 4; i++) + { rowa = ((uint2*)state)[0].x & 3; reduceDuplexRowtV2(prev, rowa, i, state, thread); prev=i; } - uint32_t shift = (memshift * Ncol * rowa + Nrow * Ncol * memshift * thread); + const uint32_t shift = (memshift * Ncol * rowa + Nrow * Ncol * memshift * thread); + #pragma unroll for (int j = 0; j < 3; j++) state[j] ^= __ldg4(&(DMatrix + shift)[j]); for (int i = 0; i < 12; i++) - round_lyra_v35(state); + round_lyra_v5(state); - outputHash[thread] = ((uint2*)state)[0]; + outputHash[thread] = ((uint2*)state)[0]; outputHash[thread + threads] = ((uint2*)state)[1]; outputHash[thread + 2 * threads] = ((uint2*)state)[2]; outputHash[thread + 3 * threads] = ((uint2*)state)[3]; } } #else -/* if __CUDA_ARCH__ < 300 .. 
*/ __global__ void lyra2v2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHash) {} -__global__ void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) {} #endif __host__ @@ -512,12 +360,14 @@ __host__ void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order) { uint32_t tpb; - if (device_sm[device_map[thr_id]] == 350) - tpb = 64; + if (device_sm[device_map[thr_id]] < 350) + tpb = TPB30; + else if (device_sm[device_map[thr_id]] == 350) + tpb = TPB35; else if (device_sm[device_map[thr_id]] == 500) - tpb = 32; + tpb = TPB50; else - tpb = TPB; + tpb = TPB52; dim3 grid((threads + tpb - 1) / tpb); dim3 block(tpb); @@ -527,6 +377,5 @@ void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uin else lyra2v2_gpu_hash_32_v3 <<>> (threads, startNounce, (uint2*)d_outputHash); - MyStreamSynchronize(NULL, order, thr_id); + //MyStreamSynchronize(NULL, order, thr_id); } - diff --git a/lyra2/cuda_lyra2v2_sm3.cuh b/lyra2/cuda_lyra2v2_sm3.cuh new file mode 100644 index 0000000..e9d992d --- /dev/null +++ b/lyra2/cuda_lyra2v2_sm3.cuh @@ -0,0 +1,253 @@ +/* SM 3/3.5 Variant for lyra2REv2 */ + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#undef __CUDA_ARCH__ +#define __CUDA_ARCH__ 350 +#endif + +#define TPB30 16 +#define TPB35 64 + +#if __CUDA_ARCH__ >= 300 && __CUDA_ARCH__ < 500 + +#define Nrow 4 +#define Ncol 4 + +#define vectype ulonglong4 +#define u64type uint64_t +#define memshift 4 + +__device__ vectype *DMatrix; + +static __device__ __forceinline__ +void Gfunc_v35(unsigned long long &a, unsigned long long &b, unsigned long long &c, unsigned long long &d) +{ + a += b; d ^= a; d = ROTR64(d, 32); + c += d; b ^= c; b = ROTR64(b, 24); + a += b; d ^= a; d = ROTR64(d, 16); + c += d; b ^= c; b = ROTR64(b, 63); +} + +static __device__ __forceinline__ +void round_lyra_v35(vectype* s) +{ + Gfunc_v35(s[0].x, s[1].x, s[2].x, s[3].x); + Gfunc_v35(s[0].y, s[1].y, s[2].y, s[3].y); + Gfunc_v35(s[0].z, s[1].z, s[2].z, s[3].z); + Gfunc_v35(s[0].w, s[1].w, s[2].w, s[3].w); + + Gfunc_v35(s[0].x, s[1].y, s[2].z, s[3].w); + Gfunc_v35(s[0].y, s[1].z, s[2].w, s[3].x); + Gfunc_v35(s[0].z, s[1].w, s[2].x, s[3].y); + Gfunc_v35(s[0].w, s[1].x, s[2].y, s[3].z); +} + +static __device__ __forceinline__ +void reduceDuplexV3(vectype state[4], uint32_t thread) +{ + vectype state1[3]; + uint32_t ps1 = (Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * (Ncol - 1) * Nrow + memshift * 1 + Nrow * Ncol * memshift * thread); + + #pragma unroll 4 + for (int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + Nrow * i *memshift; + uint32_t s2 = ps2 - Nrow * i *memshift; + + for (int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1)[j]); + + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + round_lyra_v35(state); + + for (int j = 0; j < 3; j++) + state1[j] ^= state[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state1[j]; + } +} + +static __device__ __forceinline__ +void reduceDuplexRowSetupV3(const int rowIn, const int rowInOut, const int rowOut, vectype state[4], uint32_t thread) +{ + vectype state2[3], state1[3]; + + uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread); + uint32_t ps3 = (Nrow * memshift * (Ncol - 1) + memshift * rowOut + Nrow * Ncol * memshift * thread); + + for (int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + Nrow*i*memshift; + uint32_t s2 = ps2 + Nrow*i*memshift; + 
uint32_t s3 = ps3 - Nrow*i*memshift; + + for (int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1 )[j]); + for (int j = 0; j < 3; j++) + state2[j] = __ldg4(&(DMatrix + s2 )[j]); + for (int j = 0; j < 3; j++) { + vectype tmp = state1[j] + state2[j]; + state[j] ^= tmp; + } + + round_lyra_v35(state); + + for (int j = 0; j < 3; j++) { + state1[j] ^= state[j]; + (DMatrix + s3)[j] = state1[j]; + } + + ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + for (int j = 0; j < 11; j++) + ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + } +} + +static __device__ __forceinline__ +void reduceDuplexRowtV3(const int rowIn, const int rowInOut, const int rowOut, vectype* state, uint32_t thread) +{ + vectype state1[3], state2[3]; + uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread); + uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread); + uint32_t ps3 = (memshift * rowOut + Nrow * Ncol * memshift * thread); + + #pragma nounroll + for (int i = 0; i < Ncol; i++) + { + uint32_t s1 = ps1 + Nrow * i*memshift; + uint32_t s2 = ps2 + Nrow * i*memshift; + uint32_t s3 = ps3 + Nrow * i*memshift; + + for (int j = 0; j < 3; j++) + state1[j] = __ldg4(&(DMatrix + s1)[j]); + + for (int j = 0; j < 3; j++) + state2[j] = __ldg4(&(DMatrix + s2)[j]); + + for (int j = 0; j < 3; j++) + state1[j] += state2[j]; + + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + + round_lyra_v35(state); + + ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + + for (int j = 0; j < 11; j++) + ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; + + if (rowInOut != rowOut) { + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s3)[j] ^= state[j]; + + } else { + + for (int j = 0; j < 3; j++) + state2[j] ^= state[j]; + + for (int j = 0; j < 3; j++) + (DMatrix + s2)[j] = state2[j]; + } + } +} + +__global__ __launch_bounds__(TPB35, 1) +void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + vectype state[4]; + uint28 blake2b_IV[2]; + uint28 padding[2]; + + if (threadIdx.x == 0) { + + ((uint16*)blake2b_IV)[0] = make_uint16( + 0xf3bcc908, 0x6a09e667 , 0x84caa73b, 0xbb67ae85 , + 0xfe94f82b, 0x3c6ef372 , 0x5f1d36f1, 0xa54ff53a , + 0xade682d1, 0x510e527f , 0x2b3e6c1f, 0x9b05688c , + 0xfb41bd6b, 0x1f83d9ab , 0x137e2179, 0x5be0cd19 + ); + ((uint16*)padding)[0] = make_uint16( + 0x20, 0x0 , 0x20, 0x0 , 0x20, 0x0 , 0x01, 0x0 , + 0x04, 0x0 , 0x04, 0x0 , 0x80, 0x0 , 0x0, 0x01000000 + ); + } + + if (thread < threads) + { + ((uint2*)state)[0] = __ldg(&outputHash[thread]); + ((uint2*)state)[1] = __ldg(&outputHash[thread + threads]); + ((uint2*)state)[2] = __ldg(&outputHash[thread + 2 * threads]); + ((uint2*)state)[3] = __ldg(&outputHash[thread + 3 * threads]); + state[1] = state[0]; + state[2] = shuffle4(((vectype*)blake2b_IV)[0], 0); + state[3] = shuffle4(((vectype*)blake2b_IV)[1], 0); + + for (int i = 0; i<12; i++) + round_lyra_v35(state); + + state[0] ^= shuffle4(((vectype*)padding)[0], 0); + state[1] ^= shuffle4(((vectype*)padding)[1], 0); + + for (int i = 0; i<12; i++) + round_lyra_v35(state); + + uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread); + + //#pragma unroll 4 + for (int i = 0; i < 4; i++) + { + uint32_t s1 = ps1 - 4 * memshift * i; + for (int j = 0; j < 3; j++) + (DMatrix + s1)[j] = (state)[j]; + + round_lyra_v35(state); + } + + reduceDuplexV3(state, thread); + reduceDuplexRowSetupV3(1, 0, 2, 
state, thread); + reduceDuplexRowSetupV3(2, 1, 3, state, thread); + + uint32_t rowa; + int prev = 3; + for (int i = 0; i < 4; i++) + { + rowa = ((uint2*)state)[0].x & 3; reduceDuplexRowtV3(prev, rowa, i, state, thread); + prev = i; + } + + uint32_t shift = (memshift * rowa + 16 * memshift * thread); + + for (int j = 0; j < 3; j++) + state[j] ^= __ldg4(&(DMatrix + shift)[j]); + + for (int i = 0; i < 12; i++) + round_lyra_v35(state); + + outputHash[thread] = ((uint2*)state)[0]; + outputHash[thread + threads] = ((uint2*)state)[1]; + outputHash[thread + 2 * threads] = ((uint2*)state)[2]; + outputHash[thread + 3 * threads] = ((uint2*)state)[3]; + + } //thread +} + +#else +/* if __CUDA_ARCH__ < 300 .. */ +__global__ void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) {} +#endif +
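
The change itself is mostly mechanical (renamed helpers, #pragma unroll hints, and
per-architecture code paths), but the dispatch pattern it relies on is worth spelling
out: device code is specialized at compile time via __CUDA_ARCH__, while the host
picks the threads-per-block value at runtime from the detected SM version. Below is
a minimal, self-contained sketch of that pattern (not part of the patch), reusing the
TPB values above; the names demo_kernel and pick_tpb are illustrative only, not part
of ccminer.

#include <cstdio>
#include <cuda_runtime.h>

#define TPB52 10
#define TPB50 16
#define TPB35 64
#define TPB30 16

// Each -gencode target is compiled with its own __CUDA_ARCH__ value,
// so only one branch survives per SM version in the final cubin.
__global__ void demo_kernel(int *out)
{
#if __CUDA_ARCH__ >= 520
	out[threadIdx.x] = 52;   // SM 5.2+ path (the unrolled one in the real kernel)
#elif __CUDA_ARCH__ == 500
	out[threadIdx.x] = 50;   // SM 5.0 path
#else
	out[threadIdx.x] = 30;   // SM 3.x fallback (kept in the second file)
#endif
}

// Runtime counterpart of the device_sm[] lookup in lyra2v2_cpu_hash_32.
static int pick_tpb(int device)
{
	cudaDeviceProp prop;
	cudaGetDeviceProperties(&prop, device);
	const int sm = prop.major * 100 + prop.minor * 10; // e.g. 5.2 -> 520
	if (sm < 350)  return TPB30;
	if (sm == 350) return TPB35;
	if (sm == 500) return TPB50;
	return TPB52;
}

int main()
{
	const int tpb = pick_tpb(0);
	int *d_out = NULL;
	cudaMalloc(&d_out, tpb * sizeof(int));
	demo_kernel<<<1, tpb>>>(d_out);
	int h = 0;
	cudaMemcpy(&h, d_out, sizeof(int), cudaMemcpyDeviceToHost); // implicit sync
	printf("tpb=%d, compiled arch path=SM %d\n", tpb, h);
	cudaFree(d_out);
	return 0;
}

Built with, e.g., nvcc -gencode arch=compute_52,code=sm_52 -gencode
arch=compute_50,code=sm_50 -gencode arch=compute_35,code=sm_35, each target
compiles its own branch of demo_kernel. That is why the SM 3/3.5 variant can live in
cuda_lyra2v2_sm3.cuh without affecting the code generated for the SM 5.x binaries.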