From 6a9280a04590beaf70b2252101982959796e8a5f Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Wed, 14 Oct 2015 21:55:40 +0200 Subject: [PATCH] lyra2v2: set a better TPB for intensity 20 (sm52) use sp forced unroll in skein and do some cleanup... --- Algo256/cuda_skein256.cu | 31 +++++++---- lyra2/cuda_lyra2v2.cu | 108 +++++++++++++++++++-------------------- lyra2/lyra2REv2.cu | 4 +- util.cpp | 2 +- 4 files changed, 78 insertions(+), 67 deletions(-) diff --git a/Algo256/cuda_skein256.cu b/Algo256/cuda_skein256.cu index 2f1bc82..cbeb660 100644 --- a/Algo256/cuda_skein256.cu +++ b/Algo256/cuda_skein256.cu @@ -119,10 +119,16 @@ void skein256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outp p6 = h[6] + t12[1]; p7 = h[7]; - #pragma unroll - for (int i = 1; i<19; i+=2) { - Round_8_512v35(h,t12,p0,p1,p2,p3,p4,p5,p6,p7,i); - } + // forced unroll required + Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 1); + Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 3); + Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 5); + Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 7); + Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 9); + Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 11); + Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 13); + Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 15); + Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 17); p0 ^= dt0; p1 ^= dt1; @@ -143,11 +149,17 @@ void skein256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outp p5 += t12[3]; //p5 already equal h[5] p6 += t12[4]; - #pragma unroll - for (int i = 1; i<17; i+=2) { - Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, i); - } + // forced unroll + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 1); + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 3); + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 5); + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 7); + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 9); + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 11); + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 13); + Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 15); Round_8_512v35_final(h, t, p0, p1, p2, p3, p4, p5, p6, p7); + outputHash[thread] = devectorize(p0); outputHash[threads+thread] = devectorize(p1); outputHash[2*threads+thread] = devectorize(p2); @@ -285,12 +297,13 @@ __host__ void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order) { const uint32_t threadsperblock = 256; + int dev_id = device_map[thr_id]; dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); // only 1kH/s perf change between kernels on a 960... - if (device_sm[device_map[thr_id]] > 300 && cuda_arch[device_map[thr_id]] > 300) + if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300) skein256_gpu_hash_32<<>>(threads, startNounce, d_outputHash); else skein256_gpu_hash_32_v30<<>>(threads, startNounce, d_outputHash); diff --git a/lyra2/cuda_lyra2v2.cu b/lyra2/cuda_lyra2v2.cu index 5493c1a..bd2ad25 100644 --- a/lyra2/cuda_lyra2v2.cu +++ b/lyra2/cuda_lyra2v2.cu @@ -2,7 +2,7 @@ #include #include -#define TPB52 10 +#define TPB52 8 #define TPB50 16 #include "cuda_lyra2v2_sm3.cuh" @@ -27,8 +27,8 @@ __device__ __forceinline__ void Gfunc_v5(uint2 &a, uint2 &b, uint2 &c, uint2 &d) { a += b; d ^= a; d = SWAPUINT2(d); - c += d; b ^= c; b = ROR24(b); - a += b; d ^= a; d = ROR16(d); + c += d; b ^= c; b = ROR2(b, 24); + a += b; d ^= a; d = ROR2(d, 16); c += d; b ^= c; b = ROR2(b, 63); } @@ -47,11 +47,11 @@ void round_lyra_v5(uint4x2* s) } __device__ __forceinline__ -void reduceDuplex(uint4x2 state[4], uint32_t thread) +void reduceDuplex(uint4x2 state[4], const uint32_t thread) { uint4x2 state1[3]; - uint32_t ps1 = (Nrow * Ncol * memshift * thread); - uint32_t ps2 = (memshift * (Ncol-1) + memshift * Ncol + Nrow * Ncol * memshift * thread); + const uint32_t ps1 = (Nrow * Ncol * memshift * thread); + const uint32_t ps2 = (memshift * (Ncol-1) + memshift * Ncol + Nrow * Ncol * memshift * thread); #pragma unroll 4 for (int i = 0; i < Ncol; i++) @@ -80,16 +80,16 @@ void reduceDuplex(uint4x2 state[4], uint32_t thread) } __device__ __forceinline__ -void reduceDuplex50(uint4x2 state[4], uint32_t thread) +void reduceDuplex50(uint4x2 state[4], const uint32_t thread) { - uint32_t ps1 = (Nrow * Ncol * memshift * thread); - uint32_t ps2 = (memshift * (Ncol - 1) + memshift * Ncol + Nrow * Ncol * memshift * thread); + const uint32_t ps1 = (Nrow * Ncol * memshift * thread); + const uint32_t ps2 = (memshift * (Ncol - 1) + memshift * Ncol + Nrow * Ncol * memshift * thread); #pragma unroll 4 for (int i = 0; i < Ncol; i++) { - uint32_t s1 = ps1 + i*memshift; - uint32_t s2 = ps2 - i*memshift; + const uint32_t s1 = ps1 + i*memshift; + const int32_t s2 = ps2 - i*memshift; #pragma unroll for (int j = 0; j < 3; j++) @@ -104,19 +104,19 @@ void reduceDuplex50(uint4x2 state[4], uint32_t thread) } __device__ __forceinline__ -void reduceDuplexRowSetupV2(const int rowIn, const int rowInOut, const int rowOut, uint4x2 state[4], uint32_t thread) +void reduceDuplexRowSetupV2(const int rowIn, const int rowInOut, const int rowOut, uint4x2 state[4], const uint32_t thread) { uint4x2 state2[3], state1[3]; - uint32_t ps1 = (memshift * Ncol * rowIn + Nrow * Ncol * memshift * thread); - uint32_t ps2 = (memshift * Ncol * rowInOut + Nrow * Ncol * memshift * thread); - uint32_t ps3 = (memshift * (Ncol-1) + memshift * Ncol * rowOut + Nrow * Ncol * memshift * thread); + const uint32_t ps1 = (memshift * Ncol * rowIn + Nrow * Ncol * memshift * thread); + const uint32_t ps2 = (memshift * Ncol * rowInOut + Nrow * Ncol * memshift * thread); + const uint32_t ps3 = (memshift * (Ncol-1) + memshift * Ncol * rowOut + Nrow * Ncol * memshift * thread); for (int i = 0; i < Ncol; i++) { - uint32_t s1 = ps1 + i*memshift; - uint32_t s2 = ps2 + i*memshift; - uint32_t s3 = ps3 - i*memshift; + const uint32_t s1 = ps1 + i*memshift; + const uint32_t s2 = ps2 + i*memshift; + const uint32_t s3 = ps3 - i*memshift; #if __CUDA_ARCH__ == 500 @@ -179,18 +179,18 @@ void reduceDuplexRowSetupV2(const int rowIn, const int rowInOut, const int rowOu __device__ __forceinline__ -void reduceDuplexRowtV2(const int rowIn, const int rowInOut, const int rowOut, uint4x2* state, uint32_t thread) +void reduceDuplexRowtV2(const int rowIn, const int rowInOut, const int rowOut, uint4x2* state, const uint32_t thread) { - uint4x2 state1[3],state2[3]; - uint32_t ps1 = (memshift * Ncol * rowIn + Nrow * Ncol * memshift * thread); - uint32_t ps2 = (memshift * Ncol * rowInOut + Nrow * Ncol * memshift * thread); - uint32_t ps3 = (memshift * Ncol * rowOut + Nrow * Ncol * memshift * thread); + uint4x2 state1[3], state2[3]; + const uint32_t ps1 = (memshift * Ncol * rowIn + Nrow * Ncol * memshift * thread); + const uint32_t ps2 = (memshift * Ncol * rowInOut + Nrow * Ncol * memshift * thread); + const uint32_t ps3 = (memshift * Ncol * rowOut + Nrow * Ncol * memshift * thread); for (int i = 0; i < Ncol; i++) { - uint32_t s1 = ps1 + i*memshift; - uint32_t s2 = ps2 + i*memshift; - uint32_t s3 = ps3 + i*memshift; + const uint32_t s1 = ps1 + i*memshift; + const uint32_t s2 = ps2 + i*memshift; + const uint32_t s3 = ps3 + i*memshift; #pragma unroll for (int j = 0; j < 3; j++) @@ -255,12 +255,10 @@ __global__ __launch_bounds__(TPB50, 1) #else __global__ __launch_bounds__(TPB52, 1) #endif -void lyra2v2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHash) +void lyra2v2_gpu_hash_32(const uint32_t threads, uint32_t startNounce, uint2 *g_hash) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - uint4x2 state[4]; - uint4x2 blake2b_IV[2]; if (threadIdx.x == 0) { @@ -275,10 +273,12 @@ void lyra2v2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHa if (thread < threads) { - ((uint2*)state)[0] = __ldg(&outputHash[thread]); - ((uint2*)state)[1] = __ldg(&outputHash[thread + threads]); - ((uint2*)state)[2] = __ldg(&outputHash[thread + 2 * threads]); - ((uint2*)state)[3] = __ldg(&outputHash[thread + 3 * threads]); + uint4x2 state[4]; + + ((uint2*)state)[0] = __ldg(&g_hash[thread]); + ((uint2*)state)[1] = __ldg(&g_hash[thread + threads]); + ((uint2*)state)[2] = __ldg(&g_hash[thread + threads*2]); + ((uint2*)state)[3] = __ldg(&g_hash[thread + threads*3]); state[1] = state[0]; @@ -286,7 +286,7 @@ void lyra2v2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHa state[3] = ((blake2b_IV)[1]); for (int i = 0; i<12; i++) - round_lyra_v5(state); + round_lyra_v5(state); ((uint2*)state)[0].x ^= 0x20; ((uint2*)state)[1].x ^= 0x20; @@ -298,9 +298,9 @@ void lyra2v2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHa ((uint2*)state)[7].y ^= 0x01000000; for (int i = 0; i<12; i++) - round_lyra_v5(state); + round_lyra_v5(state); - uint32_t ps1 = (memshift * (Ncol - 1) + Nrow * Ncol * memshift * thread); + const uint32_t ps1 = (memshift * (Ncol - 1) + Nrow * Ncol * memshift * thread); for (int i = 0; i < Ncol; i++) { @@ -323,7 +323,7 @@ void lyra2v2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHa { rowa = ((uint2*)state)[0].x & 3; reduceDuplexRowtV2(prev, rowa, i, state, thread); - prev=i; + prev = i; } const uint32_t shift = (memshift * Ncol * rowa + Nrow * Ncol * memshift * thread); @@ -335,10 +335,10 @@ void lyra2v2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHa for (int i = 0; i < 12; i++) round_lyra_v5(state); - outputHash[thread] = ((uint2*)state)[0]; - outputHash[thread + threads] = ((uint2*)state)[1]; - outputHash[thread + 2 * threads] = ((uint2*)state)[2]; - outputHash[thread + 3 * threads] = ((uint2*)state)[3]; + g_hash[thread] = ((uint2*)state)[0]; + g_hash[thread + threads] = ((uint2*)state)[1]; + g_hash[thread + threads*2] = ((uint2*)state)[2]; + g_hash[thread + threads*3] = ((uint2*)state)[3]; } } #else @@ -346,36 +346,34 @@ void lyra2v2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHa #if __CUDA_ARCH__ < 300 __device__ void* DMatrix; #endif -__global__ void lyra2v2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHash) {} +__global__ void lyra2v2_gpu_hash_32(const uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} #endif __host__ void lyra2v2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix) { + cuda_get_arch(thr_id); // just assign the device pointer allocated in main loop cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice); } __host__ -void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order) +void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, int order) { - uint32_t tpb; - if (device_sm[device_map[thr_id]] < 350) - tpb = TPB30; - else if (device_sm[device_map[thr_id]] == 350) - tpb = TPB35; - else if (device_sm[device_map[thr_id]] == 500) - tpb = TPB50; - else - tpb = TPB52; + int dev_id = device_map[thr_id % MAX_GPUS]; + uint32_t tpb = TPB52; + + if (device_sm[dev_id] == 500 || cuda_arch[dev_id] == 500) tpb = TPB50; + else if (device_sm[dev_id] == 350 || cuda_arch[dev_id] == 350) tpb = TPB35; + else if (device_sm[dev_id] < 350 || cuda_arch[dev_id] < 350) tpb = TPB30; dim3 grid((threads + tpb - 1) / tpb); dim3 block(tpb); - if (device_sm[device_map[thr_id]] >= 500) - lyra2v2_gpu_hash_32 <<>> (threads, startNounce, (uint2*)d_outputHash); + if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500) + lyra2v2_gpu_hash_32 <<>> (threads, startNounce, (uint2*)g_hash); else - lyra2v2_gpu_hash_32_v3 <<>> (threads, startNounce, (uint2*)d_outputHash); + lyra2v2_gpu_hash_32_v3 <<>> (threads, startNounce, (uint2*)g_hash); //MyStreamSynchronize(NULL, order, thr_id); } diff --git a/lyra2/lyra2REv2.cu b/lyra2/lyra2REv2.cu index d8b590e..54b8303 100644 --- a/lyra2/lyra2REv2.cu +++ b/lyra2/lyra2REv2.cu @@ -11,7 +11,7 @@ extern "C" { #include "cuda_helper.h" -static _ALIGN(64) uint64_t *d_hash[MAX_GPUS]; +static uint64_t *d_hash[MAX_GPUS]; static uint64_t* d_matrix[MAX_GPUS]; extern void blake256_cpu_init(int thr_id, uint32_t threads); @@ -86,7 +86,7 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) - ptarget[7] = 0x00ff; + ptarget[7] = 0x000f; if (!init[thr_id]) { diff --git a/util.cpp b/util.cpp index fb9e12d..97e4b28 100644 --- a/util.cpp +++ b/util.cpp @@ -1849,7 +1849,7 @@ static uint32_t zrtest[20] = { void do_gpu_tests(void) { -#if 1 //def _DEBUG +#ifdef _DEBUG unsigned long done; char s[128] = { '\0' }; uchar buf[160];