From 63e3387dbbe62fd21690d2f56d58903fe31b09ed Mon Sep 17 00:00:00 2001
From: Tanguy Pruvot
Date: Tue, 16 Dec 2014 14:18:12 +0100
Subject: [PATCH] lyra2: add sm30 device compat (skein256)

---
 Algo256/cuda_skein256.cu | 130 +++++++++++++++++++++++++++++++++++++--
 lyra2/cuda_lyra2.cu      |  77 +----------------------
 2 files changed, 126 insertions(+), 81 deletions(-)

diff --git a/Algo256/cuda_skein256.cu b/Algo256/cuda_skein256.cu
index 9e4013f..5cc0904 100644
--- a/Algo256/cuda_skein256.cu
+++ b/Algo256/cuda_skein256.cu
@@ -2,14 +2,12 @@
 
 #include "cuda_helper.h"
 
-#if 0
 static __constant__ uint64_t SKEIN_IV512_256[8] = {
 	0xCCD044A12FDB3E13, 0xE83590301A79A9EB,
 	0x55AEA0614F816E6F, 0x2A2767A4AE9B94DB,
 	0xEC06025E74DD7683, 0xE7A436CDC4746251,
 	0xC36FBAF9393AD185, 0x3EEDBA1833EDFC13
 };
-#endif
 
 static __constant__ uint2 vSKEIN_IV512_256[8] = {
 	{ 0x2FDB3E13, 0xCCD044A1 },
@@ -35,6 +33,8 @@ static __constant__ int ROT256[8][4] =
 };
 
 static __constant__ uint2 skein_ks_parity = { 0xA9FC1A22,0x1BD11BDA};
+static __constant__ uint64_t skein_ks_parity64 = 0x1BD11BDAA9FC1A22ull;
+
 static __constant__ uint2 t12[6] = {
 	{ 0x20, 0 },
 	{ 0, 0xf0000000 },
@@ -44,7 +44,6 @@ static __constant__ uint2 t12[6] = {
 	{ 0x08, 0xff000000 }
 };
 
-#if 0
 static __constant__ uint64_t t12_30[6] = {
 	0x20,
 	0xf000000000000000,
@@ -53,7 +52,6 @@ static __constant__ uint64_t t12_30[6] = {
 	0xff00000000000000,
 	0xff00000000000008
 };
-#endif
 
 static __forceinline__ __device__
 void Round512v35(uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, int ROT)
@@ -175,6 +173,125 @@ void skein256_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHas
 	}
 }
 
+static __forceinline__ __device__
+void Round512v30(uint64_t &p0, uint64_t &p1, uint64_t &p2, uint64_t &p3,
+	uint64_t &p4, uint64_t &p5, uint64_t &p6, uint64_t &p7, int ROT)
+{
+	p0 += p1; p1 = ROTL64(p1, ROT256[ROT][0]); p1 ^= p0;
+	p2 += p3; p3 = ROTL64(p3, ROT256[ROT][1]); p3 ^= p2;
+	p4 += p5; p5 = ROTL64(p5, ROT256[ROT][2]); p5 ^= p4;
+	p6 += p7; p7 = ROTL64(p7, ROT256[ROT][3]); p7 ^= p6;
+}
+
+static __forceinline__ __device__
+void Round_8_512v30(uint64_t *ks, uint64_t *ts, uint64_t &p0, uint64_t &p1, uint64_t &p2, uint64_t &p3,
+	uint64_t &p4, uint64_t &p5, uint64_t &p6, uint64_t &p7, int R)
+{
+	Round512v30(p0, p1, p2, p3, p4, p5, p6, p7, 0);
+	Round512v30(p2, p1, p4, p7, p6, p5, p0, p3, 1);
+	Round512v30(p4, p1, p6, p3, p0, p5, p2, p7, 2);
+	Round512v30(p6, p1, p0, p7, p2, p5, p4, p3, 3);
+	p0 += ks[((R)+0) % 9];   /* inject the key schedule value */
+	p1 += ks[((R)+1) % 9];
+	p2 += ks[((R)+2) % 9];
+	p3 += ks[((R)+3) % 9];
+	p4 += ks[((R)+4) % 9];
+	p5 += ks[((R)+5) % 9] + ts[((R)+0) % 3];
+	p6 += ks[((R)+6) % 9] + ts[((R)+1) % 3];
+	p7 += ks[((R)+7) % 9] + R;
+	Round512v30(p0, p1, p2, p3, p4, p5, p6, p7, 4);
+	Round512v30(p2, p1, p4, p7, p6, p5, p0, p3, 5);
+	Round512v30(p4, p1, p6, p3, p0, p5, p2, p7, 6);
+	Round512v30(p6, p1, p0, p7, p2, p5, p4, p3, 7);
+	p0 += ks[((R)+1) % 9];   /* inject the key schedule value */
+	p1 += ks[((R)+2) % 9];
+	p2 += ks[((R)+3) % 9];
+	p3 += ks[((R)+4) % 9];
+	p4 += ks[((R)+5) % 9];
+	p5 += ks[((R)+6) % 9] + ts[((R)+1) % 3];
+	p6 += ks[((R)+7) % 9] + ts[((R)+2) % 3];
+	p7 += ks[((R)+8) % 9] + (R)+1;
+}
+
+__global__ __launch_bounds__(256, 3)
+void skein256_gpu_hash_32_v30(int threads, uint32_t startNounce, uint64_t *outputHash)
+{
+	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint64_t h[9];
+		uint64_t t[3];
+		uint64_t dt0, dt1, dt2, dt3;
+		uint64_t p0, p1, p2, p3, p4, p5, p6, p7;
+		h[8] = skein_ks_parity64;
+		for (int i = 0; i<8; i++) {
+			h[i] = SKEIN_IV512_256[i];
+			h[8] ^= h[i];
+		}
+
+		t[0] = devectorize(t12[0]);
+		t[1] = devectorize(t12[1]);
+		t[2] = devectorize(t12[2]);
+
+		dt0 = outputHash[thread];
+		dt1 = outputHash[threads+thread];
+		dt2 = outputHash[2*threads+thread];
+		dt3 = outputHash[3*threads+thread];
+
+		p0 = h[0] + dt0;
+		p1 = h[1] + dt1;
+		p2 = h[2] + dt2;
+		p3 = h[3] + dt3;
+		p4 = h[4];
+		p5 = h[5] + t[0];
+		p6 = h[6] + t[1];
+		p7 = h[7];
+
+		#pragma unroll
+		for (int i = 1; i<19; i += 2) {
+			Round_8_512v30(h, t, p0, p1, p2, p3, p4, p5, p6, p7, i);
+		}
+
+		p0 ^= dt0;
+		p1 ^= dt1;
+		p2 ^= dt2;
+		p3 ^= dt3;
+
+		h[0] = p0;
+		h[1] = p1;
+		h[2] = p2;
+		h[3] = p3;
+		h[4] = p4;
+		h[5] = p5;
+		h[6] = p6;
+		h[7] = p7;
+		h[8] = skein_ks_parity64;
+
+		#pragma unroll 8
+		for (int i = 0; i<8; i++) {
+			h[8] ^= h[i];
+		}
+
+		t[0] = t12_30[3];
+		t[1] = t12_30[4];
+		t[2] = t12_30[5];
+
+		p5 += t[0];  //p5 already equal h[5]
+		p6 += t[1];
+
+		#pragma unroll
+		for (int i = 1; i<19; i += 2) {
+			Round_8_512v30(h, t, p0, p1, p2, p3, p4, p5, p6, p7, i);
+		}
+
+		outputHash[thread] = p0;
+		outputHash[threads + thread] = p1;
+		outputHash[2 * threads + thread] = p2;
+		outputHash[3 * threads + thread] = p3;
+
+	} //thread
+}
+
 __host__
 void skein256_cpu_init(int thr_id, int threads)
 {
@@ -189,7 +306,10 @@ void skein256_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 
-	skein256_gpu_hash_32<<<grid, block>>>(threads, startNounce, d_outputHash);
+	if (device_sm[device_map[thr_id]] >= 320)
+		skein256_gpu_hash_32<<<grid, block>>>(threads, startNounce, d_outputHash);
+	else
+		skein256_gpu_hash_32_v30<<<grid, block>>>(threads, startNounce, d_outputHash);
 
 	MyStreamSynchronize(NULL, order, thr_id);
 }
diff --git a/lyra2/cuda_lyra2.cu b/lyra2/cuda_lyra2.cu
index 95d61f5..3afd225 100644
--- a/lyra2/cuda_lyra2.cu
+++ b/lyra2/cuda_lyra2.cu
@@ -434,81 +434,6 @@ void lyra2_gpu_hash_32(int threads, uint32_t startNounce, uint64_t *outputHash)
 	} //thread
 }
 
-#if 0
-__global__ __launch_bounds__(TPB, 1)
-void lyra2_gpu_hash_32_test(int threads, uint32_t startNounce, uint64_t *outputHash)
-{
-	int thread = (blockDim.x * blockIdx.x + threadIdx.x);
-	if (thread < threads)
-	{
-		uint2 state[16];
-		#pragma unroll
-		for (int i = 0; i<4; i++) { LOHI(state[i].x, state[i].y, outputHash[threads*i + thread]); } //password
-		#pragma unroll
-		for (int i = 0; i<4; i++) { state[i + 4] = state[i]; } //salt
-		#pragma unroll
-		for (int i = 0; i<8; i++) { state[i + 8] = blake2b_IV[i]; }
-
-		// blake2blyra x2
-		#pragma unroll 24
-		for (int i = 0; i<24; i++) { round_lyra_v35(state); } //because 12 is not enough
-
-		uint2 Matrix[12][8][8]; // not cool
-
-		// reducedSqueezeRow0
-		#pragma unroll 8
-		for (int i = 0; i < 8; i++) {
-			#pragma unroll 12
-			for (int j = 0; j<12; j++) { Matrix[j][7-i][0] = state[j]; }
-			round_lyra_v35(state);
-		}
-
-		// reducedSqueezeRow1
-		#pragma unroll 8
-		for (int i = 0; i < 8; i++)
-		{
-			#pragma unroll 12
-			for (int j = 0; j<12; j++) { state[j] ^= Matrix[j][i][0]; }
-			round_lyra_v35(state);
-			#pragma unroll 12
-			for (int j = 0; j<12; j++) { Matrix[j][7-i][1] = Matrix[j][i][0] ^ state[j]; }
-		}
-
-		reduceDuplexRowSetup_test(1, 0, 2);
-		reduceDuplexRowSetup_test(2, 1, 3);
-		reduceDuplexRowSetup_test(3, 0, 4);
-		reduceDuplexRowSetup_test(4, 3, 5);
-		reduceDuplexRowSetup_test(5, 2, 6);
-		reduceDuplexRowSetup_test(6, 1, 7);
-
-		uint64_t rowa;
-		rowa = devectorize(state[0]) & 7;
-		reduceDuplexRow_test(7, rowa, 0);
-		rowa = devectorize(state[0]) & 7;
-		reduceDuplexRow_test(0, rowa, 3);
-		rowa = devectorize(state[0]) & 7;
-		reduceDuplexRow_test(3, rowa, 6);
-		rowa = devectorize(state[0]) & 7;
-		reduceDuplexRow_test(6, rowa, 1);
-		rowa = devectorize(state[0]) & 7;
-		reduceDuplexRow_test(1, rowa, 4);
-		rowa = devectorize(state[0]) & 7;
-		reduceDuplexRow_test(4, rowa, 7);
-		rowa = devectorize(state[0]) & 7;
-		reduceDuplexRow_test(7, rowa, 2);
-		rowa = devectorize(state[0]) & 7;
-		reduceDuplexRow_test(2, rowa, 5);
-
-		absorbblock_test(rowa);
-
-		#pragma unroll
-		for (int i = 0; i<4; i++) {
-			outputHash[threads*i + thread] = devectorize(state[i]);
-		} //password
-
-	} //thread
-}
-#endif
 
 __host__
 void lyra2_cpu_init(int thr_id, int threads)
@@ -524,7 +449,7 @@ void lyra2_cpu_hash_32(int thr_id, int threads, uint32_t startNounce, uint64_t *
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 
-	if (device_sm[device_map[thr_id]] >= 350) {
+	if (device_sm[device_map[thr_id]] >= 320) {
 		lyra2_gpu_hash_32 <<<grid, block>>> (threads, startNounce, d_outputHash);
 	} else {
 		// kernel for compute30 card
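
Note on the dispatch pattern used by this patch: the host wrappers now test the device's compute capability through ccminer's device_sm[] table and launch the uint2-based kernel on SM 3.2 and newer, falling back to the plain uint64_t kernel on SM 3.0. The standalone sketch below illustrates that same pattern outside ccminer; the kernel bodies, the sketch_* and launch_sketch names, and the use of cudaGetDeviceProperties() in place of the cached device_sm[] table are illustrative assumptions, not part of the patch.

#include <cuda_runtime.h>
#include <stdint.h>

// Stand-in for the uint2/funnel-shift kernel used on SM 3.2 and newer.
__global__ void sketch_kernel_v32(uint64_t *data, int threads)
{
	int thread = blockDim.x * blockIdx.x + threadIdx.x;
	if (thread < threads)
		data[thread] ^= 0x1BD11BDAA9FC1A22ull; // placeholder work, not the real skein256 rounds
}

// Stand-in for the plain 64-bit fallback kernel used on SM 3.0.
__global__ void sketch_kernel_v30(uint64_t *data, int threads)
{
	int thread = blockDim.x * blockIdx.x + threadIdx.x;
	if (thread < threads)
		data[thread] ^= 0x1BD11BDAA9FC1A22ull;
}

// Host wrapper mirroring the "device_sm[...] >= 320" branch added by the patch,
// but querying device 0 directly instead of using ccminer's cached table.
void launch_sketch(uint64_t *d_data, int threads)
{
	cudaDeviceProp props;
	cudaGetDeviceProperties(&props, 0);
	int sm = props.major * 100 + props.minor * 10; // same scale as device_sm[] (300, 320, 350, ...)

	const int threadsperblock = 256;
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);

	if (sm >= 320)
		sketch_kernel_v32<<<grid, block>>>(d_data, threads);
	else
		sketch_kernel_v30<<<grid, block>>>(d_data, threads);
	cudaDeviceSynchronize();
}

int main()
{
	const int threads = 1 << 20;
	uint64_t *d_data = NULL;
	cudaMalloc((void**)&d_data, threads * sizeof(uint64_t));
	cudaMemset(d_data, 0, threads * sizeof(uint64_t));
	launch_sketch(d_data, threads);
	cudaFree(d_data);
	return 0;
}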