From 275a028935325e9ff6fb3c8b75d78c0a47d7b59c Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Thu, 16 Apr 2015 01:58:46 +0200 Subject: [PATCH] skein: compute midstate first "Real" optimization based on KlausT precalc --- configure.ac | 2 +- cpuminer-config.h | 6 +- quark/cuda_skein512.cu | 175 +++++++++++++++++++++++++++-------------- 3 files changed, 122 insertions(+), 61 deletions(-) diff --git a/configure.ac b/configure.ac index 61b47e7..2664707 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [1.6.1]) +AC_INIT([ccminer], [1.6.2-git]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpuminer-config.h b/cpuminer-config.h index 1264f97..ca8c8cc 100644 --- a/cpuminer-config.h +++ b/cpuminer-config.h @@ -159,7 +159,7 @@ #define PACKAGE_NAME "ccminer" /* Define to the full name and version of this package. */ -#define PACKAGE_STRING "ccminer 1.6.1" +#define PACKAGE_STRING "ccminer 1.6.2-git" /* Define to the one symbol short name of this package. */ #define PACKAGE_TARNAME "ccminer" @@ -168,7 +168,7 @@ #define PACKAGE_URL "http://github.com/tpruvot/ccminer" /* Define to the version of this package. */ -#define PACKAGE_VERSION "1.6.1" +#define PACKAGE_VERSION "1.6.2-git" /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be @@ -191,7 +191,7 @@ /* undef USE_XOP */ /* Version number of package */ -#define VERSION "1.6.1" +#define VERSION "1.6.2-git" /* Define curl_free() as free() if our version of curl lacks curl_free. */ /* #undef curl_free */ diff --git a/quark/cuda_skein512.cu b/quark/cuda_skein512.cu index 669ec4a..472341e 100644 --- a/quark/cuda_skein512.cu +++ b/quark/cuda_skein512.cu @@ -4,7 +4,7 @@ #include "cuda_helper.h" -static __constant__ uint64_t c_PaddedMessage80[16]; // padded message (80 bytes + padding) +static __constant__ uint64_t c_PaddedMessage80[20]; // padded message (80 bytes + 72 bytes midstate + align) // Take a look at: https://www.schneier.com/skein1.3.pdf @@ -303,6 +303,47 @@ uint64_t skein_rotl64(const uint64_t x, const int offset) TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ } +/* uint64_t midstate for skein 80 */ + +#define TFBIG_ADDKEY_PRE(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \ + w0 = (w0 + SKBI(k, s, 0)); \ + w1 = (w1 + SKBI(k, s, 1)); \ + w2 = (w2 + SKBI(k, s, 2)); \ + w3 = (w3 + SKBI(k, s, 3)); \ + w4 = (w4 + SKBI(k, s, 4)); \ + w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \ + w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \ + w7 = (w7 + SKBI(k, s, 7) + (s)); \ + } + +#define TFBIG_MIX_PRE(x0, x1, rc) { \ + x0 = x0 + x1; \ + x1 = ROTL64(x1, rc) ^ x0; \ + } + +#define TFBIG_MIX8_PRE(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \ + TFBIG_MIX_PRE(w0, w1, rc0); \ + TFBIG_MIX_PRE(w2, w3, rc1); \ + TFBIG_MIX_PRE(w4, w5, rc2); \ + TFBIG_MIX_PRE(w6, w7, rc3); \ + } + +#define TFBIG_4e_PRE(s) { \ + TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \ + TFBIG_MIX8_PRE(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \ + TFBIG_MIX8_PRE(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \ + TFBIG_MIX8_PRE(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \ + } + +#define TFBIG_4o_PRE(s) { \ + TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \ + TFBIG_MIX8_PRE(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \ + TFBIG_MIX8_PRE(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \ + TFBIG_MIX8_PRE(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ + } + /* uint2 variant for SM3.2+ */ #define TFBIG_KINIT_UI2(k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2) { \ @@ -560,69 +601,30 @@ void skein512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *outp uint2 h0, h1, h2, h3, h4, h5, h6, h7, h8; uint2 t0, t1, t2; - // Init - h0 = vectorize(0x4903ADFF749C51CEull); - h1 = vectorize(0x0D95DE399746DF03ull); - h2 = vectorize(0x8FD1934127C79BCEull); - h3 = vectorize(0x9A255629FF352CB1ull); - h4 = vectorize(0x5DB62599DF6CA7B0ull); - h5 = vectorize(0xEABE394CA9D5C3F4ull); - h6 = vectorize(0x991112C71A75B523ull); - h7 = vectorize(0xAE18A40B660FCC33ull); + h0 = vectorize(c_PaddedMessage80[10]); + h1 = vectorize(c_PaddedMessage80[11]); + h2 = vectorize(c_PaddedMessage80[12]); + h3 = vectorize(c_PaddedMessage80[13]); + h4 = vectorize(c_PaddedMessage80[14]); + h5 = vectorize(c_PaddedMessage80[15]); + h6 = vectorize(c_PaddedMessage80[16]); + h7 = vectorize(c_PaddedMessage80[17]); - // 1st step -> etype = 0xE0, ptr = 64, bcount = 0, extra = 0 - t0 = vectorize(64); // ptr - //t1 = vectorize(0xE0ull << 55); // etype - t1 = vectorize(0x7000000000000000ull); - TFBIG_KINIT_UI2(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); - - uint2 p[8]; - #pragma unroll 8 - for (int i = 0; i<8; i++) - p[i] = vectorize(c_PaddedMessage80[i]); - - TFBIG_4e_UI2(0); - TFBIG_4o_UI2(1); - TFBIG_4e_UI2(2); - TFBIG_4o_UI2(3); - TFBIG_4e_UI2(4); - TFBIG_4o_UI2(5); - TFBIG_4e_UI2(6); - TFBIG_4o_UI2(7); - TFBIG_4e_UI2(8); - TFBIG_4o_UI2(9); - TFBIG_4e_UI2(10); - TFBIG_4o_UI2(11); - TFBIG_4e_UI2(12); - TFBIG_4o_UI2(13); - TFBIG_4e_UI2(14); - TFBIG_4o_UI2(15); - TFBIG_4e_UI2(16); - TFBIG_4o_UI2(17); - TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); - - h0 = vectorize(c_PaddedMessage80[0]) ^ p[0]; - h1 = vectorize(c_PaddedMessage80[1]) ^ p[1]; - h2 = vectorize(c_PaddedMessage80[2]) ^ p[2]; - h3 = vectorize(c_PaddedMessage80[3]) ^ p[3]; - h4 = vectorize(c_PaddedMessage80[4]) ^ p[4]; - h5 = vectorize(c_PaddedMessage80[5]) ^ p[5]; - h6 = vectorize(c_PaddedMessage80[6]) ^ p[6]; - h7 = vectorize(c_PaddedMessage80[7]) ^ p[7]; + t2 = vectorize(c_PaddedMessage80[18]); uint32_t nonce = swap ? cuda_swab32(startNounce + thread) : startNounce + thread; - uint2 nounce2 = make_uint2(_LOWORD(c_PaddedMessage80[9]), nonce); + uint2 nonce2 = make_uint2(_LOWORD(c_PaddedMessage80[9]), nonce); - // skein_big_close -> etype = 0x160, ptr = 16, bcount = 1, extra = 16 + uint2 p[8]; p[0] = vectorize(c_PaddedMessage80[8]); - p[1] = nounce2; + p[1] = nonce2; #pragma unroll for (int i = 2; i < 8; i++) p[i] = vectorize(0ull); - t0 = vectorize(0x50ull); // SPH_T64(bcount << 6) + (sph_u64)(extra); - t1 = vectorize(0xB000000000000000ull); // (bcount >> 58) + ((sph_u64)(etype) << 55); + t0 = vectorize(0x50ull); + t1 = vectorize(0xB000000000000000ull); TFBIG_KINIT_UI2(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); TFBIG_4e_UI2(0); TFBIG_4o_UI2(1); @@ -646,7 +648,7 @@ void skein512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *outp uint64_t *outpHash = &output64[thread * 8]; outpHash[0] = c_PaddedMessage80[8] ^ devectorize(p[0]); - outpHash[1] = devectorize(nounce2 ^ p[1]); + outpHash[1] = devectorize(nonce2 ^ p[1]); #pragma unroll for(int i=2; i<8; i++) outpHash[i] = devectorize(p[i]); @@ -888,12 +890,71 @@ void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNoun /* skein / skein2 */ +__host__ +static void skein512_precalc_80(uint64_t* message) +{ + uint64_t h0, h1, h2, h3, h4, h5, h6, h7, h8; + uint64_t t0, t1, t2; + + h0 = 0x4903ADFF749C51CEull; + h1 = 0x0D95DE399746DF03ull; + h2 = 0x8FD1934127C79BCEull; + h3 = 0x9A255629FF352CB1ull; + h4 = 0x5DB62599DF6CA7B0ull; + h5 = 0xEABE394CA9D5C3F4ull; + h6 = 0x991112C71A75B523ull; + h7 = 0xAE18A40B660FCC33ull; + h8 = h0 ^ h1 ^ h2 ^ h3 ^ h4 ^ h5 ^ h6 ^ h7 ^ SPH_C64(0x1BD11BDAA9FC1A22); + + t0 = 64; // ptr + t1 = 0x7000000000000000ull; + t2 = 0x7000000000000040ull; + + uint64_t p[8]; + for (int i = 0; i<8; i++) + p[i] = message[i]; + + TFBIG_4e_PRE(0); + TFBIG_4o_PRE(1); + TFBIG_4e_PRE(2); + TFBIG_4o_PRE(3); + TFBIG_4e_PRE(4); + TFBIG_4o_PRE(5); + TFBIG_4e_PRE(6); + TFBIG_4o_PRE(7); + TFBIG_4e_PRE(8); + TFBIG_4o_PRE(9); + TFBIG_4e_PRE(10); + TFBIG_4o_PRE(11); + TFBIG_4e_PRE(12); + TFBIG_4o_PRE(13); + TFBIG_4e_PRE(14); + TFBIG_4o_PRE(15); + TFBIG_4e_PRE(16); + TFBIG_4o_PRE(17); + TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); + + message[10] = message[0] ^ p[0]; + message[11] = message[1] ^ p[1]; + message[12] = message[2] ^ p[2]; + message[13] = message[3] ^ p[3]; + message[14] = message[4] ^ p[4]; + message[15] = message[5] ^ p[5]; + message[16] = message[6] ^ p[6]; + message[17] = message[7] ^ p[7]; + + message[18] = t2; +} + __host__ void skein512_cpu_setBlock_80(void *pdata) { - cudaMemcpyToSymbol(c_PaddedMessage80, pdata, 80, 0, cudaMemcpyHostToDevice); + uint64_t message[20]; + memcpy(&message[0], pdata, 80); + skein512_precalc_80(message); + cudaMemcpyToSymbol(c_PaddedMessage80, message, sizeof(message), 0, cudaMemcpyHostToDevice); - CUDA_SAFE_CALL(cudaStreamSynchronize(NULL)); + CUDA_SAFE_CALL(cudaGetLastError()); } __host__