From feb99d020fc77cb04e5cb5a1ecd059e27ac523e7 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 28 Jan 2017 19:55:10 +0100 Subject: [PATCH] skein: merge the double implementations in one based on alexis skein kernels, tested ok on SM 2.1 and 3.0 code is a bit hard to read but... well... users dont care :p --- cuda_vectors.h | 34 + quark/cuda_skein512.cu | 1369 ++++++++++--------- quark/cuda_skein512_sp.cuh | 2634 ------------------------------------ 3 files changed, 761 insertions(+), 3276 deletions(-) create mode 100644 cuda_vectors.h delete mode 100644 quark/cuda_skein512_sp.cuh diff --git a/cuda_vectors.h b/cuda_vectors.h new file mode 100644 index 0000000..13d77d0 --- /dev/null +++ b/cuda_vectors.h @@ -0,0 +1,34 @@ +#include "cuda_helper.h" + +/* Macros for uint2 operations (used by skein) */ + +__device__ __forceinline__ +uint2 ROR8(const uint2 a) { + uint2 result; + result.x = __byte_perm(a.x, a.y, 0x4321); + result.y = __byte_perm(a.y, a.x, 0x4321); + return result; +} + +__device__ __forceinline__ +uint2 ROL24(const uint2 a) { + uint2 result; + result.x = __byte_perm(a.x, a.y, 0x0765); + result.y = __byte_perm(a.y, a.x, 0x0765); + return result; +} + +static __device__ __forceinline__ uint2 operator+ (const uint2 a, const uint32_t b) +{ +#if 0 && defined(__CUDA_ARCH__) && CUDA_VERSION < 7000 + uint2 result; + asm( + "add.cc.u32 %0,%2,%4; \n\t" + "addc.u32 %1,%3,%5; \n\t" + : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b), "r"(0)); + return result; +#else + return vectorize(devectorize(a) + b); +#endif +} + diff --git a/quark/cuda_skein512.cu b/quark/cuda_skein512.cu index 12ba9bb..6d706ed 100644 --- a/quark/cuda_skein512.cu +++ b/quark/cuda_skein512.cu @@ -1,49 +1,13 @@ -#define SP_KERNEL +/* SKEIN 64 and 80 based on Alexis Provost version */ -#ifdef SP_KERNEL -#include "cuda_skein512_sp.cuh" -#undef TFBIG_KINIT -#undef TFBIG_ADDKEY -#undef TFBIG_MIX -#else +#define TPB52 512 +#define TPB50 256 #include -#include -#include - -#include "cuda_helper.h" - -#endif - -static __constant__ uint64_t c_PaddedMessage80[20]; // padded message (80 bytes + 72 bytes midstate + align) - -// Take a look at: https://www.schneier.com/skein1.3.pdf - -#define SHL(x, n) ((x) << (n)) -#define SHR(x, n) ((x) >> (n)) +#include +#include -#if __CUDA_ARCH__ > 300 -__device__ -uint64_t skein_rotl64(const uint64_t x, const int offset) -{ - uint64_t res; - asm("{\n\t" - ".reg .u32 tl,th,vl,vh;\n\t" - ".reg .pred p;\n\t" - "mov.b64 {tl,th}, %1;\n\t" - "shf.l.wrap.b32 vl, tl, th, %2;\n\t" - "shf.l.wrap.b32 vh, th, tl, %2;\n\t" - "setp.lt.u32 p, %2, 32;\n\t" - "@!p mov.b64 %0, {vl,vh};\n\t" - "@p mov.b64 %0, {vh,vl};\n\t" - "}" - : "=l"(res) : "l"(x) , "r"(offset) - ); - return res; -} -#undef ROTL64 -#define ROTL64 skein_rotl64 -#endif +/* ************************ */ /* * M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7). @@ -269,357 +233,551 @@ uint64_t skein_rotl64(const uint64_t x, const int offset) #define SKBI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i)) #define SKBT(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v)) -#define TFBIG_KINIT(k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2) { \ - k8 = ((k0 ^ k1) ^ (k2 ^ k3)) ^ ((k4 ^ k5) ^ (k6 ^ k7)) \ - ^ SPH_C64(0x1BD11BDAA9FC1A22); \ - t2 = t0 ^ t1; \ - } - #define TFBIG_ADDKEY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \ - w0 = (w0 + SKBI(k, s, 0)); \ - w1 = (w1 + SKBI(k, s, 1)); \ - w2 = (w2 + SKBI(k, s, 2)); \ - w3 = (w3 + SKBI(k, s, 3)); \ - w4 = (w4 + SKBI(k, s, 4)); \ - w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \ - w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \ - w7 = (w7 + SKBI(k, s, 7) + (uint64_t)s); \ - } + w0 = (w0 + SKBI(k, s, 0)); \ + w1 = (w1 + SKBI(k, s, 1)); \ + w2 = (w2 + SKBI(k, s, 2)); \ + w3 = (w3 + SKBI(k, s, 3)); \ + w4 = (w4 + SKBI(k, s, 4)); \ + w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \ + w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \ + w7 = (w7 + SKBI(k, s, 7) + make_uint2(s,0); \ +} #define TFBIG_MIX(x0, x1, rc) { \ - x0 = x0 + x1; \ - x1 = ROTL64(x1, rc) ^ x0; \ - } + x0 = x0 + x1; \ + x1 = ROL2(x1, rc) ^ x0; \ +} #define TFBIG_MIX8(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \ - TFBIG_MIX(w0, w1, rc0); \ - TFBIG_MIX(w2, w3, rc1); \ - TFBIG_MIX(w4, w5, rc2); \ - TFBIG_MIX(w6, w7, rc3); \ - } + TFBIG_MIX(w0, w1, rc0); \ + TFBIG_MIX(w2, w3, rc1); \ + TFBIG_MIX(w4, w5, rc2); \ + TFBIG_MIX(w6, w7, rc3); \ +} #define TFBIG_4e(s) { \ - TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ - TFBIG_MIX8(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \ - TFBIG_MIX8(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \ - TFBIG_MIX8(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \ - TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \ - } + TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \ + TFBIG_MIX8(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \ + TFBIG_MIX8(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \ + TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \ +} #define TFBIG_4o(s) { \ - TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ - TFBIG_MIX8(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \ - TFBIG_MIX8(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \ - TFBIG_MIX8(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \ - TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ - } - -/* uint64_t midstate for skein 80 */ - -#define TFBIG_ADDKEY_PRE(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \ - w0 = (w0 + SKBI(k, s, 0)); \ - w1 = (w1 + SKBI(k, s, 1)); \ - w2 = (w2 + SKBI(k, s, 2)); \ - w3 = (w3 + SKBI(k, s, 3)); \ - w4 = (w4 + SKBI(k, s, 4)); \ - w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \ - w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \ - w7 = (w7 + SKBI(k, s, 7) + (s)); \ - } - -#define TFBIG_MIX_PRE(x0, x1, rc) { \ - x0 = x0 + x1; \ - x1 = ROTL64(x1, rc) ^ x0; \ - } - -#define TFBIG_MIX8_PRE(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \ - TFBIG_MIX_PRE(w0, w1, rc0); \ - TFBIG_MIX_PRE(w2, w3, rc1); \ - TFBIG_MIX_PRE(w4, w5, rc2); \ - TFBIG_MIX_PRE(w6, w7, rc3); \ - } - -#define TFBIG_4e_PRE(s) { \ - TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ - TFBIG_MIX8_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \ - TFBIG_MIX8_PRE(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \ - TFBIG_MIX8_PRE(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \ - TFBIG_MIX8_PRE(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \ - } - -#define TFBIG_4o_PRE(s) { \ - TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ - TFBIG_MIX8_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \ - TFBIG_MIX8_PRE(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \ - TFBIG_MIX8_PRE(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \ - TFBIG_MIX8_PRE(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ - } + TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \ + TFBIG_MIX8(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \ + TFBIG_MIX8(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \ + TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ +} /* uint2 variant for SM3.2+ */ #define TFBIG_KINIT_UI2(k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2) { \ - k8 = ((k0 ^ k1) ^ (k2 ^ k3)) ^ ((k4 ^ k5) ^ (k6 ^ k7)) \ - ^ vectorize(SPH_C64(0x1BD11BDAA9FC1A22)); \ - t2 = t0 ^ t1; \ - } + k8 = ((k0 ^ k1) ^ (k2 ^ k3)) ^ ((k4 ^ k5) ^ (k6 ^ k7)) \ + ^ vectorize(0x1BD11BDAA9FC1A22); \ + t2 = t0 ^ t1; \ +} #define TFBIG_ADDKEY_UI2(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \ - w0 = (w0 + SKBI(k, s, 0)); \ - w1 = (w1 + SKBI(k, s, 1)); \ - w2 = (w2 + SKBI(k, s, 2)); \ - w3 = (w3 + SKBI(k, s, 3)); \ - w4 = (w4 + SKBI(k, s, 4)); \ - w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \ - w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \ - w7 = (w7 + SKBI(k, s, 7) + vectorize(s)); \ - } + w0 = (w0 + SKBI(k, s, 0)); \ + w1 = (w1 + SKBI(k, s, 1)); \ + w2 = (w2 + SKBI(k, s, 2)); \ + w3 = (w3 + SKBI(k, s, 3)); \ + w4 = (w4 + SKBI(k, s, 4)); \ + w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \ + w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \ + w7 = (w7 + SKBI(k, s, 7) + vectorize(s)); \ +} -#define TFBIG_MIX_UI2(x0, x1, rc) { \ - x0 = x0 + x1; \ - x1 = ROL2(x1, rc) ^ x0; \ - } +#define TFBIG_ADDKEY_PRE(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \ + w0 = (w0 + SKBI(k, s, 0)); \ + w1 = (w1 + SKBI(k, s, 1)); \ + w2 = (w2 + SKBI(k, s, 2)); \ + w3 = (w3 + SKBI(k, s, 3)); \ + w4 = (w4 + SKBI(k, s, 4)); \ + w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \ + w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \ + w7 = (w7 + SKBI(k, s, 7) + (s)); \ +} -#define TFBIG_MIX8_UI2(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \ - TFBIG_MIX_UI2(w0, w1, rc0); \ - TFBIG_MIX_UI2(w2, w3, rc1); \ - TFBIG_MIX_UI2(w4, w5, rc2); \ - TFBIG_MIX_UI2(w6, w7, rc3); \ - } +#define TFBIG_MIX_UI2(x0, x1, rc) { \ + x0 = x0 + x1; \ + x1 = ROL2(x1, rc) ^ x0; \ +} -#define TFBIG_4e_UI2(s) { \ - TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ - TFBIG_MIX8_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \ - TFBIG_MIX8_UI2(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \ - TFBIG_MIX8_UI2(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \ - TFBIG_MIX8_UI2(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \ - } +#define TFBIG_MIX_PRE(x0, x1, rc) { \ + x0 = x0 + x1; \ + x1 = ROTL64(x1, rc) ^ x0; \ +} -#define TFBIG_4o_UI2(s) { \ - TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ - TFBIG_MIX8_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \ - TFBIG_MIX8_UI2(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \ - TFBIG_MIX8_UI2(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \ - TFBIG_MIX8_UI2(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ - } +#define TFBIG_MIX8_UI2(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \ + TFBIG_MIX_UI2(w0, w1, rc0); \ + TFBIG_MIX_UI2(w2, w3, rc1); \ + TFBIG_MIX_UI2(w4, w5, rc2); \ + TFBIG_MIX_UI2(w6, w7, rc3); \ +} +#define TFBIG_MIX8_PRE(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \ + TFBIG_MIX_PRE(w0, w1, rc0); \ + TFBIG_MIX_PRE(w2, w3, rc1); \ + TFBIG_MIX_PRE(w4, w5, rc2); \ + TFBIG_MIX_PRE(w6, w7, rc3); \ +} -__global__ -void quark_skein512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t * const __restrict__ g_hash, uint32_t *g_nonceVector) -{ -#if !defined(SP_KERNEL) || __CUDA_ARCH__ < 500 - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - // Skein - uint2 h0, h1, h2, h3, h4, h5, h6, h7, h8; - uint2 t0, t1, t2; +#define TFBIG_4e_UI2(s) { \ + TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \ + TFBIG_MIX8_UI2(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \ + TFBIG_MIX8_UI2(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \ + TFBIG_MIX8_UI2(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \ +} - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); +#define TFBIG_4e_PRE(s) { \ + TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \ + TFBIG_MIX8_PRE(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \ + TFBIG_MIX8_PRE(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \ + TFBIG_MIX8_PRE(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \ +} - uint32_t hashPosition = nounce - startNounce; - uint2 *inpHash = (uint2*) (&g_hash[hashPosition * 8U]); +#define TFBIG_4o_UI2(s) { \ + TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \ + TFBIG_MIX8_UI2(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \ + TFBIG_MIX8_UI2(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \ + TFBIG_MIX8_UI2(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ +} - // Init - h0 = vectorize(0x4903ADFF749C51CEull); - h1 = vectorize(0x0D95DE399746DF03ull); - h2 = vectorize(0x8FD1934127C79BCEull); - h3 = vectorize(0x9A255629FF352CB1ull); - h4 = vectorize(0x5DB62599DF6CA7B0ull); - h5 = vectorize(0xEABE394CA9D5C3F4ull); - h6 = vectorize(0x991112C71A75B523ull); - h7 = vectorize(0xAE18A40B660FCC33ull); +#define TFBIG_4o_PRE(s) { \ + TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ + TFBIG_MIX8_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \ + TFBIG_MIX8_PRE(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \ + TFBIG_MIX8_PRE(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \ + TFBIG_MIX8_PRE(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ +} - uint2 p[8]; - // 1st Round -> etype = 480, ptr = 64, bcount = 0, data = msg - #pragma unroll 8 - for (int i = 0; i < 8; i++) - p[i] = inpHash[i]; +#define macro1() {\ + p[0] += p[1]; p[2] += p[3]; p[4] += p[5]; p[6] += p[7]; p[1] = ROL2(p[1],46) ^ p[0]; \ + p[3] = ROL2(p[3],36) ^ p[2]; p[5] = ROL2(p[5],19) ^ p[4]; p[7] = ROL2(p[7], 37) ^ p[6]; \ + p[2] += p[1]; p[4] += p[7]; p[6] += p[5]; p[0] += p[3]; p[1] = ROL2(p[1],33) ^ p[2]; \ + p[7] = ROL2(p[7],27) ^ p[4]; p[5] = ROL2(p[5],14) ^ p[6]; p[3] = ROL2(p[3], 42) ^ p[0]; \ + p[4] += p[1]; p[6] += p[3]; p[0] += p[5]; p[2] += p[7]; p[1] = ROL2(p[1],17) ^ p[4]; \ + p[3] = ROL2(p[3],49) ^ p[6]; p[5] = ROL2(p[5],36) ^ p[0]; p[7] = ROL2(p[7], 39) ^ p[2]; \ + p[6] += p[1]; p[0] += p[7]; p[2] += p[5]; p[4] += p[3]; p[1] = ROL2(p[1],44) ^ p[6]; \ + p[7] = ROL2(p[7], 9) ^ p[0]; p[5] = ROL2(p[5],54) ^ p[2]; p[3] = ROR8(p[3]) ^ p[4]; \ +} - t0 = make_uint2(0x40, 0); // 64 - t1 = vectorize(0xf000000000000000ULL); // 480ull << 55 (etype) +#define macro2() { \ + p[0] += p[1]; p[2] += p[3]; p[4] += p[5]; p[6] += p[7]; p[1] = ROL2(p[1], 39) ^ p[0]; \ + p[3] = ROL2(p[3], 30) ^ p[2]; p[5] = ROL2(p[5], 34) ^ p[4]; p[7] = ROL24(p[7]) ^ p[6]; \ + p[2] += p[1]; p[4] += p[7]; p[6] += p[5]; p[0] += p[3]; p[1] = ROL2(p[1], 13) ^ p[2]; \ + p[7] = ROL2(p[7], 50) ^ p[4]; p[5] = ROL2(p[5], 10) ^ p[6]; p[3] = ROL2(p[3], 17) ^ p[0]; \ + p[4] += p[1]; p[6] += p[3]; p[0] += p[5]; p[2] += p[7]; p[1] = ROL2(p[1], 25) ^ p[4]; \ + p[3] = ROL2(p[3], 29) ^ p[6]; p[5] = ROL2(p[5], 39) ^ p[0]; p[7] = ROL2(p[7], 43) ^ p[2]; \ + p[6] += p[1]; p[0] += p[7]; p[2] += p[5]; p[4] += p[3]; p[1] = ROL8(p[1]) ^ p[6]; \ + p[7] = ROL2(p[7], 35) ^ p[0]; p[5] = ROR8(p[5]) ^ p[2]; p[3] = ROL2(p[3], 22) ^ p[4]; \ +} -//#if CUDA_VERSION >= 7000 - // doesnt really affect x11 perfs. - __threadfence(); -//#endif - //TFBIG_KINIT_UI2(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); - h8 = vectorize(0xcab2076d98173ec4ULL); - t2 = vectorize(0xf000000000000040ULL); +#define macro3() { \ + hash64[0]+= hash64[1]; hash64[2]+= hash64[3]; hash64[4]+= hash64[5]; hash64[6]+= hash64[7]; \ + hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; \ + hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; \ + hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; \ + hash64[7] = ROL24(hash64[7]) ^ hash64[6]; \ + hash64[2]+= hash64[1]; hash64[4]+= hash64[7]; hash64[6]+= hash64[5]; hash64[0]+= hash64[3]; \ + hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; \ + hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; \ + hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; \ + hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; \ + hash64[4]+= hash64[1]; hash64[6]+= hash64[3]; hash64[0]+= hash64[5]; hash64[2]+= hash64[7]; \ + hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; \ + hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; \ + hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; \ + hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; \ + hash64[6]+= hash64[1]; hash64[0]+= hash64[7]; hash64[2]+= hash64[5]; hash64[4]+= hash64[3]; \ + hash64[1] = ROL8(hash64[1]) ^ hash64[6]; \ + hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; \ + hash64[5] = ROR8(hash64[5]) ^ hash64[2]; \ + hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; \ +} - TFBIG_4e_UI2(0); - TFBIG_4o_UI2(1); - TFBIG_4e_UI2(2); - TFBIG_4o_UI2(3); - TFBIG_4e_UI2(4); - TFBIG_4o_UI2(5); - TFBIG_4e_UI2(6); - TFBIG_4o_UI2(7); - TFBIG_4e_UI2(8); - TFBIG_4o_UI2(9); - TFBIG_4e_UI2(10); - TFBIG_4o_UI2(11); - TFBIG_4e_UI2(12); - TFBIG_4o_UI2(13); - TFBIG_4e_UI2(14); - TFBIG_4o_UI2(15); - TFBIG_4e_UI2(16); - TFBIG_4o_UI2(17); - TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); +#define macro4() {\ + hash64[0]+= hash64[1]; hash64[2]+= hash64[3]; hash64[4]+= hash64[5]; hash64[6]+= hash64[7]; \ + hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; \ + hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; \ + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; \ + hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; \ + hash64[2]+= hash64[1]; hash64[4]+= hash64[7]; hash64[6]+= hash64[5]; hash64[0]+= hash64[3]; \ + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; \ + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; \ + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; \ + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; \ + hash64[4]+= hash64[1]; hash64[6]+= hash64[3]; hash64[0]+= hash64[5]; hash64[2]+= hash64[7]; \ + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; \ + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; \ + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; \ + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; \ + hash64[6]+= hash64[1]; hash64[0]+= hash64[7]; hash64[2]+= hash64[5]; hash64[4]+= hash64[3]; \ + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; \ + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; \ + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; \ + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; \ +} - h0 = inpHash[0] ^ p[0]; - h1 = inpHash[1] ^ p[1]; - h2 = inpHash[2] ^ p[2]; - h3 = inpHash[3] ^ p[3]; - h4 = inpHash[4] ^ p[4]; - h5 = inpHash[5] ^ p[5]; - h6 = inpHash[6] ^ p[6]; - h7 = inpHash[7] ^ p[7]; +__constant__ const uint2 buffer[112] = { + {0x749C51CE, 0x4903ADFF}, {0x9746DF03, 0x0D95DE39}, {0x27C79BCE, 0x8FD19341}, {0xFF352CB1, 0x9A255629}, + {0xDF6CA7B0, 0x5DB62599}, {0xA9D5C434, 0xEABE394C}, {0x1A75B523, 0x891112C7}, {0x660FCC33, 0xAE18A40B}, + {0x9746DF03, 0x0D95DE39}, {0x27C79BCE, 0x8FD19341}, {0xFF352CB1, 0x9A255629}, {0xDF6CA7B0, 0x5DB62599}, + {0xA9D5C3F4, 0xEABE394C}, {0x1A75B523, 0x891112C7}, {0x660FCC73, 0x9E18A40B}, {0x98173EC5, 0xCAB2076D}, + {0x27C79BCE, 0x8FD19341}, {0xFF352CB1, 0x9A255629}, {0xDF6CA7B0, 0x5DB62599}, {0xA9D5C3F4, 0xEABE394C}, + {0x1A75B523, 0x991112C7}, {0x660FCC73, 0x9E18A40B}, {0x98173F04, 0xCAB2076D}, {0x749C51D0, 0x4903ADFF}, + {0xFF352CB1, 0x9A255629}, {0xDF6CA7B0, 0x5DB62599}, {0xA9D5C3F4, 0xEABE394C}, {0x1A75B523, 0x991112C7}, + {0x660FCC33, 0xAE18A40B}, {0x98173F04, 0xCAB2076D}, {0x749C51CE, 0x3903ADFF}, {0x9746DF06, 0x0D95DE39}, + {0xDF6CA7B0, 0x5DB62599}, {0xA9D5C3F4, 0xEABE394C}, {0x1A75B523, 0x991112C7}, {0x660FCC33, 0xAE18A40B}, + {0x98173EC4, 0xCAB2076D}, {0x749C51CE, 0x3903ADFF}, {0x9746DF43, 0xFD95DE39}, {0x27C79BD2, 0x8FD19341}, + {0xA9D5C3F4, 0xEABE394C}, {0x1A75B523, 0x991112C7}, {0x660FCC33, 0xAE18A40B}, {0x98173EC4, 0xCAB2076D}, + {0x749C51CE, 0x4903ADFF}, {0x9746DF43, 0xFD95DE39}, {0x27C79C0E, 0x8FD19341}, {0xFF352CB6, 0x9A255629}, + {0x1A75B523, 0x991112C7}, {0x660FCC33, 0xAE18A40B}, {0x98173EC4, 0xCAB2076D}, {0x749C51CE, 0x4903ADFF}, + {0x9746DF03, 0x0D95DE39}, {0x27C79C0E, 0x8FD19341}, {0xFF352CB1, 0x8A255629}, {0xDF6CA7B6, 0x5DB62599}, + {0x660FCC33, 0xAE18A40B}, {0x98173EC4, 0xCAB2076D}, {0x749C51CE, 0x4903ADFF}, {0x9746DF03, 0x0D95DE39}, + {0x27C79BCE, 0x8FD19341}, {0xFF352CB1, 0x8A255629}, {0xDF6CA7F0, 0x4DB62599}, {0xA9D5C3FB, 0xEABE394C}, + {0x98173EC4, 0xCAB2076D}, {0x749C51CE, 0x4903ADFF}, {0x9746DF03, 0x0D95DE39}, {0x27C79BCE, 0x8FD19341}, + {0xFF352CB1, 0x9A255629}, {0xDF6CA7F0, 0x4DB62599}, {0xA9D5C434, 0xEABE394C}, {0x1A75B52B, 0x991112C7}, + {0x749C51CE, 0x4903ADFF}, {0x9746DF03, 0x0D95DE39}, {0x27C79BCE, 0x8FD19341}, {0xFF352CB1, 0x9A255629}, + {0xDF6CA7B0, 0x5DB62599}, {0xA9D5C434, 0xEABE394C}, {0x1A75B523, 0x891112C7}, {0x660FCC3C, 0xAE18A40B}, + {0x9746DF03, 0x0D95DE39}, {0x27C79BCE, 0x8FD19341}, {0xFF352CB1, 0x9A255629}, {0xDF6CA7B0, 0x5DB62599}, + {0xA9D5C3F4, 0xEABE394C}, {0x1A75B523, 0x891112C7}, {0x660FCC73, 0x9E18A40B}, {0x98173ece, 0xcab2076d}, + {0x27C79BCE, 0x8FD19341}, {0xFF352CB1, 0x9A255629}, {0xDF6CA7B0, 0x5DB62599}, {0xA9D5C3F4, 0xEABE394C}, + {0x1A75B523, 0x991112C7}, {0x660FCC73, 0x9E18A40B}, {0x98173F04, 0xCAB2076D}, {0x749C51D9, 0x4903ADFF}, + {0xFF352CB1, 0x9A255629}, {0xDF6CA7B0, 0x5DB62599}, {0xA9D5C3F4, 0xEABE394C}, {0x1A75B523, 0x991112C7}, + {0x660FCC33, 0xAE18A40B}, {0x98173F04, 0xCAB2076D}, {0x749C51CE, 0x3903ADFF}, {0x9746DF0F, 0x0D95DE39}, + {0xDF6CA7B0, 0x5DB62599}, {0xA9D5C3F4, 0xEABE394C}, {0x1A75B523, 0x991112C7}, {0x660FCC33, 0xAE18A40B}, + {0x98173EC4, 0xCAB2076D}, {0x749C51CE, 0x3903ADFF}, {0x9746DF43, 0xFD95DE39}, {0x27C79BDB, 0x8FD19341} +}; - // 2. Runde -> etype = 510, ptr = 8, bcount = 0, data = 0 - #pragma unroll 8 - for(int i=0; i<8; i++) - p[i] = vectorize(0); +__global__ +#if __CUDA_ARCH__ > 500 +__launch_bounds__(TPB52, 3) +#else +__launch_bounds__(TPB50, 5) +#endif +void quark_skein512_gpu_hash_64(const uint32_t threads,uint64_t* __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - t0 = make_uint2(0x8, 0); - t1 = vectorize(0xff00000000000000ULL); // etype + if (thread < threads){ - TFBIG_KINIT_UI2(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); - TFBIG_4e_UI2(0); - TFBIG_4o_UI2(1); - TFBIG_4e_UI2(2); - TFBIG_4o_UI2(3); - TFBIG_4e_UI2(4); - TFBIG_4o_UI2(5); - TFBIG_4e_UI2(6); - TFBIG_4o_UI2(7); - TFBIG_4e_UI2(8); - TFBIG_4o_UI2(9); - TFBIG_4e_UI2(10); - TFBIG_4o_UI2(11); - TFBIG_4e_UI2(12); - TFBIG_4o_UI2(13); - TFBIG_4e_UI2(14); - TFBIG_4o_UI2(15); - TFBIG_4e_UI2(16); - TFBIG_4o_UI2(17); - TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); - - // output - uint64_t *outpHash = &g_hash[hashPosition * 8U]; - #pragma unroll 8 - for(int i=0; i<8; i++) - outpHash[i] = devectorize(p[i]); + // Skein + uint2 p[8], h[9]; + + const uint32_t hashPosition = (g_nonceVector == NULL) ? thread : g_nonceVector[thread]; + + uint64_t *Hash = &g_hash[hashPosition<<3]; + + uint2x4 *phash = (uint2x4*)Hash; + *(uint2x4*)&p[0] = __ldg4(&phash[0]); + *(uint2x4*)&p[4] = __ldg4(&phash[1]); + + h[0] = p[0]; h[1] = p[1]; h[2] = p[2]; h[3] = p[3]; + h[4] = p[4]; h[5] = p[5]; h[6] = p[6]; h[7] = p[7]; + + p[0] += buffer[ 0]; p[1] += buffer[ 1]; p[2] += buffer[ 2]; p[3] += buffer[3]; + p[4] += buffer[ 4]; p[5] += buffer[ 5]; p[6] += buffer[ 6]; p[7] += buffer[7]; + macro1(); + p[0] += buffer[ 8]; p[1] += buffer[ 9]; p[2] += buffer[ 10]; p[3] += buffer[11]; + p[4] += buffer[ 12]; p[5] += buffer[ 13]; p[6] += buffer[ 14]; p[7] += buffer[15]; + macro2(); + p[0] += buffer[ 16]; p[1] += buffer[ 17]; p[2] += buffer[ 18]; p[3] += buffer[19]; + p[4] += buffer[ 20]; p[5] += buffer[ 21]; p[6] += buffer[ 22]; p[7] += buffer[23]; + macro1(); + p[0] += buffer[ 24]; p[1] += buffer[ 25]; p[2] += buffer[ 26]; p[3] += buffer[27]; + p[4] += buffer[ 28]; p[5] += buffer[ 29]; p[6] += buffer[ 30]; p[7] += buffer[31]; + macro2(); + p[0] += buffer[ 32]; p[1] += buffer[ 33]; p[2] += buffer[ 34]; p[3] += buffer[35]; + p[4] += buffer[ 36]; p[5] += buffer[ 37]; p[6] += buffer[ 38]; p[7] += buffer[39]; + macro1(); + p[0] += buffer[ 40]; p[1] += buffer[ 41]; p[2] += buffer[ 42]; p[3] += buffer[43]; + p[4] += buffer[ 44]; p[5] += buffer[ 45]; p[6] += buffer[ 46]; p[7] += buffer[47]; + macro2(); + p[0] += buffer[ 48]; p[1] += buffer[ 49]; p[2] += buffer[ 50]; p[3] += buffer[51]; + p[4] += buffer[ 52]; p[5] += buffer[ 53]; p[6] += buffer[ 54]; p[7] += buffer[55]; + macro1(); + p[0] += buffer[ 56]; p[1] += buffer[ 57]; p[2] += buffer[ 58]; p[3] += buffer[59]; + p[4] += buffer[ 60]; p[5] += buffer[ 61]; p[6] += buffer[ 62]; p[7] += buffer[63]; + macro2(); + p[0] += buffer[ 64]; p[1] += buffer[ 65]; p[2] += buffer[ 66]; p[3] += buffer[67]; + p[4] += buffer[ 68]; p[5] += buffer[ 69]; p[6] += buffer[ 70]; p[7] += buffer[71]; + macro1(); + p[0] += buffer[ 72]; p[1] += buffer[ 73]; p[2] += buffer[ 74]; p[3] += buffer[75]; + p[4] += buffer[ 76]; p[5] += buffer[ 77]; p[6] += buffer[ 78]; p[7] += buffer[79]; + macro2(); + p[0] += buffer[ 80]; p[1] += buffer[ 81]; p[2] += buffer[ 82]; p[3] += buffer[83]; + p[4] += buffer[ 84]; p[5] += buffer[ 85]; p[6] += buffer[ 86]; p[7] += buffer[87]; + macro1(); + p[0] += buffer[ 88]; p[1] += buffer[ 89]; p[2] += buffer[ 90]; p[3] += buffer[91]; + p[4] += buffer[ 92]; p[5] += buffer[ 93]; p[6] += buffer[ 94]; p[7] += buffer[95]; + macro2(); + p[0] += buffer[ 96]; p[1] += buffer[ 97]; p[2] += buffer[ 98]; p[3] += buffer[99]; + p[4] += buffer[100]; p[5] += buffer[101]; p[6] += buffer[102]; p[7] += buffer[103]; + macro1(); + p[0] += buffer[104]; p[1] += buffer[105]; p[2] += buffer[106]; p[3] += buffer[107]; + p[4] += buffer[108]; p[5] += buffer[109]; p[6] += buffer[110]; p[7] += buffer[111]; + macro2(); + p[0]+= make_uint2(0xA9D5C3F4, 0xEABE394C); p[1]+= make_uint2(0x1A75B523, 0x991112C7); + p[2]+= make_uint2(0x660FCC33, 0xAE18A40B); p[3]+= make_uint2(0x98173EC4, 0xCAB2076D); + p[4]+= make_uint2(0x749C51CE, 0x4903ADFF); p[5]+= make_uint2(0x9746DF43, 0xFD95DE39); + p[6]+= make_uint2(0x27C79C0E, 0x8FD19341); p[7]+= make_uint2(0xFF352CBF, 0x9A255629); + macro1(); + p[0]+= make_uint2(0x1A75B523, 0x991112C7); p[1]+= make_uint2(0x660FCC33, 0xAE18A40B); + p[2]+= make_uint2(0x98173EC4, 0xCAB2076D); p[3]+= make_uint2(0x749C51CE, 0x4903ADFF); + p[4]+= make_uint2(0x9746DF03, 0x0D95DE39); p[5]+= make_uint2(0x27C79C0E, 0x8FD19341); + p[6]+= make_uint2(0xFF352CB1, 0x8A255629); p[7]+= make_uint2(0xDF6CA7BF, 0x5DB62599); + macro2(); + p[0] += vectorize(0xAE18A40B660FCC33); p[1] += vectorize(0xcab2076d98173ec4); + p[2] += vectorize(0x4903ADFF749C51CE); p[3] += vectorize(0x0D95DE399746DF03); + p[4] += vectorize(0x8FD1934127C79BCE); p[5] += vectorize(0x8A255629FF352CB1); + p[6] += vectorize(0x4DB62599DF6CA7F0); p[7] += vectorize(0xEABE394CA9D5C3F4 + 16); + macro1(); + p[0] += vectorize(0xcab2076d98173ec4); p[1] += vectorize(0x4903ADFF749C51CE); + p[2] += vectorize(0x0D95DE399746DF03); p[3] += vectorize(0x8FD1934127C79BCE); + p[4] += vectorize(0x9A255629FF352CB1); p[5] += vectorize(0x4DB62599DF6CA7F0); + p[6] += vectorize(0xEABE394CA9D5C3F4 + 0x0000000000000040); + p[7] += vectorize(0x991112C71A75B523 + 17); + macro2(); + p[0] += vectorize(0x4903ADFF749C51CE); p[1] += vectorize(0x0D95DE399746DF03); + p[2] += vectorize(0x8FD1934127C79BCE); p[3] += vectorize(0x9A255629FF352CB1); + p[4] += vectorize(0x5DB62599DF6CA7B0); p[5] += vectorize(0xEABE394CA9D5C3F4 + 0x0000000000000040); + p[6] += vectorize(0x891112C71A75B523); p[7] += vectorize(0xAE18A40B660FCC33 + 18); + + #define h0 p[0] + #define h1 p[1] + #define h2 p[2] + #define h3 p[3] + #define h4 p[4] + #define h5 p[5] + #define h6 p[6] + #define h7 p[7] + + h0 ^= h[0]; h1 ^= h[1]; h2 ^= h[2]; h3 ^= h[3]; + h4 ^= h[4]; h5 ^= h[5]; h6 ^= h[6]; h7 ^= h[7]; + + uint2 skein_h8 = h0 ^ h1 ^ h2 ^ h3 ^ h4 ^ h5 ^ h6 ^ h7 ^ vectorize(0x1BD11BDAA9FC1A22); + + uint2 hash64[8]; + + hash64[5] = h5 + 8; + + hash64[0] = h0 + h1; + hash64[1] = ROL2(h1, 46) ^ hash64[0]; + hash64[2] = h2 + h3; + hash64[3] = ROL2(h3, 36) ^ hash64[2]; + hash64[4] = h4 + hash64[5]; + hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; + hash64[6] = (h6 + h7 + make_uint2(0,0xff000000)); + hash64[7] = ROL2(h7, 37) ^ hash64[6]; + hash64[2]+= hash64[1]; + hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; + hash64[4]+= hash64[7]; + hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; + hash64[6]+= hash64[5]; + hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; + hash64[0]+= hash64[3]; + hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; + hash64[4]+= hash64[1]; + hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; + hash64[6]+= hash64[3]; + hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; + hash64[0]+= hash64[5]; + hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; + hash64[2]+= hash64[7]; + hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; + hash64[6]+= hash64[1]; + hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; + hash64[0]+= hash64[7]; + hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; + hash64[2]+= hash64[5]; + hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; + hash64[4]+= hash64[3]; + hash64[3] = ROR8(hash64[3]) ^ hash64[4]; + + hash64[0]+= h1; hash64[1]+= h2; hash64[2]+= h3; hash64[3]+= h4; + hash64[4]+= h5; + hash64[5]+= h6 + make_uint2(0,0xff000000); + hash64[6]+= h7 + vectorize(0xff00000000000008); + hash64[7]+= skein_h8 + 1; + macro3(); + hash64[0]+= h2; hash64[1]+= h3; hash64[2]+= h4; hash64[3]+= h5; + hash64[4]+= h6; + hash64[5]+= h7 + vectorize(0xff00000000000008); + hash64[6]+= skein_h8 + 8; + hash64[7]+= h0 + 2; + macro4(); + hash64[0] = (hash64[0] + h3); hash64[1] = (hash64[1] + h4); + hash64[2] = (hash64[2] + h5); hash64[3] = (hash64[3] + h6); + hash64[4] = (hash64[4] + h7); hash64[5] = (hash64[5] + skein_h8 + 8); + hash64[6] = (hash64[6] + h0 + make_uint2(0,0xff000000)); + hash64[7] = (hash64[7] + h1 + 3); + macro3(); + hash64[0] = (hash64[0] + h4); hash64[1] = (hash64[1] + h5); + hash64[2] = (hash64[2] + h6); hash64[3] = (hash64[3] + h7); + hash64[4] = (hash64[4] + skein_h8); hash64[5] = (hash64[5] + h0 + make_uint2(0,0xff000000)); + hash64[6] = (hash64[6] + h1 + vectorize(0xff00000000000008)); + hash64[7] = (hash64[7] + h2 + 4); + macro4(); + hash64[0] = (hash64[0] + h5); hash64[1] = (hash64[1] + h6); + hash64[2] = (hash64[2] + h7); hash64[3] = (hash64[3] + skein_h8); + hash64[4] = (hash64[4] + h0); hash64[5] = (hash64[5] + h1 + vectorize(0xff00000000000008)); + hash64[6] = (hash64[6] + h2 + 8); hash64[7] = (hash64[7] + h3 + 5); + macro3(); + hash64[0] = (hash64[0] + h6); hash64[1] = (hash64[1] + h7); + hash64[2] = (hash64[2] + skein_h8); hash64[3] = (hash64[3] + h0); + hash64[4] = (hash64[4] + h1); hash64[5] = (hash64[5] + h2 + 8); + hash64[6] = (hash64[6] + h3 + make_uint2(0,0xff000000)); + hash64[7] = (hash64[7] + h4 + 6); + macro4(); + hash64[0] = (hash64[0] + h7); hash64[1] = (hash64[1] + skein_h8); + hash64[2] = (hash64[2] + h0); hash64[3] = (hash64[3] + h1); + hash64[4] = (hash64[4] + h2); hash64[5] = (hash64[5] + h3 + make_uint2(0,0xff000000)); + hash64[6] = (hash64[6] + h4 + vectorize(0xff00000000000008)); + hash64[7] = (hash64[7] + h5 + 7); + macro3(); + hash64[0] = (hash64[0] + skein_h8); hash64[1] = (hash64[1] + h0); + hash64[2] = (hash64[2] + h1); hash64[3] = (hash64[3] + h2); + hash64[4] = (hash64[4] + h3); hash64[5] = (hash64[5] + h4 + vectorize(0xff00000000000008)); + hash64[6] = (hash64[6] + h5 + 8); hash64[7] = (hash64[7] + h6 + 8); + macro4(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(h0)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(h1)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(h2)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(h3)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(h4)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(h5) + 8); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(h6) + 0xff00000000000000); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(h7) + 9); + macro3(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(h1)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(h2)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(h3)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(h4)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(h5)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(h6) + 0xff00000000000000); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(h7) + 0xff00000000000008); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(skein_h8) + 10); + macro4(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(h2)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(h3)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(h4)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(h5)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(h6)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(h7) + 0xff00000000000008); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(skein_h8) + 8); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(h0) + 11); + macro3(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(h3)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(h4)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(h5)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(h6)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(h7)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(skein_h8) + 8); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(h0) + 0xff00000000000000); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(h1) + 12); + macro4(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(h4)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(h5)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(h6)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(h7)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(skein_h8)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(h0) + 0xff00000000000000); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(h1) + 0xff00000000000008); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(h2) + 13); + macro3(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(h5)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(h6)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(h7)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(skein_h8)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(h0)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(h1) + 0xff00000000000008); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(h2) + 8); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(h3) + 14); + macro4(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(h6)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(h7)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(skein_h8)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(h0)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(h1)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(h2) + 8); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(h3) + 0xff00000000000000); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(h4) + 15); + macro3(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(h7)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(skein_h8)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(h0)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(h1)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(h2)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(h3) + 0xff00000000000000); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(h4) + 0xff00000000000008); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(h5) + 16); + macro4(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(skein_h8)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(h0)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(h1)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(h2)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(h3)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(h4) + 0xff00000000000008); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(h5) + 8); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(h6) + 17); + macro3(); + hash64[0] = vectorize(devectorize(hash64[0]) + devectorize(h0)); + hash64[1] = vectorize(devectorize(hash64[1]) + devectorize(h1)); + hash64[2] = vectorize(devectorize(hash64[2]) + devectorize(h2)); + hash64[3] = vectorize(devectorize(hash64[3]) + devectorize(h3)); + hash64[4] = vectorize(devectorize(hash64[4]) + devectorize(h4)); + hash64[5] = vectorize(devectorize(hash64[5]) + devectorize(h5) + 8); + hash64[6] = vectorize(devectorize(hash64[6]) + devectorize(h6) + 0xff00000000000000); + hash64[7] = vectorize(devectorize(hash64[7]) + devectorize(h7) + 18); + + phash = (uint2x4*)hash64; + uint2x4 *outpt = (uint2x4*)Hash; + outpt[0] = phash[0]; + outpt[1] = phash[1]; + + #undef h0 + #undef h1 + #undef h2 + #undef h3 + #undef h4 + #undef h5 + #undef h6 + #undef h7 } -#endif /* SM < 5.0 */ } -__global__ -void quark_skein512_gpu_hash_64_sm3(uint32_t threads, uint32_t startNounce, uint64_t * const __restrict__ g_hash, uint32_t *g_nonceVector) +__host__ +//void quark_skein512_cpu_hash_64(int thr_id,uint32_t threads, uint32_t *d_nonceVector, uint32_t *d_hash) +void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - // Skein - uint64_t p[8]; - uint64_t h0, h1, h2, h3, h4, h5, h6, h7, h8; - uint64_t t0, t1, t2; - - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - - uint32_t hashPosition = nounce - startNounce; - uint64_t *inpHash = &g_hash[hashPosition * 8U]; - - // Init - h0 = 0x4903ADFF749C51CEull; - h1 = 0x0D95DE399746DF03ull; - h2 = 0x8FD1934127C79BCEull; - h3 = 0x9A255629FF352CB1ull; - h4 = 0x5DB62599DF6CA7B0ull; - h5 = 0xEABE394CA9D5C3F4ull; - h6 = 0x991112C71A75B523ull; - h7 = 0xAE18A40B660FCC33ull; - - // 1st Round -> etype = 480, ptr = 64, bcount = 0, data = msg - #pragma unroll 8 - for(int i=0; i<8; i++) - p[i] = inpHash[i]; - - t0 = 0x40; // 64. - t1 = 0xf000000000000000ULL; // 480ull << 55 (etype) - - //TFBIG_KINIT(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); - h8 = 0xcab2076d98173ec4ULL; - t2 = 0xf000000000000040ULL; - - TFBIG_4e(0); - TFBIG_4o(1); - TFBIG_4e(2); - TFBIG_4o(3); - TFBIG_4e(4); - TFBIG_4o(5); - TFBIG_4e(6); - TFBIG_4o(7); - TFBIG_4e(8); - TFBIG_4o(9); - TFBIG_4e(10); - TFBIG_4o(11); - TFBIG_4e(12); - TFBIG_4o(13); - TFBIG_4e(14); - TFBIG_4o(15); - TFBIG_4e(16); - TFBIG_4o(17); - TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); - - h0 = inpHash[0] ^ p[0]; - h1 = inpHash[1] ^ p[1]; - h2 = inpHash[2] ^ p[2]; - h3 = inpHash[3] ^ p[3]; - h4 = inpHash[4] ^ p[4]; - h5 = inpHash[5] ^ p[5]; - h6 = inpHash[6] ^ p[6]; - h7 = inpHash[7] ^ p[7]; - - // 2nd Round -> etype = 510, ptr = 8, bcount = 0, data = 0 - #pragma unroll 8 - for(int i=0; i<8; i++) - p[i] = 0ull; - - t0 = 8; // ptr - t1 = 510ull << 55; // etype - TFBIG_KINIT(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); - - TFBIG_4e(0); - TFBIG_4o(1); - TFBIG_4e(2); - TFBIG_4o(3); - TFBIG_4e(4); - TFBIG_4o(5); - TFBIG_4e(6); - TFBIG_4o(7); - TFBIG_4e(8); - TFBIG_4o(9); - TFBIG_4e(10); - TFBIG_4o(11); - TFBIG_4e(12); - TFBIG_4o(13); - TFBIG_4e(14); - TFBIG_4o(15); - TFBIG_4e(16); - TFBIG_4o(17); - TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); - - // output - uint64_t *outpHash = &g_hash[hashPosition * 8U]; + uint32_t tpb = TPB52; + int dev_id = device_map[thr_id]; + + if (device_sm[dev_id] <= 500) tpb = TPB50; + const dim3 grid((threads + tpb-1)/tpb); + const dim3 block(tpb); + quark_skein512_gpu_hash_64 << > >(threads, (uint64_t*)d_hash, d_nonceVector); - #pragma unroll 8 - for(int i=0; i<8; i++) - outpHash[i] = p[i]; - } } -__global__ __launch_bounds__(128,5) -void skein512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *output64, int swap) +// 120 * 8 = 960 ... too big ? +static __constant__ uint2 c_buffer[120]; // padded message (80 bytes + 72*8 bytes midstate + align) + +__global__ +#if __CUDA_ARCH__ > 500 +__launch_bounds__(TPB52, 3) +#else +__launch_bounds__(TPB50, 5) +#endif +void skein512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *output64) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) @@ -628,191 +786,130 @@ void skein512_gpu_hash_80(uint32_t threads, uint32_t startNounce, uint64_t *outp uint2 h0, h1, h2, h3, h4, h5, h6, h7, h8; uint2 t0, t1, t2; - h0 = vectorize(c_PaddedMessage80[10]); - h1 = vectorize(c_PaddedMessage80[11]); - h2 = vectorize(c_PaddedMessage80[12]); - h3 = vectorize(c_PaddedMessage80[13]); - h4 = vectorize(c_PaddedMessage80[14]); - h5 = vectorize(c_PaddedMessage80[15]); - h6 = vectorize(c_PaddedMessage80[16]); - h7 = vectorize(c_PaddedMessage80[17]); - - uint32_t nonce = swap ? cuda_swab32(startNounce + thread) : startNounce + thread; - uint2 nonce2 = make_uint2(_LODWORD(c_PaddedMessage80[9]), nonce); + uint32_t nonce = cuda_swab32(startNounce + thread); + uint2 nonce2 = make_uint2(c_buffer[0].x, nonce); uint2 p[8]; - p[0] = vectorize(c_PaddedMessage80[8]); p[1] = nonce2; - #pragma unroll - for (int i = 2; i < 8; i++) - p[i] = vectorize(0ull); - - t0 = make_uint2(0x50, 0); + h0 = c_buffer[ 1]; + h1 = c_buffer[ 2]; + h2 = c_buffer[ 3]; + h3 = c_buffer[ 4]; + h4 = c_buffer[ 5]; + h5 = c_buffer[ 6]; + h6 = c_buffer[ 7]; + h7 = c_buffer[ 8]; + h8 = c_buffer[ 9]; + + t0 = vectorize(0x50ull); t1 = vectorize(0xB000000000000000ull); + t2 = t0^t1; + + p[1]=nonce2 + h1; p[0]= c_buffer[10] + p[1]; + p[2]=c_buffer[11]; + p[3]=c_buffer[12]; + p[4]=c_buffer[13]; + p[5]=c_buffer[14]; + p[6]=c_buffer[15]; + p[7]=c_buffer[16]; + +// macro1(); + p[1] = ROL2(p[1], 46) ^ p[0]; + p[2] += p[1]; + p[0] += p[3]; + p[1] = ROL2(p[1], 33) ^ p[2]; + p[3] = c_buffer[17] ^ p[0]; + p[4] += p[1]; + p[6] += p[3]; + p[0] += p[5]; + p[2] += p[7]; + p[1] = ROL2(p[1], 17) ^ p[4]; + p[3] = ROL2(p[3], 49) ^ p[6]; + p[5] = c_buffer[18] ^ p[0]; + p[7] = c_buffer[19] ^ p[2]; + p[6] += p[1]; + p[0] += p[7]; + p[2] += p[5]; + p[4] += p[3]; + p[1] = ROL2(p[1], 44) ^ p[6]; + p[7] = ROL2(p[7], 9) ^ p[0]; + p[5] = ROL2(p[5], 54) ^ p[2]; + p[3] = ROR8(p[3]) ^ p[4]; + + p[0]+=h1; p[1]+=h2; p[2]+=h3; p[3]+=h4; p[4]+=h5; + p[5]+=c_buffer[20]; p[7]+=c_buffer[21]; p[6]+=c_buffer[22]; + macro2(); + p[0]+=h2; p[1]+=h3; p[2]+=h4; p[3]+=h5; p[4]+=h6; + p[5]+=c_buffer[22]; p[7]+=c_buffer[23]; p[6]+=c_buffer[24]; + macro1(); + p[0]+=h3; p[1]+=h4; p[2]+=h5; p[3]+=h6; p[4]+=h7; + p[5]+=c_buffer[24]; p[7]+=c_buffer[25]; p[6]+=c_buffer[26]; + macro2(); + p[0]+=h4; p[1]+=h5; p[2]+=h6; p[3]+=h7; p[4]+=h8; + p[5]+=c_buffer[26]; p[7]+=c_buffer[27]; p[6]+=c_buffer[28]; + macro1(); + p[0]+=h5; p[1]+=h6; p[2]+=h7; p[3]+=h8; p[4]+=h0; + p[5]+=c_buffer[28]; p[7]+=c_buffer[29]; p[6]+=c_buffer[30]; + macro2(); + p[0]+=h6; p[1]+=h7; p[2]+=h8; p[3]+=h0; p[4]+=h1; + p[5]+=c_buffer[30]; p[7]+=c_buffer[31]; p[6]+=c_buffer[32]; + macro1(); + p[0]+=h7; p[1]+=h8; p[2]+=h0; p[3]+=h1; p[4]+=h2; + p[5]+=c_buffer[32]; p[7]+=c_buffer[33]; p[6]+=c_buffer[34]; + macro2(); + p[0]+=h8; p[1]+=h0; p[2]+=h1; p[3]+=h2; p[4]+=h3; + p[5]+=c_buffer[34]; p[7]+=c_buffer[35]; p[6]+=c_buffer[36]; + macro1(); + p[0]+=h0; p[1]+=h1; p[2]+=h2; p[3]+=h3; p[4]+=h4; + p[5]+=c_buffer[36]; p[7]+=c_buffer[37]; p[6]+=c_buffer[38]; + macro2(); + p[0]+=h1; p[1]+=h2; p[2]+=h3; p[3]+=h4; p[4]+=h5; + p[5]+=c_buffer[38]; p[7]+=c_buffer[39]; p[6]+=c_buffer[40]; + macro1(); + p[0]+=h2; p[1]+=h3; p[2]+=h4; p[3]+=h5; p[4]+=h6; + p[5]+=c_buffer[40]; p[7]+=c_buffer[41]; p[6]+=c_buffer[42]; + macro2(); + p[0]+=h3; p[1]+=h4; p[2]+=h5; p[3]+=h6; p[4]+=h7; + p[5]+=c_buffer[42]; p[7]+=c_buffer[43]; p[6]+=c_buffer[44]; + macro1(); + p[0]+=h4; p[1]+=h5; p[2]+=h6; p[3]+=h7; p[4]+=h8; + p[5]+=c_buffer[44]; p[7]+=c_buffer[45]; p[6]+=c_buffer[46]; + macro2(); + p[0]+=h5; p[1]+=h6; p[2]+=h7; p[3]+=h8; p[4]+=h0; + p[5]+=c_buffer[46]; p[7]+=c_buffer[47]; p[6]+=c_buffer[48]; + macro1(); + p[0]+=h6; p[1]+=h7; p[2]+=h8; p[3]+=h0; p[4]+=h1; + p[5]+=c_buffer[48]; p[7]+=c_buffer[49]; p[6]+=c_buffer[50]; + macro2(); + p[0]+=h7; p[1]+=h8; p[2]+=h0; p[3]+=h1; p[4]+=h2; + p[5]+=c_buffer[50]; p[7]+=c_buffer[51]; p[6]+=c_buffer[52]; + macro1(); + p[0]+=h8; p[1]+=h0; p[2]+=h1; p[3]+=h2; p[4]+=h3; + p[5]+=c_buffer[52]; p[7]+=c_buffer[53]; p[6]+=c_buffer[54]; + macro2(); + p[0]+=h0; p[1]+=h1; p[2]+=h2; p[3]+=h3; p[4]+=h4; + p[5]+=c_buffer[54]; p[7]+=c_buffer[55]; p[6]+=c_buffer[56]; + + p[0]^= c_buffer[57]; + p[1]^= nonce2; + + t0 = vectorize(8); // extra + t1 = vectorize(0xFF00000000000000ull); // etype +// t2 = vectorize(0xB000000000000050ull); + + h0 = p[0]; + h1 = p[1]; + h2 = p[2]; + h3 = p[3]; + h4 = p[4]; + h5 = p[5]; + h6 = p[6]; + h7 = p[7]; - //TFBIG_KINIT_UI2(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); - h8 = vectorize(c_PaddedMessage80[18]); - t2 = vectorize(0xB000000000000050ull); // t0 ^ t1 - - TFBIG_4e_UI2(0); - TFBIG_4o_UI2(1); - TFBIG_4e_UI2(2); - TFBIG_4o_UI2(3); - TFBIG_4e_UI2(4); - TFBIG_4o_UI2(5); - TFBIG_4e_UI2(6); - TFBIG_4o_UI2(7); - TFBIG_4e_UI2(8); - TFBIG_4o_UI2(9); - TFBIG_4e_UI2(10); - TFBIG_4o_UI2(11); - TFBIG_4e_UI2(12); - TFBIG_4o_UI2(13); - TFBIG_4e_UI2(14); - TFBIG_4o_UI2(15); - TFBIG_4e_UI2(16); - TFBIG_4o_UI2(17); - TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); - - uint64_t *outpHash = &output64[thread * 8U]; - outpHash[0] = c_PaddedMessage80[8] ^ devectorize(p[0]); - outpHash[1] = devectorize(nonce2 ^ p[1]); - #pragma unroll - for(int i=2; i<8; i++) - outpHash[i] = devectorize(p[i]); - } -} - -__global__ -void skein512_gpu_hash_80_sm3(uint32_t threads, uint32_t startNounce, uint64_t *output64, int swap) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - // Init - uint64_t h0 = 0x4903ADFF749C51CEull; - uint64_t h1 = 0x0D95DE399746DF03ull; - uint64_t h2 = 0x8FD1934127C79BCEull; - uint64_t h3 = 0x9A255629FF352CB1ull; - uint64_t h4 = 0x5DB62599DF6CA7B0ull; - uint64_t h5 = 0xEABE394CA9D5C3F4ull; - uint64_t h6 = 0x991112C71A75B523ull; - uint64_t h7 = 0xAE18A40B660FCC33ull; - - uint64_t t0 = 0x40; // ptr = 64. - uint64_t t1 = 0x7000000000000000ull; // 0xE0ull << 55 // etype - - //TFBIG_KINIT(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); - uint64_t t2 = 0x7000000000000040ull; - uint64_t h8 = 0xcab2076d98173ec4ull; - - uint64_t p[8]; - #pragma unroll 8 - for (int i = 0; i<8; i++) - p[i] = c_PaddedMessage80[i]; - - TFBIG_4e(0); - TFBIG_4o(1); - TFBIG_4e(2); - TFBIG_4o(3); - TFBIG_4e(4); - TFBIG_4o(5); - TFBIG_4e(6); - TFBIG_4o(7); - TFBIG_4e(8); - TFBIG_4o(9); - TFBIG_4e(10); - TFBIG_4o(11); - TFBIG_4e(12); - TFBIG_4o(13); - TFBIG_4e(14); - TFBIG_4o(15); - TFBIG_4e(16); - TFBIG_4o(17); - TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); - - h0 = c_PaddedMessage80[0] ^ p[0]; - h1 = c_PaddedMessage80[1] ^ p[1]; - h2 = c_PaddedMessage80[2] ^ p[2]; - h3 = c_PaddedMessage80[3] ^ p[3]; - h4 = c_PaddedMessage80[4] ^ p[4]; - h5 = c_PaddedMessage80[5] ^ p[5]; - h6 = c_PaddedMessage80[6] ^ p[6]; - h7 = c_PaddedMessage80[7] ^ p[7]; - - uint32_t nonce = swap ? cuda_swab32(startNounce + thread) : startNounce + thread; - uint64_t nonce64 = MAKE_ULONGLONG(_LODWORD(c_PaddedMessage80[9]), nonce); - - // skein_big_close -> etype = 0x160, ptr = 16, bcount = 1, extra = 16 - p[0] = c_PaddedMessage80[8]; - p[1] = nonce64; - - #pragma unroll - for (int i = 2; i < 8; i++) - p[i] = 0; - - t0 = 0x50; // (bcount << 6) + extra; - t1 = 0xB000000000000000ull; // (bcount >> 58) + ((sph_u64)(etype) << 55); - - //TFBIG_KINIT(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); - t2 = 0xB000000000000050ull; - h8 = c_PaddedMessage80[18]; - - TFBIG_4e(0); - TFBIG_4o(1); - TFBIG_4e(2); - TFBIG_4o(3); - TFBIG_4e(4); - TFBIG_4o(5); - TFBIG_4e(6); - TFBIG_4o(7); - TFBIG_4e(8); - TFBIG_4o(9); - TFBIG_4e(10); - TFBIG_4o(11); - TFBIG_4e(12); - TFBIG_4o(13); - TFBIG_4e(14); - TFBIG_4o(15); - TFBIG_4e(16); - TFBIG_4o(17); - TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); - - // skein_big_close 2nd loop -> etype = 0x1fe, ptr = 8, bcount = 0 - // output - uint64_t *outpHash = &output64[thread * 8U]; - outpHash[0] = c_PaddedMessage80[8] ^ p[0]; - outpHash[1] = nonce64 ^ p[1]; - #pragma unroll - for(int i=2; i<8; i++) - outpHash[i] = p[i]; - } -} - -__global__ __launch_bounds__(128,6) -void skein512_gpu_hash_close(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint2 t0 = make_uint2(0x8, 0); // extra - uint2 t1 = vectorize(0xFF00000000000000ull); // etype - - uint64_t *state = &g_hash[thread * 8U]; - uint2 h0 = vectorize(state[0]); - uint2 h1 = vectorize(state[1]); - uint2 h2 = vectorize(state[2]); - uint2 h3 = vectorize(state[3]); - uint2 h4 = vectorize(state[4]); - uint2 h5 = vectorize(state[5]); - uint2 h6 = vectorize(state[6]); - uint2 h7 = vectorize(state[7]); - - uint2 h8, t2; TFBIG_KINIT_UI2(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); - uint2 p[8] = { 0 }; + p[0] = p[1] = p[2] = p[3] = p[4] =p[5] =p[6] = p[7] = vectorize(0); TFBIG_4e_UI2(0); TFBIG_4o_UI2(1); @@ -834,100 +931,33 @@ void skein512_gpu_hash_close(uint32_t threads, uint32_t startNounce, uint64_t *g TFBIG_4o_UI2(17); TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); - uint64_t *outpHash = state; + uint64_t *outpHash = &output64[thread<<3]; #pragma unroll 8 for (int i = 0; i < 8; i++) outpHash[i] = devectorize(p[i]); } } -__global__ __launch_bounds__(128,6) -void skein512_gpu_hash_close_sm3(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint64_t t0 = 8ull; // extra - uint64_t t1 = 0xFF00000000000000ull; // etype - - uint64_t *state = &g_hash[thread * 8U]; - - uint64_t h0 = state[0]; - uint64_t h1 = state[1]; - uint64_t h2 = state[2]; - uint64_t h3 = state[3]; - uint64_t h4 = state[4]; - uint64_t h5 = state[5]; - uint64_t h6 = state[6]; - uint64_t h7 = state[7]; - uint64_t h8, t2; - TFBIG_KINIT(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); - - uint64_t p[8] = { 0 }; - - TFBIG_4e(0); - TFBIG_4o(1); - TFBIG_4e(2); - TFBIG_4o(3); - TFBIG_4e(4); - TFBIG_4o(5); - TFBIG_4e(6); - TFBIG_4o(7); - TFBIG_4e(8); - TFBIG_4o(9); - TFBIG_4e(10); - TFBIG_4o(11); - TFBIG_4e(12); - TFBIG_4o(13); - TFBIG_4e(14); - TFBIG_4o(15); - TFBIG_4e(16); - TFBIG_4o(17); - TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); - - uint64_t *outpHash = state; - #pragma unroll 8 - for (int i = 0; i < 8; i++) - outpHash[i] = p[i]; - } -} - __host__ -void quark_skein512_cpu_init(int thr_id, uint32_t threads) +void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap) { - // store the binary SM version - cuda_get_arch(thr_id); -} - -__host__ -void quark_skein512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) -{ - const uint32_t threadsperblock = 256; - - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - + uint32_t tpb = TPB52; int dev_id = device_map[thr_id]; + if (device_sm[dev_id] <= 500) tpb = TPB50; - // uint2 uint64 variants for SM 3.2+ -#ifdef SP_KERNEL - if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500) - quark_skein512_cpu_hash_64(threads, startNounce, d_nonceVector, d_hash); /* sp.cuh */ - else -#endif - if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300) - quark_skein512_gpu_hash_64 <<>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector); - else - quark_skein512_gpu_hash_64_sm3 <<>> (threads, startNounce, (uint64_t*)d_hash, d_nonceVector); + const dim3 grid((threads + tpb-1)/tpb); + const dim3 block(tpb); - MyStreamSynchronize(NULL, order, thr_id); + // hash function is cut in 2 parts to reduce kernel size + skein512_gpu_hash_80 <<< grid, block >>> (threads, startNounce, (uint64_t*)d_hash); } -/* skein / skein2 */ - __host__ -static void skein512_precalc_80(uint64_t* message) +void skein512_cpu_setBlock_80(void *pdata) { + uint64_t message[20]; + memcpy(&message[0], pdata, 80); + uint64_t p[8]; uint64_t h0, h1, h2, h3, h4, h5, h6, h7, h8; uint64_t t0, t1, t2; @@ -978,41 +1008,96 @@ static void skein512_precalc_80(uint64_t* message) message[16] = message[6] ^ p[6]; message[17] = message[7] ^ p[7]; - // h8 - message[18] = 0x1BD11BDAA9FC1A22ULL; - for (int i=10; i<18; i++) - message[18] ^= message[i]; -} - -__host__ -void skein512_cpu_setBlock_80(void *pdata) -{ - uint64_t message[20]; - memcpy(&message[0], pdata, 80); - skein512_precalc_80(message); - cudaMemcpyToSymbol(c_PaddedMessage80, message, sizeof(message), 0, cudaMemcpyHostToDevice); - + message[18] = t2; + + uint64_t buffer[128]; + +// buffer[ 0] = message[ 8]; + buffer[ 0] = message[ 9]; + h0 = buffer[ 1] = message[10]; + h1 = buffer[ 2] = message[11]; + h2 = buffer[ 3] = message[12]; + h3 = buffer[ 4] = message[13]; + h4 = buffer[ 5] = message[14]; + h5 = buffer[ 6] = message[15]; + h6 = buffer[ 7] = message[16]; + h7 = buffer[ 8] = message[17]; + h8 = buffer[ 9] = h0^h1^h2^h3^h4^h5^h6^h7^0x1BD11BDAA9FC1A22ULL; + + t0 = 0x50ull; + t1 = 0xB000000000000000ull; + t2 = t0^t1; + + p[0] = message[ 8] + h0; + p[2] = h2; p[3] = h3; p[4] = h4; + p[5] = h5 + t0; p[6] = h6 + t1; p[7] = h7; + p[2] += p[3]; p[4] += p[5]; p[6] += p[7]; + p[3] = ROTL64(p[3], 36) ^ p[2]; + p[5] = ROTL64(p[5], 19) ^ p[4]; + p[7] = ROTL64(p[7], 37) ^ p[6]; + p[4] += p[7]; + p[6] += p[5]; + p[7] = ROTL64(p[7], 27) ^ p[4]; + p[5] = ROTL64(p[5], 14) ^ p[6]; + + buffer[10] = p[0]; + buffer[11] = p[2]; + buffer[12] = p[3]; + buffer[13] = p[4]; + buffer[14] = p[5]; + buffer[15] = p[6]; + buffer[16] = p[7]; + buffer[17] = ROTL64(p[3], 42); + buffer[18] = ROTL64(p[5], 36); + buffer[19] = ROTL64(p[7], 39); + + buffer[20] = h6+t1; + buffer[21] = h8+1; + buffer[22] = h7+t2; + buffer[23] = h0+2; + buffer[24] = h8+t0; + buffer[25] = h1+3; + buffer[26] = h0+t1; + buffer[27] = h2+4; + buffer[28] = h1+t2; + buffer[29] = h3+5; + buffer[30] = h2+t0; + buffer[31] = h4+6; + buffer[32] = h3+t1; + buffer[33] = h5+7; + buffer[34] = h4+t2; + buffer[35] = h6+8; + buffer[36] = h5+t0; + buffer[37] = h7+9; + buffer[38] = h6+t1; + buffer[39] = h8+10; + buffer[40] = h7+t2; + buffer[41] = h0+11; + buffer[42] = h8+t0; + buffer[43] = h1+12; + buffer[44] = h0+t1; + buffer[45] = h2+13; + buffer[46] = h1+t2; + buffer[47] = h3+14; + buffer[48] = h2+t0; + buffer[49] = h4+15; + buffer[50] = h3+t1; + buffer[51] = h5+16; + buffer[52] = h4+t2; + buffer[53] = h6+17; + buffer[54] = h5+t0; + buffer[55] = h7+18; + buffer[56] = h6+t1; + + buffer[57] = message[8]; + + cudaMemcpyToSymbol(c_buffer, buffer, sizeof(c_buffer), 0, cudaMemcpyHostToDevice); CUDA_SAFE_CALL(cudaGetLastError()); } __host__ -void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *g_hash, int swap) +void quark_skein512_cpu_init(int thr_id, uint32_t threads) { - const uint32_t threadsperblock = 128; - - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - int dev_id = device_map[thr_id]; - uint64_t *d_hash = (uint64_t*) g_hash; - - if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300) { - // hash function is cut in 2 parts to reduce kernel size - skein512_gpu_hash_80 <<< grid, block >>> (threads, startNounce, d_hash, swap); - skein512_gpu_hash_close <<< grid, block >>> (threads, startNounce, d_hash); - } else { - // variant without uint2 variables - skein512_gpu_hash_80_sm3 <<< grid, block >>> (threads, startNounce, d_hash, swap); - skein512_gpu_hash_close_sm3 <<< grid, block >>> (threads, startNounce, d_hash); - } + cuda_get_arch(thr_id); } + diff --git a/quark/cuda_skein512_sp.cuh b/quark/cuda_skein512_sp.cuh deleted file mode 100644 index 9e606ee..0000000 --- a/quark/cuda_skein512_sp.cuh +++ /dev/null @@ -1,2634 +0,0 @@ -/* sp unrolled implentation of skein, used only for SM 5+ and 64 bytes input */ - -//#define WANT_SKEIN_80 - -#include -#include -#include - -#include "cuda_vector_uint2x4.h" - -/* ******* SP to TP ******* */ -#define _LOWORD(x) _LODWORD(x) -#define _HIWORD(x) _HIDWORD(x) -// simplified, inline func not faster -#define vectorizelow(/* uint32_t*/ v) make_uint2(v,0) -#define vectorizehigh(/*uint32_t*/ v) make_uint2(0,v) - -__device__ __inline__ uint2 ROL24(const uint2 a) -{ - uint2 result; - result.x = __byte_perm(a.x, a.y, 0x0765); - result.y = __byte_perm(a.y, a.x, 0x0765); - return result; -} -__device__ __inline__ uint2 ROR8(const uint2 a) { - uint2 result; - result.x = __byte_perm(a.y, a.x, 0x0765); - result.y = __byte_perm(a.x, a.y, 0x0765); - return result; -} -/* ************************ */ - -#ifdef WANT_SKEIN_80 -__constant__ uint2 precalcvalues[9]; -__constant__ uint32_t sha256_endingTable[64]; -static __constant__ uint64_t c_PaddedMessage16[2]; -static uint32_t *d_found[MAX_GPUS]; -static uint32_t *d_nonce[MAX_GPUS]; -#endif - -/* - * M9_ ## s ## _ ## i evaluates to s+i mod 9 (0 <= s <= 18, 0 <= i <= 7). - */ - -#define M9_0_0 0 -#define M9_0_1 1 -#define M9_0_2 2 -#define M9_0_3 3 -#define M9_0_4 4 -#define M9_0_5 5 -#define M9_0_6 6 -#define M9_0_7 7 - -#define M9_1_0 1 -#define M9_1_1 2 -#define M9_1_2 3 -#define M9_1_3 4 -#define M9_1_4 5 -#define M9_1_5 6 -#define M9_1_6 7 -#define M9_1_7 8 - -#define M9_2_0 2 -#define M9_2_1 3 -#define M9_2_2 4 -#define M9_2_3 5 -#define M9_2_4 6 -#define M9_2_5 7 -#define M9_2_6 8 -#define M9_2_7 0 - -#define M9_3_0 3 -#define M9_3_1 4 -#define M9_3_2 5 -#define M9_3_3 6 -#define M9_3_4 7 -#define M9_3_5 8 -#define M9_3_6 0 -#define M9_3_7 1 - -#define M9_4_0 4 -#define M9_4_1 5 -#define M9_4_2 6 -#define M9_4_3 7 -#define M9_4_4 8 -#define M9_4_5 0 -#define M9_4_6 1 -#define M9_4_7 2 - -#define M9_5_0 5 -#define M9_5_1 6 -#define M9_5_2 7 -#define M9_5_3 8 -#define M9_5_4 0 -#define M9_5_5 1 -#define M9_5_6 2 -#define M9_5_7 3 - -#define M9_6_0 6 -#define M9_6_1 7 -#define M9_6_2 8 -#define M9_6_3 0 -#define M9_6_4 1 -#define M9_6_5 2 -#define M9_6_6 3 -#define M9_6_7 4 - -#define M9_7_0 7 -#define M9_7_1 8 -#define M9_7_2 0 -#define M9_7_3 1 -#define M9_7_4 2 -#define M9_7_5 3 -#define M9_7_6 4 -#define M9_7_7 5 - -#define M9_8_0 8 -#define M9_8_1 0 -#define M9_8_2 1 -#define M9_8_3 2 -#define M9_8_4 3 -#define M9_8_5 4 -#define M9_8_6 5 -#define M9_8_7 6 - -#define M9_9_0 0 -#define M9_9_1 1 -#define M9_9_2 2 -#define M9_9_3 3 -#define M9_9_4 4 -#define M9_9_5 5 -#define M9_9_6 6 -#define M9_9_7 7 - -#define M9_10_0 1 -#define M9_10_1 2 -#define M9_10_2 3 -#define M9_10_3 4 -#define M9_10_4 5 -#define M9_10_5 6 -#define M9_10_6 7 -#define M9_10_7 8 - -#define M9_11_0 2 -#define M9_11_1 3 -#define M9_11_2 4 -#define M9_11_3 5 -#define M9_11_4 6 -#define M9_11_5 7 -#define M9_11_6 8 -#define M9_11_7 0 - -#define M9_12_0 3 -#define M9_12_1 4 -#define M9_12_2 5 -#define M9_12_3 6 -#define M9_12_4 7 -#define M9_12_5 8 -#define M9_12_6 0 -#define M9_12_7 1 - -#define M9_13_0 4 -#define M9_13_1 5 -#define M9_13_2 6 -#define M9_13_3 7 -#define M9_13_4 8 -#define M9_13_5 0 -#define M9_13_6 1 -#define M9_13_7 2 - -#define M9_14_0 5 -#define M9_14_1 6 -#define M9_14_2 7 -#define M9_14_3 8 -#define M9_14_4 0 -#define M9_14_5 1 -#define M9_14_6 2 -#define M9_14_7 3 - -#define M9_15_0 6 -#define M9_15_1 7 -#define M9_15_2 8 -#define M9_15_3 0 -#define M9_15_4 1 -#define M9_15_5 2 -#define M9_15_6 3 -#define M9_15_7 4 - -#define M9_16_0 7 -#define M9_16_1 8 -#define M9_16_2 0 -#define M9_16_3 1 -#define M9_16_4 2 -#define M9_16_5 3 -#define M9_16_6 4 -#define M9_16_7 5 - -#define M9_17_0 8 -#define M9_17_1 0 -#define M9_17_2 1 -#define M9_17_3 2 -#define M9_17_4 3 -#define M9_17_5 4 -#define M9_17_6 5 -#define M9_17_7 6 - -#define M9_18_0 0 -#define M9_18_1 1 -#define M9_18_2 2 -#define M9_18_3 3 -#define M9_18_4 4 -#define M9_18_5 5 -#define M9_18_6 6 -#define M9_18_7 7 - -/* - * M3_ ## s ## _ ## i evaluates to s+i mod 3 (0 <= s <= 18, 0 <= i <= 1). - */ - -#define M3_0_0 0 -#define M3_0_1 1 -#define M3_1_0 1 -#define M3_1_1 2 -#define M3_2_0 2 -#define M3_2_1 0 -#define M3_3_0 0 -#define M3_3_1 1 -#define M3_4_0 1 -#define M3_4_1 2 -#define M3_5_0 2 -#define M3_5_1 0 -#define M3_6_0 0 -#define M3_6_1 1 -#define M3_7_0 1 -#define M3_7_1 2 -#define M3_8_0 2 -#define M3_8_1 0 -#define M3_9_0 0 -#define M3_9_1 1 -#define M3_10_0 1 -#define M3_10_1 2 -#define M3_11_0 2 -#define M3_11_1 0 -#define M3_12_0 0 -#define M3_12_1 1 -#define M3_13_0 1 -#define M3_13_1 2 -#define M3_14_0 2 -#define M3_14_1 0 -#define M3_15_0 0 -#define M3_15_1 1 -#define M3_16_0 1 -#define M3_16_1 2 -#define M3_17_0 2 -#define M3_17_1 0 -#define M3_18_0 0 -#define M3_18_1 1 - -#define XCAT(x, y) XCAT_(x, y) -#define XCAT_(x, y) x ## y - -#define SKBI(k, s, i) XCAT(k, XCAT(XCAT(XCAT(M9_, s), _), i)) -#define SKBT(t, s, v) XCAT(t, XCAT(XCAT(XCAT(M3_, s), _), v)) - -//vectorize(0x1BD11BDAA9FC1A22ULL); -#define TFBIG_ADDKEY(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \ - w0 = (w0 + SKBI(k, s, 0)); \ - w1 = (w1 + SKBI(k, s, 1)); \ - w2 = (w2 + SKBI(k, s, 2)); \ - w3 = (w3 + SKBI(k, s, 3)); \ - w4 = (w4 + SKBI(k, s, 4)); \ - w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \ - w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \ - w7 = (w7 + SKBI(k, s, 7) + vectorizelow(s)); \ - } - -#define TFBIG_MIX(x0, x1, rc) { \ - x0 = x0 + x1; \ - x1 = ROL2(x1, rc) ^ x0; \ - } - -#define TFBIG_MIX8(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \ - TFBIG_MIX(w0, w1, rc0); \ - TFBIG_MIX(w2, w3, rc1); \ - TFBIG_MIX(w4, w5, rc2); \ - TFBIG_MIX(w6, w7, rc3); \ - } - -#define TFBIG_4e(s) { \ - TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ - TFBIG_MIX8(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \ - TFBIG_MIX8(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \ - TFBIG_MIX8(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \ - TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \ - } - -#define TFBIG_4o(s) { \ - TFBIG_ADDKEY(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ - TFBIG_MIX8(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \ - TFBIG_MIX8(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \ - TFBIG_MIX8(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \ - TFBIG_MIX8(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ - } - -/* uint2 variant for SM3.2+ */ - -#define TFBIG_KINIT_UI2(k0, k1, k2, k3, k4, k5, k6, k7, k8, t0, t1, t2) { \ - k8 = ((k0 ^ k1) ^ (k2 ^ k3)) ^ ((k4 ^ k5) ^ (k6 ^ k7)) \ - ^ vectorize(SPH_C64(0x1BD11BDAA9FC1A22)); \ - t2 = t0 ^ t1; \ - } - -#define TFBIG_ADDKEY_UI2(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \ - w0 = (w0 + SKBI(k, s, 0)); \ - w1 = (w1 + SKBI(k, s, 1)); \ - w2 = (w2 + SKBI(k, s, 2)); \ - w3 = (w3 + SKBI(k, s, 3)); \ - w4 = (w4 + SKBI(k, s, 4)); \ - w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \ - w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \ - w7 = (w7 + SKBI(k, s, 7) + vectorize(s)); \ - } - -#define TFBIG_ADDKEY_PRE(w0, w1, w2, w3, w4, w5, w6, w7, k, t, s) { \ - w0 = (w0 + SKBI(k, s, 0)); \ - w1 = (w1 + SKBI(k, s, 1)); \ - w2 = (w2 + SKBI(k, s, 2)); \ - w3 = (w3 + SKBI(k, s, 3)); \ - w4 = (w4 + SKBI(k, s, 4)); \ - w5 = (w5 + SKBI(k, s, 5) + SKBT(t, s, 0)); \ - w6 = (w6 + SKBI(k, s, 6) + SKBT(t, s, 1)); \ - w7 = (w7 + SKBI(k, s, 7) + (s)); \ - } - -#define TFBIG_MIX_UI2(x0, x1, rc) { \ - x0 = x0 + x1; \ - x1 = ROL2(x1, rc) ^ x0; \ - } - -#define TFBIG_MIX_PRE(x0, x1, rc) { \ - x0 = x0 + x1; \ - x1 = ROTL64(x1, rc) ^ x0; \ - } - -#define TFBIG_MIX8_UI2(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \ - TFBIG_MIX_UI2(w0, w1, rc0); \ - TFBIG_MIX_UI2(w2, w3, rc1); \ - TFBIG_MIX_UI2(w4, w5, rc2); \ - TFBIG_MIX_UI2(w6, w7, rc3); \ - } - -#define TFBIG_MIX8_PRE(w0, w1, w2, w3, w4, w5, w6, w7, rc0, rc1, rc2, rc3) { \ - TFBIG_MIX_PRE(w0, w1, rc0); \ - TFBIG_MIX_PRE(w2, w3, rc1); \ - TFBIG_MIX_PRE(w4, w5, rc2); \ - TFBIG_MIX_PRE(w6, w7, rc3); \ - } - -#define TFBIG_4e_UI2(s) { \ - TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ - TFBIG_MIX8_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \ - TFBIG_MIX8_UI2(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \ - TFBIG_MIX8_UI2(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \ - TFBIG_MIX8_UI2(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \ - } - -#define TFBIG_4e_PRE(s) { \ - TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ - TFBIG_MIX8_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 46, 36, 19, 37); \ - TFBIG_MIX8_PRE(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 33, 27, 14, 42); \ - TFBIG_MIX8_PRE(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 17, 49, 36, 39); \ - TFBIG_MIX8_PRE(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 44, 9, 54, 56); \ - } - -#define TFBIG_4o_UI2(s) { \ - TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ - TFBIG_MIX8_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \ - TFBIG_MIX8_UI2(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \ - TFBIG_MIX8_UI2(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \ - TFBIG_MIX8_UI2(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ - } - -#define TFBIG_4o_PRE(s) { \ - TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, s); \ - TFBIG_MIX8_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], 39, 30, 34, 24); \ - TFBIG_MIX8_PRE(p[2], p[1], p[4], p[7], p[6], p[5], p[0], p[3], 13, 50, 10, 17); \ - TFBIG_MIX8_PRE(p[4], p[1], p[6], p[3], p[0], p[5], p[2], p[7], 25, 29, 39, 43); \ - TFBIG_MIX8_PRE(p[6], p[1], p[0], p[7], p[2], p[5], p[4], p[3], 8, 35, 56, 22); \ - } - -__global__ -#if __CUDA_ARCH__ > 500 -__launch_bounds__(480, 3) -#else -__launch_bounds__(240, 6) -#endif -void quark_skein512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t * const __restrict__ g_hash, const uint32_t *const __restrict__ g_nonceVector) -{ - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - // Skein - uint2 skein_p[8], h[9]; - - const uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - - const int hashPosition = nounce - startNounce; - uint64_t *Hash = &g_hash[8 * hashPosition]; - - uint2 msg[8]; - - uint2x4 *phash = (uint2x4*)Hash; - uint2x4 *outpt = (uint2x4*)msg; - outpt[0] = phash[0]; - outpt[1] = phash[1]; - - h[0] = skein_p[0] = (msg[0]); - h[1] = skein_p[1] = (msg[1]); - h[2] = skein_p[2] = (msg[2]); - h[3] = skein_p[3] = (msg[3]); - h[4] = skein_p[4] = (msg[4]); - h[5] = skein_p[5] = (msg[5]); - h[6] = skein_p[6] = (msg[6]); - h[7] = skein_p[7] = (msg[7]); - - skein_p[0] += vectorize(0x4903ADFF749C51CEULL); - skein_p[1] += vectorize(0x0D95DE399746DF03ULL); - skein_p[2] += vectorize(0x8FD1934127C79BCEULL); - skein_p[3] += vectorize(0x9A255629FF352CB1ULL); - skein_p[4] += vectorize(0x5DB62599DF6CA7B0ULL); - skein_p[5] += vectorize(0xEABE394CA9D5C434ULL); - skein_p[6] += vectorize(0x891112C71A75B523ULL); - skein_p[7] += vectorize(0xAE18A40B660FCC33ULL); - skein_p[0] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; - skein_p[2] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; - skein_p[4] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; - skein_p[6] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; - skein_p[2] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; - skein_p[4] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; - skein_p[6] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; - skein_p[0] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; - skein_p[4] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; - skein_p[6] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; - skein_p[0] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; - skein_p[2] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; - skein_p[6] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; - skein_p[0] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; - skein_p[2] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; - skein_p[4] += skein_p[3]; - skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; - skein_p[0] += vectorize(0x0D95DE399746DF03ULL); - skein_p[1] += vectorize(0x8FD1934127C79BCEULL); - skein_p[2] += vectorize(0x9A255629FF352CB1ULL); - skein_p[3] += vectorize(0x5DB62599DF6CA7B0ULL); - skein_p[4] += vectorize(0xEABE394CA9D5C3F4ULL); - skein_p[5] += vectorize(0x891112C71A75B523ULL); - skein_p[6] += vectorize(0x9E18A40B660FCC73ULL); - skein_p[7] += vectorize(0xcab2076d98173ec4ULL+1); - skein_p[0] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; - skein_p[2] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; - skein_p[4] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; - skein_p[6] += skein_p[7]; - skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; - skein_p[2] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; - skein_p[4] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; - skein_p[6] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; - skein_p[0] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; - skein_p[4] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; - skein_p[6] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; - skein_p[0] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; - skein_p[2] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; - skein_p[6] += skein_p[1]; - skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; - skein_p[0] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; - skein_p[2] += skein_p[5]; - skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; - skein_p[4] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; - skein_p[0] += vectorize(0x8FD1934127C79BCEULL); - skein_p[1] += vectorize(0x9A255629FF352CB1ULL); - skein_p[2] += vectorize(0x5DB62599DF6CA7B0ULL); - skein_p[3] += vectorize(0xEABE394CA9D5C3F4ULL); - skein_p[4] += vectorize(0x991112C71A75B523ULL); - skein_p[5] += vectorize(0x9E18A40B660FCC73ULL); - skein_p[6] += vectorize(0xCAB2076D98173F04ULL); - skein_p[7] += vectorize(0x4903ADFF749C51D0ULL); - skein_p[0] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; - skein_p[2] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; - skein_p[4] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; - skein_p[6] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; - skein_p[2] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; - skein_p[4] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; - skein_p[6] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; - skein_p[0] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; - skein_p[4] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; - skein_p[6] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; - skein_p[0] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; - skein_p[2] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; - skein_p[6] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; - skein_p[0] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; - skein_p[2] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; - skein_p[4] += skein_p[3]; - skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; - skein_p[0] += vectorize(0x9A255629FF352CB1ULL); - skein_p[1] += vectorize(0x5DB62599DF6CA7B0ULL); - skein_p[2] += vectorize(0xEABE394CA9D5C3F4ULL); - skein_p[3] += vectorize(0x991112C71A75B523ULL); - skein_p[4] += vectorize(0xAE18A40B660FCC33ULL); - skein_p[5] += vectorize(0xcab2076d98173f04ULL); - skein_p[6] += vectorize(0x3903ADFF749C51CEULL); - skein_p[7] += vectorize(0x0D95DE399746DF03ULL+3); - skein_p[0] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; - skein_p[2] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; - skein_p[4] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; - skein_p[6] += skein_p[7]; - skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; - skein_p[2] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; - skein_p[4] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; - skein_p[6] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; - skein_p[0] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; - skein_p[4] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; - skein_p[6] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; - skein_p[0] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; - skein_p[2] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; - skein_p[6] += skein_p[1]; - skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; - skein_p[0] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; - skein_p[2] += skein_p[5]; - skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; - skein_p[4] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; - skein_p[0] += vectorize(0x5DB62599DF6CA7B0ULL); - skein_p[1] += vectorize(0xEABE394CA9D5C3F4ULL); - skein_p[2] += vectorize(0x991112C71A75B523ULL); - skein_p[3] += vectorize(0xAE18A40B660FCC33ULL); - skein_p[4] += vectorize(0xcab2076d98173ec4ULL); - skein_p[5] += vectorize(0x3903ADFF749C51CEULL); - skein_p[6] += vectorize(0xFD95DE399746DF43ULL); - skein_p[7] += vectorize(0x8FD1934127C79BD2ULL); - skein_p[0] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; - skein_p[2] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; - skein_p[4] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; - skein_p[6] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; - skein_p[2] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; - skein_p[4] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; - skein_p[6] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; - skein_p[0] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; - skein_p[4] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; - skein_p[6] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; - skein_p[0] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; - skein_p[2] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; - skein_p[6] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; - skein_p[0] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; - skein_p[2] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; - skein_p[4] += skein_p[3]; - skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; - skein_p[0] += vectorize(0xEABE394CA9D5C3F4ULL); - skein_p[1] += vectorize(0x991112C71A75B523ULL); - skein_p[2] += vectorize(0xAE18A40B660FCC33ULL); - skein_p[3] += vectorize(0xcab2076d98173ec4ULL); - skein_p[4] += vectorize(0x4903ADFF749C51CEULL); - skein_p[5] += vectorize(0x0D95DE399746DF03ULL + 0xf000000000000040ULL); - skein_p[6] += vectorize(0x8FD1934127C79BCEULL + 0x0000000000000040ULL); - skein_p[7] += vectorize(0x9A255629FF352CB1ULL + 5); - skein_p[0] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; - skein_p[2] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; - skein_p[4] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; - skein_p[6] += skein_p[7]; - skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; - skein_p[2] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; - skein_p[4] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; - skein_p[6] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; - skein_p[0] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; - skein_p[4] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; - skein_p[6] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; - skein_p[0] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; - skein_p[2] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; - skein_p[6] += skein_p[1]; - skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; - skein_p[0] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; - skein_p[2] += skein_p[5]; - skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; - skein_p[4] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; - skein_p[0] += vectorize(0x991112C71A75B523ULL); - skein_p[1] += vectorize(0xAE18A40B660FCC33ULL); - skein_p[2] += vectorize(0xcab2076d98173ec4ULL); - skein_p[3] += vectorize(0x4903ADFF749C51CEULL); - skein_p[4] += vectorize(0x0D95DE399746DF03ULL); - skein_p[5] += vectorize(0x8FD1934127C79BCEULL + 0x0000000000000040ULL); - skein_p[6] += vectorize(0x8A255629FF352CB1ULL); - skein_p[7] += vectorize(0x5DB62599DF6CA7B0ULL + 6); - skein_p[0] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; - skein_p[2] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; - skein_p[4] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; - skein_p[6] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; - skein_p[2] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; - skein_p[4] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; - skein_p[6] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; - skein_p[0] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; - skein_p[4] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; - skein_p[6] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; - skein_p[0] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; - skein_p[2] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; - skein_p[6] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; - skein_p[0] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; - skein_p[2] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; - skein_p[4] += skein_p[3]; - skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; - skein_p[0] += vectorize(0xAE18A40B660FCC33ULL); - skein_p[1] += vectorize(0xcab2076d98173ec4ULL); - skein_p[2] += vectorize(0x4903ADFF749C51CEULL); - skein_p[3] += vectorize(0x0D95DE399746DF03ULL); - skein_p[4] += vectorize(0x8FD1934127C79BCEULL); - skein_p[5] += vectorize(0x8A255629FF352CB1ULL); - skein_p[6] += vectorize(0x4DB62599DF6CA7F0ULL); - skein_p[7] += vectorize(0xEABE394CA9D5C3F4ULL + 7); - skein_p[0] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; - skein_p[2] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; - skein_p[4] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; - skein_p[6] += skein_p[7]; - skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; - skein_p[2] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; - skein_p[4] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; - skein_p[6] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; - skein_p[0] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; - skein_p[4] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; - skein_p[6] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; - skein_p[0] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; - skein_p[2] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; - skein_p[6] += skein_p[1]; - skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; - skein_p[0] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; - skein_p[2] += skein_p[5]; - skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; - skein_p[4] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; - skein_p[0] += vectorize(0xcab2076d98173ec4ULL); - skein_p[1] += vectorize(0x4903ADFF749C51CEULL); - skein_p[2] += vectorize(0x0D95DE399746DF03ULL); - skein_p[3] += vectorize(0x8FD1934127C79BCEULL); - skein_p[4] += vectorize(0x9A255629FF352CB1ULL); - skein_p[5] += vectorize(0x4DB62599DF6CA7F0ULL); - skein_p[6] += vectorize(0xEABE394CA9D5C434ULL); - skein_p[7] += vectorize(0x991112C71A75B52BULL); - skein_p[0] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; - skein_p[2] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; - skein_p[4] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; - skein_p[6] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; - skein_p[2] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; - skein_p[4] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; - skein_p[6] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; - skein_p[0] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; - skein_p[4] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; - skein_p[6] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; - skein_p[0] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; - skein_p[2] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; - skein_p[6] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; - skein_p[0] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; - skein_p[2] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; - skein_p[4] += skein_p[3]; - skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; - skein_p[0] += vectorize(0x4903ADFF749C51CEULL); - skein_p[1] += vectorize(0x0D95DE399746DF03ULL); - skein_p[2] += vectorize(0x8FD1934127C79BCEULL); - skein_p[3] += vectorize(0x9A255629FF352CB1ULL); - skein_p[4] += vectorize(0x5DB62599DF6CA7B0ULL); - skein_p[5] += vectorize(0xEABE394CA9D5C434ULL); - skein_p[6] += vectorize(0x891112C71A75B523ULL); - skein_p[7] += vectorize(0xAE18A40B660FCC33ULL + 9); - skein_p[0] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; - skein_p[2] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; - skein_p[4] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; - skein_p[6] += skein_p[7]; - skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; - skein_p[2] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; - skein_p[4] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; - skein_p[6] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; - skein_p[0] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; - skein_p[4] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; - skein_p[6] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; - skein_p[0] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; - skein_p[2] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; - skein_p[6] += skein_p[1]; - skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; - skein_p[0] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; - skein_p[2] += skein_p[5]; - skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; - skein_p[4] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; - skein_p[0] += vectorize(0x0D95DE399746DF03ULL); - skein_p[1] += vectorize(0x8FD1934127C79BCEULL); - skein_p[2] += vectorize(0x9A255629FF352CB1ULL); - skein_p[3] += vectorize(0x5DB62599DF6CA7B0ULL); - skein_p[4] += vectorize(0xEABE394CA9D5C3F4ULL); - skein_p[5] += vectorize(0x891112C71A75B523ULL); - skein_p[6] += vectorize(0x9E18A40B660FCC73ULL); - skein_p[7] += vectorize(0xcab2076d98173eceULL); - skein_p[0] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; - skein_p[2] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; - skein_p[4] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; - skein_p[6] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; - skein_p[2] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; - skein_p[4] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; - skein_p[6] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; - skein_p[0] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; - skein_p[4] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; - skein_p[6] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; - skein_p[0] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; - skein_p[2] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; - skein_p[6] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; - skein_p[0] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; - skein_p[2] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; - skein_p[4] += skein_p[3]; - skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; - skein_p[0] += vectorize(0x8FD1934127C79BCEULL); - skein_p[1] += vectorize(0x9A255629FF352CB1ULL); - skein_p[2] += vectorize(0x5DB62599DF6CA7B0ULL); - skein_p[3] += vectorize(0xEABE394CA9D5C3F4ULL); - skein_p[4] += vectorize(0x991112C71A75B523ULL); - skein_p[5] += vectorize(0x9E18A40B660FCC73ULL); - skein_p[6] += vectorize(0xcab2076d98173ec4ULL + 0x0000000000000040ULL); - skein_p[7] += vectorize(0x4903ADFF749C51CEULL + 11); - skein_p[0] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; - skein_p[2] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; - skein_p[4] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; - skein_p[6] += skein_p[7]; - skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; - skein_p[2] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; - skein_p[4] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; - skein_p[6] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; - skein_p[0] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; - skein_p[4] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; - skein_p[6] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; - skein_p[0] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; - skein_p[2] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; - skein_p[6] += skein_p[1]; - skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; - skein_p[0] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; - skein_p[2] += skein_p[5]; - skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; - skein_p[4] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; - skein_p[0] += vectorize(0x9A255629FF352CB1ULL); - skein_p[1] += vectorize(0x5DB62599DF6CA7B0ULL); - skein_p[2] += vectorize(0xEABE394CA9D5C3F4ULL); - skein_p[3] += vectorize(0x991112C71A75B523ULL); - skein_p[4] += vectorize(0xAE18A40B660FCC33ULL); - skein_p[5] += vectorize(0xcab2076d98173ec4ULL + 0x0000000000000040ULL); - skein_p[6] += vectorize(0x3903ADFF749C51CEULL); - skein_p[7] += vectorize(0x0D95DE399746DF03ULL + 12); - skein_p[0] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; - skein_p[2] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; - skein_p[4] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; - skein_p[6] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; - skein_p[2] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; - skein_p[4] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; - skein_p[6] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; - skein_p[0] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; - skein_p[4] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; - skein_p[6] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; - skein_p[0] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; - skein_p[2] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; - skein_p[6] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; - skein_p[0] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; - skein_p[2] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; - skein_p[4] += skein_p[3]; - skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; - skein_p[0] += vectorize(0x5DB62599DF6CA7B0ULL); - skein_p[1] += vectorize(0xEABE394CA9D5C3F4ULL); - skein_p[2] += vectorize(0x991112C71A75B523ULL); - skein_p[3] += vectorize(0xAE18A40B660FCC33ULL); - skein_p[4] += vectorize(0xcab2076d98173ec4ULL); - skein_p[5] += vectorize(0x3903ADFF749C51CEULL); - skein_p[6] += vectorize(0x0D95DE399746DF03ULL + 0xf000000000000040ULL); - skein_p[7] += vectorize(0x8FD1934127C79BCEULL + 13); - skein_p[0] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; - skein_p[2] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; - skein_p[4] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; - skein_p[6] += skein_p[7]; - skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; - skein_p[2] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; - skein_p[4] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; - skein_p[6] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; - skein_p[0] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; - skein_p[4] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; - skein_p[6] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; - skein_p[0] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; - skein_p[2] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; - skein_p[6] += skein_p[1]; - skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; - skein_p[0] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; - skein_p[2] += skein_p[5]; - skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; - skein_p[4] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; - skein_p[0] += vectorize(0xEABE394CA9D5C3F4ULL); - skein_p[1] += vectorize(0x991112C71A75B523ULL); - skein_p[2] += vectorize(0xAE18A40B660FCC33ULL); - skein_p[3] += vectorize(0xcab2076d98173ec4ULL); - skein_p[4] += vectorize(0x4903ADFF749C51CEULL); - skein_p[5] += vectorize(0x0D95DE399746DF03ULL + 0xf000000000000040ULL); - skein_p[6] += vectorize(0x8FD1934127C79BCEULL + 0x0000000000000040ULL); - skein_p[7] += vectorize(0x9A255629FF352CB1ULL + 14); - skein_p[0] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; - skein_p[2] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; - skein_p[4] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; - skein_p[6] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; - skein_p[2] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; - skein_p[4] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; - skein_p[6] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; - skein_p[0] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; - skein_p[4] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; - skein_p[6] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; - skein_p[0] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; - skein_p[2] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; - skein_p[6] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; - skein_p[0] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; - skein_p[2] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; - skein_p[4] += skein_p[3]; - skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; - skein_p[0] += vectorize(0x991112C71A75B523ULL); - skein_p[1] += vectorize(0xAE18A40B660FCC33ULL); - skein_p[2] += vectorize(0xcab2076d98173ec4ULL); - skein_p[3] += vectorize(0x4903ADFF749C51CEULL); - skein_p[4] += vectorize(0x0D95DE399746DF03ULL); - skein_p[5] += vectorize(0x8FD1934127C79BCEULL + 0x0000000000000040ULL); - skein_p[6] += vectorize(0x8A255629FF352CB1ULL); - skein_p[7] += vectorize(0x5DB62599DF6CA7B0ULL + 15); - skein_p[0] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; - skein_p[2] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; - skein_p[4] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; - skein_p[6] += skein_p[7]; - skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; - skein_p[2] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; - skein_p[4] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; - skein_p[6] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; - skein_p[0] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; - skein_p[4] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; - skein_p[6] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; - skein_p[0] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; - skein_p[2] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; - skein_p[6] += skein_p[1]; - skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; - skein_p[0] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; - skein_p[2] += skein_p[5]; - skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; - skein_p[4] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; - skein_p[0] += vectorize(0xAE18A40B660FCC33ULL); - skein_p[1] += vectorize(0xcab2076d98173ec4ULL); - skein_p[2] += vectorize(0x4903ADFF749C51CEULL); - skein_p[3] += vectorize(0x0D95DE399746DF03ULL); - skein_p[4] += vectorize(0x8FD1934127C79BCEULL); - skein_p[5] += vectorize(0x8A255629FF352CB1ULL); - skein_p[6] += vectorize(0x4DB62599DF6CA7F0ULL); - skein_p[7] += vectorize(0xEABE394CA9D5C3F4ULL +16ULL); - skein_p[0] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 46) ^ skein_p[0]; - skein_p[2] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 36) ^ skein_p[2]; - skein_p[4] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 19) ^ skein_p[4]; - skein_p[6] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 37) ^ skein_p[6]; - skein_p[2] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 33) ^ skein_p[2]; - skein_p[4] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 27) ^ skein_p[4]; - skein_p[6] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 14) ^ skein_p[6]; - skein_p[0] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 42) ^ skein_p[0]; - skein_p[4] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 17) ^ skein_p[4]; - skein_p[6] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 49) ^ skein_p[6]; - skein_p[0] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 36) ^ skein_p[0]; - skein_p[2] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 39) ^ skein_p[2]; - skein_p[6] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 44) ^ skein_p[6]; - skein_p[0] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 9) ^ skein_p[0]; - skein_p[2] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 54) ^ skein_p[2]; - skein_p[4] += skein_p[3]; - skein_p[3] = ROR8(skein_p[3]) ^ skein_p[4]; - skein_p[0] += vectorize(0xcab2076d98173ec4ULL); - skein_p[1] += vectorize(0x4903ADFF749C51CEULL); - skein_p[2] += vectorize(0x0D95DE399746DF03ULL); - skein_p[3] += vectorize(0x8FD1934127C79BCEULL); - skein_p[4] += vectorize(0x9A255629FF352CB1ULL); - skein_p[5] += vectorize(0x4DB62599DF6CA7F0ULL); - skein_p[6] += vectorize(0xEABE394CA9D5C3F4ULL + 0x0000000000000040ULL); - skein_p[7] += vectorize(0x991112C71A75B523ULL + 17); - skein_p[0] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 39) ^ skein_p[0]; - skein_p[2] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 30) ^ skein_p[2]; - skein_p[4] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 34) ^ skein_p[4]; - skein_p[6] += skein_p[7]; - skein_p[7] = ROL24(skein_p[7]) ^ skein_p[6]; - skein_p[2] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 13) ^ skein_p[2]; - skein_p[4] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 50) ^ skein_p[4]; - skein_p[6] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 10) ^ skein_p[6]; - skein_p[0] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 17) ^ skein_p[0]; - skein_p[4] += skein_p[1]; - skein_p[1] = ROL2(skein_p[1], 25) ^ skein_p[4]; - skein_p[6] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 29) ^ skein_p[6]; - skein_p[0] += skein_p[5]; - skein_p[5] = ROL2(skein_p[5], 39) ^ skein_p[0]; - skein_p[2] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 43) ^ skein_p[2]; - skein_p[6] += skein_p[1]; - skein_p[1] = ROL8(skein_p[1]) ^ skein_p[6]; - skein_p[0] += skein_p[7]; - skein_p[7] = ROL2(skein_p[7], 35) ^ skein_p[0]; - skein_p[2] += skein_p[5]; - skein_p[5] = ROR8(skein_p[5]) ^ skein_p[2]; - skein_p[4] += skein_p[3]; - skein_p[3] = ROL2(skein_p[3], 22) ^ skein_p[4]; - skein_p[0] += vectorize(0x4903ADFF749C51CEULL); - skein_p[1] += vectorize(0x0D95DE399746DF03ULL); - skein_p[2] += vectorize(0x8FD1934127C79BCEULL); - skein_p[3] += vectorize(0x9A255629FF352CB1ULL); - skein_p[4] += vectorize(0x5DB62599DF6CA7B0ULL); - skein_p[5] += vectorize(0xEABE394CA9D5C3F4ULL + 0x0000000000000040ULL); - skein_p[6] += vectorize(0x891112C71A75B523ULL); - skein_p[7] += vectorize(0xAE18A40B660FCC33ULL + 18); - -#define h0 skein_p[0] -#define h1 skein_p[1] -#define h2 skein_p[2] -#define h3 skein_p[3] -#define h4 skein_p[4] -#define h5 skein_p[5] -#define h6 skein_p[6] -#define h7 skein_p[7] - h0 ^= h[0]; - h1 ^= h[1]; - h2 ^= h[2]; - h3 ^= h[3]; - h4 ^= h[4]; - h5 ^= h[5]; - h6 ^= h[6]; - h7 ^= h[7]; - - uint2 skein_h8 = h0 ^ h1 ^ h2 ^ h3 ^ h4 ^ h5 ^ h6 ^ h7 ^ vectorize(0x1BD11BDAA9FC1A22ULL); - - uint2 hash64[8]; - - hash64[0] = (h0); -// hash64[1] = (h1); - hash64[2] = (h2); -// hash64[3] = (h3); - hash64[4] = (h4); - hash64[5] = (h5 + vectorizelow(8ULL)); - hash64[6] = (h6 + vectorizehigh(0xff000000UL)); -// hash64[7] = (h7); - - hash64[0] += h1; - hash64[1] = ROL2(h1, 46) ^ hash64[0]; - hash64[2] += h3; - hash64[3] = ROL2(h3, 36) ^ hash64[2]; - hash64[4] += hash64[5]; - hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; - hash64[6] += h7; - hash64[7] = ROL2(h7, 37) ^ hash64[6]; - hash64[2] += hash64[1]; - hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; - hash64[4] += hash64[7]; - hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; - hash64[6] += hash64[5]; - hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; - hash64[0] += hash64[3]; - hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; - hash64[4] += hash64[1]; - hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; - hash64[6] += hash64[3]; - hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; - hash64[0] += hash64[5]; - hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; - hash64[2] += hash64[7]; - hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; - hash64[6] += hash64[1]; - hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; - hash64[0] += hash64[7]; - hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; - hash64[2] += hash64[5]; - hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; - hash64[4] += hash64[3]; - hash64[3] = ROR8(hash64[3]) ^ hash64[4]; - hash64[0] = (hash64[0] + h1); - hash64[1] = (hash64[1] + h2); - hash64[2] = (hash64[2] + h3); - hash64[3] = (hash64[3] + h4); - hash64[4] = (hash64[4] + h5); - hash64[5] = (hash64[5] + h6 + vectorizehigh(0xff000000UL)); - hash64[6] = (hash64[6] + h7 + vectorize(0xff00000000000008ULL)); - hash64[7] = (hash64[7] + skein_h8 + vectorizelow(1)); - hash64[0] += hash64[1]; - hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; - hash64[2] += hash64[3]; - hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; - hash64[4] += hash64[5]; - hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; - hash64[6] += hash64[7]; - hash64[7] = ROL24(hash64[7]) ^ hash64[6]; - hash64[2] += hash64[1]; - hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; - hash64[4] += hash64[7]; - hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; - hash64[6] += hash64[5]; - hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; - hash64[0] += hash64[3]; - hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; - hash64[4] += hash64[1]; - hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; - hash64[6] += hash64[3]; - hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; - hash64[0] += hash64[5]; - hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; - hash64[2] += hash64[7]; - hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; - hash64[6] += hash64[1]; - hash64[1] = ROL8(hash64[1]) ^ hash64[6]; - hash64[0] += hash64[7]; - hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; - hash64[2] += hash64[5]; - hash64[5] = ROR8(hash64[5]) ^ hash64[2]; - hash64[4] += hash64[3]; - hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; - hash64[0] = (hash64[0] + h2); - hash64[1] = (hash64[1] + h3); - hash64[2] = (hash64[2] + h4); - hash64[3] = (hash64[3] + h5); - hash64[4] = (hash64[4] + h6); - hash64[5] = (hash64[5] + h7 + vectorize(0xff00000000000008ULL)); - hash64[6] = (hash64[6] + skein_h8 + vectorizelow(8ULL)); - hash64[7] = (hash64[7] + h0 + vectorize(2)); - hash64[0] += hash64[1]; - hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; - hash64[2] += hash64[3]; - hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; - hash64[4] += hash64[5]; - hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; - hash64[6] += hash64[7]; - hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; - hash64[2] += hash64[1]; - hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; - hash64[4] += hash64[7]; - hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; - hash64[6] += hash64[5]; - hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; - hash64[0] += hash64[3]; - hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; - hash64[4] += hash64[1]; - hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; - hash64[6] += hash64[3]; - hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; - hash64[0] += hash64[5]; - hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; - hash64[2] += hash64[7]; - hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; - hash64[6] += hash64[1]; - hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; - hash64[0] += hash64[7]; - hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; - hash64[2] += hash64[5]; - hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; - hash64[4] += hash64[3]; - hash64[3] = ROR8(hash64[3]) ^ hash64[4]; - hash64[0] = (hash64[0] + h3); - hash64[1] = (hash64[1] + h4); - hash64[2] = (hash64[2] + h5); - hash64[3] = (hash64[3] + h6); - hash64[4] = (hash64[4] + h7); - hash64[5] = (hash64[5] + skein_h8 + vectorizelow(8)); - hash64[6] = (hash64[6] + h0 + vectorizehigh(0xff000000UL)); - hash64[7] = (hash64[7] + h1 + vectorizelow(3)); - hash64[0] += hash64[1]; - hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; - hash64[2] += hash64[3]; - hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; - hash64[4] += hash64[5]; - hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; - hash64[6] += hash64[7]; - hash64[7] = ROL24(hash64[7]) ^ hash64[6]; - hash64[2] += hash64[1]; - hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; - hash64[4] += hash64[7]; - hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; - hash64[6] += hash64[5]; - hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; - hash64[0] += hash64[3]; - hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; - hash64[4] += hash64[1]; - hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; - hash64[6] += hash64[3]; - hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; - hash64[0] += hash64[5]; - hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; - hash64[2] += hash64[7]; - hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; - hash64[6] += hash64[1]; - hash64[1] = ROL8(hash64[1]) ^ hash64[6]; - hash64[0] += hash64[7]; - hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; - hash64[2] += hash64[5]; - hash64[5] = ROR8(hash64[5]) ^ hash64[2]; - hash64[4] += hash64[3]; - hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; - hash64[0] = (hash64[0] + h4); - hash64[1] = (hash64[1] + h5); - hash64[2] = (hash64[2] + h6); - hash64[3] = (hash64[3] + h7); - hash64[4] = (hash64[4] + skein_h8); - hash64[5] = (hash64[5] + h0 + vectorizehigh(0xff000000UL)); - hash64[6] = (hash64[6] + h1 + vectorize(0xff00000000000008ULL)); - hash64[7] = (hash64[7] + h2 + vectorizelow(4)); - hash64[0] += hash64[1]; - hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; - hash64[2] += hash64[3]; - hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; - hash64[4] += hash64[5]; - hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; - hash64[6] += hash64[7]; - hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; - hash64[2] += hash64[1]; - hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; - hash64[4] += hash64[7]; - hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; - hash64[6] += hash64[5]; - hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; - hash64[0] += hash64[3]; - hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; - hash64[4] += hash64[1]; - hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; - hash64[6] += hash64[3]; - hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; - hash64[0] += hash64[5]; - hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; - hash64[2] += hash64[7]; - hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; - hash64[6] += hash64[1]; - hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; - hash64[0] += hash64[7]; - hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; - hash64[2] += hash64[5]; - hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; - hash64[4] += hash64[3]; - hash64[3] = ROR8(hash64[3]) ^ hash64[4]; - hash64[0] = (hash64[0] + h5); - hash64[1] = (hash64[1] + h6); - hash64[2] = (hash64[2] + h7); - hash64[3] = (hash64[3] + skein_h8); - hash64[4] = (hash64[4] + h0); - hash64[5] = (hash64[5] + h1 + vectorize(0xff00000000000008ULL)); - hash64[6] = (hash64[6] + h2 + vectorizelow(8ULL)); - hash64[7] = (hash64[7] + h3 + vectorizelow(5)); - hash64[0] += hash64[1]; - hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; - hash64[2] += hash64[3]; - hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; - hash64[4] += hash64[5]; - hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; - hash64[6] += hash64[7]; - hash64[7] = ROL24(hash64[7]) ^ hash64[6]; - hash64[2] += hash64[1]; - hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; - hash64[4] += hash64[7]; - hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; - hash64[6] += hash64[5]; - hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; - hash64[0] += hash64[3]; - hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; - hash64[4] += hash64[1]; - hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; - hash64[6] += hash64[3]; - hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; - hash64[0] += hash64[5]; - hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; - hash64[2] += hash64[7]; - hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; - hash64[6] += hash64[1]; - hash64[1] = ROL8(hash64[1]) ^ hash64[6]; - hash64[0] += hash64[7]; - hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; - hash64[2] += hash64[5]; - hash64[5] = ROR8(hash64[5]) ^ hash64[2]; - hash64[4] += hash64[3]; - hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; - hash64[0] = (hash64[0] + h6); - hash64[1] = (hash64[1] + h7); - hash64[2] = (hash64[2] + skein_h8); - hash64[3] = (hash64[3] + h0); - hash64[4] = (hash64[4] + h1); - hash64[5] = (hash64[5] + h2 + vectorizelow(8ULL)); - hash64[6] = (hash64[6] + h3 + vectorizehigh(0xff000000UL)); - hash64[7] = (hash64[7] + h4 + vectorizelow(6)); - hash64[0] += hash64[1]; - hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; - hash64[2] += hash64[3]; - hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; - hash64[4] += hash64[5]; - hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; - hash64[6] += hash64[7]; - hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; - hash64[2] += hash64[1]; - hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; - hash64[4] += hash64[7]; - hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; - hash64[6] += hash64[5]; - hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; - hash64[0] += hash64[3]; - hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; - hash64[4] += hash64[1]; - hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; - hash64[6] += hash64[3]; - hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; - hash64[0] += hash64[5]; - hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; - hash64[2] += hash64[7]; - hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; - hash64[6] += hash64[1]; - hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; - hash64[0] += hash64[7]; - hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; - hash64[2] += hash64[5]; - hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; - hash64[4] += hash64[3]; - hash64[3] = ROR8(hash64[3]) ^ hash64[4]; - hash64[0] = (hash64[0] + h7); - hash64[1] = (hash64[1] + skein_h8); - hash64[2] = (hash64[2] + h0); - hash64[3] = (hash64[3] + h1); - hash64[4] = (hash64[4] + h2); - hash64[5] = (hash64[5] + h3 + vectorizehigh(0xff000000UL)); - hash64[6] = (hash64[6] + h4 + vectorize(0xff00000000000008ULL)); - hash64[7] = (hash64[7] + h5 + vectorizelow(7)); - hash64[0] += hash64[1]; - hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; - hash64[2] += hash64[3]; - hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; - hash64[4] += hash64[5]; - hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; - hash64[6] += hash64[7]; - hash64[7] = ROL24(hash64[7]) ^ hash64[6]; - hash64[2] += hash64[1]; - hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; - hash64[4] += hash64[7]; - hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; - hash64[6] += hash64[5]; - hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; - hash64[0] += hash64[3]; - hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; - hash64[4] += hash64[1]; - hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; - hash64[6] += hash64[3]; - hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; - hash64[0] += hash64[5]; - hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; - hash64[2] += hash64[7]; - hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; - hash64[6] += hash64[1]; - hash64[1] = ROL8(hash64[1]) ^ hash64[6]; - hash64[0] += hash64[7]; - hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; - hash64[2] += hash64[5]; - hash64[5] = ROR8(hash64[5]) ^ hash64[2]; - hash64[4] += hash64[3]; - hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; - hash64[0] = (hash64[0] + skein_h8); - hash64[1] = (hash64[1] + h0); - hash64[2] = (hash64[2] + h1); - hash64[3] = (hash64[3] + h2); - hash64[4] = (hash64[4] + h3); - hash64[5] = (hash64[5] + h4 + vectorize(0xff00000000000008ULL)); - hash64[6] = (hash64[6] + h5 + vectorizelow(8)); - hash64[7] = (hash64[7] + h6 + vectorizelow(8)); - hash64[0] += hash64[1]; - hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; - hash64[2] += hash64[3]; - hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; - hash64[4] += hash64[5]; - hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; - hash64[6] += hash64[7]; - hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; - hash64[2] += hash64[1]; - hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; - hash64[4] += hash64[7]; - hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; - hash64[6] += hash64[5]; - hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; - hash64[0] += hash64[3]; - hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; - hash64[4] += hash64[1]; - hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; - hash64[6] += hash64[3]; - hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; - hash64[0] += hash64[5]; - hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; - hash64[2] += hash64[7]; - hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; - hash64[6] += hash64[1]; - hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; - hash64[0] += hash64[7]; - hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; - hash64[2] += hash64[5]; - hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; - hash64[4] += hash64[3]; - hash64[3] = ROR8(hash64[3]) ^ hash64[4]; - hash64[0] = (hash64[0] + h0); - hash64[1] = (hash64[1] + h1); - hash64[2] = (hash64[2] + h2); - hash64[3] = (hash64[3] + h3); - hash64[4] = (hash64[4] + h4); - hash64[5] = (hash64[5] + h5 + vectorizelow(8)); - hash64[6] = (hash64[6] + h6 + vectorizehigh(0xff000000UL)); - hash64[7] = (hash64[7] + h7 + vectorizelow(9)); - hash64[0] += hash64[1]; - hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; - hash64[2] += hash64[3]; - hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; - hash64[4] += hash64[5]; - hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; - hash64[6] += hash64[7]; - hash64[7] = ROL24(hash64[7]) ^ hash64[6]; - hash64[2] += hash64[1]; - hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; - hash64[4] += hash64[7]; - hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; - hash64[6] += hash64[5]; - hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; - hash64[0] += hash64[3]; - hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; - hash64[4] += hash64[1]; - hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; - hash64[6] += hash64[3]; - hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; - hash64[0] += hash64[5]; - hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; - hash64[2] += hash64[7]; - hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; - hash64[6] += hash64[1]; - hash64[1] = ROL8(hash64[1]) ^ hash64[6]; - hash64[0] += hash64[7]; - hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; - hash64[2] += hash64[5]; - hash64[5] = ROR8(hash64[5]) ^ hash64[2]; - hash64[4] += hash64[3]; - hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; - - hash64[0] = (hash64[0] + h1); - hash64[1] = (hash64[1] + h2); - hash64[2] = (hash64[2] + h3); - hash64[3] = (hash64[3] + h4); - hash64[4] = (hash64[4] + h5); - hash64[5] = (hash64[5] + h6 + vectorizehigh(0xff000000UL)); - hash64[6] = (hash64[6] + h7 + vectorize(0xff00000000000008ULL)); - hash64[7] = (hash64[7] + skein_h8 + (vectorizelow(10))); - hash64[0] += hash64[1]; - hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; - hash64[2] += hash64[3]; - hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; - hash64[4] += hash64[5]; - hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; - hash64[6] += hash64[7]; - hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; - hash64[2] += hash64[1]; - hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; - hash64[4] += hash64[7]; - hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; - hash64[6] += hash64[5]; - hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; - hash64[0] += hash64[3]; - hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; - hash64[4] += hash64[1]; - hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; - hash64[6] += hash64[3]; - hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; - hash64[0] += hash64[5]; - hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; - hash64[2] += hash64[7]; - hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; - hash64[6] += hash64[1]; - hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; - hash64[0] += hash64[7]; - hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; - hash64[2] += hash64[5]; - hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; - hash64[4] += hash64[3]; - hash64[3] = ROR8(hash64[3]) ^ hash64[4]; - hash64[0] = (hash64[0] + h2); - hash64[1] = (hash64[1] + h3); - hash64[2] = (hash64[2] + h4); - hash64[3] = (hash64[3] + h5); - hash64[4] = (hash64[4] + h6); - hash64[5] = (hash64[5] + h7 + vectorize(0xff00000000000008ULL)); - hash64[6] = (hash64[6] + skein_h8 + vectorizelow(8ULL)); - hash64[7] = (hash64[7] + h0 + vectorizelow(11)); - hash64[0] += hash64[1]; - hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; - hash64[2] += hash64[3]; - hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; - hash64[4] += hash64[5]; - hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; - hash64[6] += hash64[7]; - hash64[7] = ROL24(hash64[7]) ^ hash64[6]; - hash64[2] += hash64[1]; - hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; - hash64[4] += hash64[7]; - hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; - hash64[6] += hash64[5]; - hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; - hash64[0] += hash64[3]; - hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; - hash64[4] += hash64[1]; - hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; - hash64[6] += hash64[3]; - hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; - hash64[0] += hash64[5]; - hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; - hash64[2] += hash64[7]; - hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; - hash64[6] += hash64[1]; - hash64[1] = ROL8(hash64[1]) ^ hash64[6]; - hash64[0] += hash64[7]; - hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; - hash64[2] += hash64[5]; - hash64[5] = ROR8(hash64[5]) ^ hash64[2]; - hash64[4] += hash64[3]; - hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; - hash64[0] = (hash64[0] + h3); - hash64[1] = (hash64[1] + h4); - hash64[2] = (hash64[2] + h5); - hash64[3] = (hash64[3] + h6); - hash64[4] = (hash64[4] + h7); - hash64[5] = (hash64[5] + skein_h8 + vectorizelow(8)); - hash64[6] = (hash64[6] + h0 + vectorizehigh(0xff000000UL)); - hash64[7] = (hash64[7] + h1 + vectorizelow(12)); - hash64[0] += hash64[1]; - hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; - hash64[2] += hash64[3]; - hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; - hash64[4] += hash64[5]; - hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; - hash64[6] += hash64[7]; - hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; - hash64[2] += hash64[1]; - hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; - hash64[4] += hash64[7]; - hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; - hash64[6] += hash64[5]; - hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; - hash64[0] += hash64[3]; - hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; - hash64[4] += hash64[1]; - hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; - hash64[6] += hash64[3]; - hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; - hash64[0] += hash64[5]; - hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; - hash64[2] += hash64[7]; - hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; - hash64[6] += hash64[1]; - hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; - hash64[0] += hash64[7]; - hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; - hash64[2] += hash64[5]; - hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; - hash64[4] += hash64[3]; - hash64[3] = ROR8(hash64[3]) ^ hash64[4]; - hash64[0] = (hash64[0] + h4); - hash64[1] = (hash64[1] + h5); - hash64[2] = (hash64[2] + h6); - hash64[3] = (hash64[3] + h7); - hash64[4] = (hash64[4] + skein_h8); - hash64[5] = (hash64[5] + h0 + vectorizehigh(0xff000000UL)); - hash64[6] = (hash64[6] + h1 + vectorize(0xff00000000000008ULL)); - hash64[7] = (hash64[7] + h2 + vectorizelow(13)); - hash64[0] += hash64[1]; - hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; - hash64[2] += hash64[3]; - hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; - hash64[4] += hash64[5]; - hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; - hash64[6] += hash64[7]; - hash64[7] = ROL24(hash64[7]) ^ hash64[6]; - hash64[2] += hash64[1]; - hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; - hash64[4] += hash64[7]; - hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; - hash64[6] += hash64[5]; - hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; - hash64[0] += hash64[3]; - hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; - hash64[4] += hash64[1]; - hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; - hash64[6] += hash64[3]; - hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; - hash64[0] += hash64[5]; - hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; - hash64[2] += hash64[7]; - hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; - hash64[6] += hash64[1]; - hash64[1] = ROL8(hash64[1]) ^ hash64[6]; - hash64[0] += hash64[7]; - hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; - hash64[2] += hash64[5]; - hash64[5] = ROR8(hash64[5]) ^ hash64[2]; - hash64[4] += hash64[3]; - hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; - hash64[0] = (hash64[0] + h5); - hash64[1] = (hash64[1] + h6); - hash64[2] = (hash64[2] + h7); - hash64[3] = (hash64[3] + skein_h8); - hash64[4] = (hash64[4] + h0); - hash64[5] = (hash64[5] + h1 + vectorize(0xff00000000000008ULL)); - hash64[6] = (hash64[6] + h2 + vectorizelow(8ULL)); - hash64[7] = (hash64[7] + h3 + vectorizelow(14)); - hash64[0] += hash64[1]; - hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; - hash64[2] += hash64[3]; - hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; - hash64[4] += hash64[5]; - hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; - hash64[6] += hash64[7]; - hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; - hash64[2] += hash64[1]; - hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; - hash64[4] += hash64[7]; - hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; - hash64[6] += hash64[5]; - hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; - hash64[0] += hash64[3]; - hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; - hash64[4] += hash64[1]; - hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; - hash64[6] += hash64[3]; - hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; - hash64[0] += hash64[5]; - hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; - hash64[2] += hash64[7]; - hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; - hash64[6] += hash64[1]; - hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; - hash64[0] += hash64[7]; - hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; - hash64[2] += hash64[5]; - hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; - hash64[4] += hash64[3]; - hash64[3] = ROR8(hash64[3]) ^ hash64[4]; - hash64[0] = (hash64[0] + h6); - hash64[1] = (hash64[1] + h7); - hash64[2] = (hash64[2] + skein_h8); - hash64[3] = (hash64[3] + h0); - hash64[4] = (hash64[4] + h1); - hash64[5] = (hash64[5] + h2 + vectorizelow(8ULL)); - hash64[6] = (hash64[6] + h3 + vectorizehigh(0xff000000UL)); - hash64[7] = (hash64[7] + h4 + vectorizelow(15)); - hash64[0] += hash64[1]; - hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; - hash64[2] += hash64[3]; - hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; - hash64[4] += hash64[5]; - hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; - hash64[6] += hash64[7]; - hash64[7] = ROL24(hash64[7]) ^ hash64[6]; - hash64[2] += hash64[1]; - hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; - hash64[4] += hash64[7]; - hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; - hash64[6] += hash64[5]; - hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; - hash64[0] += hash64[3]; - hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; - hash64[4] += hash64[1]; - hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; - hash64[6] += hash64[3]; - hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; - hash64[0] += hash64[5]; - hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; - hash64[2] += hash64[7]; - hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; - hash64[6] += hash64[1]; - hash64[1] = ROL8(hash64[1]) ^ hash64[6]; - hash64[0] += hash64[7]; - hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; - hash64[2] += hash64[5]; - hash64[5] = ROR8(hash64[5]) ^ hash64[2]; - hash64[4] += hash64[3]; - hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; - hash64[0] = (hash64[0] + h7); - hash64[1] = (hash64[1] + skein_h8); - hash64[2] = (hash64[2] + h0); - hash64[3] = (hash64[3] + h1); - hash64[4] = (hash64[4] + h2); - hash64[5] = (hash64[5] + h3 + vectorizehigh(0xff000000UL)); - hash64[6] = (hash64[6] + h4 + vectorize(0xff00000000000008ULL)); - hash64[7] = (hash64[7] + h5 + vectorizelow(16)); - hash64[0] += hash64[1]; - hash64[1] = ROL2(hash64[1], 46) ^ hash64[0]; - hash64[2] += hash64[3]; - hash64[3] = ROL2(hash64[3], 36) ^ hash64[2]; - hash64[4] += hash64[5]; - hash64[5] = ROL2(hash64[5], 19) ^ hash64[4]; - hash64[6] += hash64[7]; - hash64[7] = ROL2(hash64[7], 37) ^ hash64[6]; - hash64[2] += hash64[1]; - hash64[1] = ROL2(hash64[1], 33) ^ hash64[2]; - hash64[4] += hash64[7]; - hash64[7] = ROL2(hash64[7], 27) ^ hash64[4]; - hash64[6] += hash64[5]; - hash64[5] = ROL2(hash64[5], 14) ^ hash64[6]; - hash64[0] += hash64[3]; - hash64[3] = ROL2(hash64[3], 42) ^ hash64[0]; - hash64[4] += hash64[1]; - hash64[1] = ROL2(hash64[1], 17) ^ hash64[4]; - hash64[6] += hash64[3]; - hash64[3] = ROL2(hash64[3], 49) ^ hash64[6]; - hash64[0] += hash64[5]; - hash64[5] = ROL2(hash64[5], 36) ^ hash64[0]; - hash64[2] += hash64[7]; - hash64[7] = ROL2(hash64[7], 39) ^ hash64[2]; - hash64[6] += hash64[1]; - hash64[1] = ROL2(hash64[1], 44) ^ hash64[6]; - hash64[0] += hash64[7]; - hash64[7] = ROL2(hash64[7], 9) ^ hash64[0]; - hash64[2] += hash64[5]; - hash64[5] = ROL2(hash64[5], 54) ^ hash64[2]; - hash64[4] += hash64[3]; - hash64[3] = ROR8(hash64[3]) ^ hash64[4]; - hash64[0] = (hash64[0] + skein_h8); - hash64[1] = (hash64[1] + h0); - hash64[2] = (hash64[2] + h1); - hash64[3] = (hash64[3] + h2); - hash64[4] = (hash64[4] + h3); - hash64[5] = (hash64[5] + h4 + vectorize(0xff00000000000008ULL)); - hash64[6] = (hash64[6] + h5 + vectorizelow(8ULL)); - hash64[7] = (hash64[7] + h6 + vectorizelow(17)); - hash64[0] += hash64[1]; - hash64[1] = ROL2(hash64[1], 39) ^ hash64[0]; - hash64[2] += hash64[3]; - hash64[3] = ROL2(hash64[3], 30) ^ hash64[2]; - hash64[4] += hash64[5]; - hash64[5] = ROL2(hash64[5], 34) ^ hash64[4]; - hash64[6] += hash64[7]; - hash64[7] = ROL24(hash64[7]) ^ hash64[6]; - hash64[2] += hash64[1]; - hash64[1] = ROL2(hash64[1], 13) ^ hash64[2]; - hash64[4] += hash64[7]; - hash64[7] = ROL2(hash64[7], 50) ^ hash64[4]; - hash64[6] += hash64[5]; - hash64[5] = ROL2(hash64[5], 10) ^ hash64[6]; - hash64[0] += hash64[3]; - hash64[3] = ROL2(hash64[3], 17) ^ hash64[0]; - hash64[4] += hash64[1]; - hash64[1] = ROL2(hash64[1], 25) ^ hash64[4]; - hash64[6] += hash64[3]; - hash64[3] = ROL2(hash64[3], 29) ^ hash64[6]; - hash64[0] += hash64[5]; - hash64[5] = ROL2(hash64[5], 39) ^ hash64[0]; - hash64[2] += hash64[7]; - hash64[7] = ROL2(hash64[7], 43) ^ hash64[2]; - hash64[6] += hash64[1]; - hash64[1] = ROL8(hash64[1]) ^ hash64[6]; - hash64[0] += hash64[7]; - hash64[7] = ROL2(hash64[7], 35) ^ hash64[0]; - hash64[2] += hash64[5]; - hash64[5] = ROR8(hash64[5]) ^ hash64[2]; - hash64[4] += hash64[3]; - hash64[3] = ROL2(hash64[3], 22) ^ hash64[4]; - - Hash[0] = devectorize(hash64[0] + h0); - Hash[1] = devectorize(hash64[1] + h1); - Hash[2] = devectorize(hash64[2] + h2); - Hash[3] = devectorize(hash64[3] + h3); - Hash[4] = devectorize(hash64[4] + h4); - Hash[5] = devectorize(hash64[5] + h5)+ 8; - Hash[6] = devectorize(hash64[6] + h6)+ 0xff00000000000000ULL; - Hash[7] = devectorize(hash64[7] + h7)+ 18; - -#undef h0 -#undef h1 -#undef h2 -#undef h3 -#undef h4 -#undef h5 -#undef h6 -#undef h7 - } -} - -#if __CUDA_ARCH__ > 500 -#define tp 448 -#else -#define tp 128 -#endif - -__host__ -void quark_skein512_cpu_hash_64(uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash) -{ - dim3 grid((threads + tp - 1) / tp); - dim3 block(tp); - quark_skein512_gpu_hash_64 << > >(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); - -} - -#ifdef WANT_SKEIN_80 - -__host__ void quark_skein512_cpu_init(int thr_id) -{ - cudaMalloc(&d_nonce[thr_id], 2*sizeof(uint32_t)); -} - -__host__ void quark_skein512_setTarget(const void *ptarget) -{ -} -__host__ void quark_skein512_cpu_free(int32_t thr_id) -{ - cudaFree(d_nonce[thr_id]); -} - - -static __device__ __constant__ uint32_t sha256_hashTable[] = { - 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 -}; - - -/* Elementary functions used by SHA256 */ -#define SWAB32(x) cuda_swab32(x) -//#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) - -#define R(x, n) ((x) >> (n)) -#define Ch(x, y, z) ((x & (y ^ z)) ^ z) -#define Maj(x, y, z) ((x & (y | z)) | (y & z)) -#define S0(x) (ROTR32(x, 2) ^ ROTR32(x, 13) ^ ROTR32(x, 22)) -#define S1(x) (ROTR32(x, 6) ^ ROTR32(x, 11) ^ ROTR32(x, 25)) -#define s0(x) (ROTR32(x, 7) ^ ROTR32(x, 18) ^ R(x, 3)) -#define s1(x) (ROTR32(x, 17) ^ ROTR32(x, 19) ^ R(x, 10)) - - -__constant__ uint32_t sha256_constantTable[64] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -}; - -__global__ __launch_bounds__(1024) -void skein512_gpu_hash_80_52(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ d_found, uint64_t target) -{ - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint2 h0, h1, h2, h3, h4, h5, h6, h7, h8; - uint2 t0, t1, t2; - uint2 p[8]; - - h0 = precalcvalues[0]; - h1 = precalcvalues[1]; - h2 = precalcvalues[2]; - h3 = precalcvalues[3]; - h4 = precalcvalues[4]; - h5 = precalcvalues[5]; - h6 = precalcvalues[6]; - h7 = precalcvalues[7]; - t2 = precalcvalues[8]; - - const uint2 nounce2 = make_uint2(_LOWORD(c_PaddedMessage16[1]), cuda_swab32(startNounce + thread)); - - // skein_big_close -> etype = 0x160, ptr = 16, bcount = 1, extra = 16 - p[0] = vectorize(c_PaddedMessage16[0]); - p[1] = nounce2; - - #pragma unroll - for (int i = 2; i < 8; i++) - p[i] = make_uint2(0,0); - - t0 = vectorizelow(0x50ull); // SPH_T64(bcount << 6) + (sph_u64)(extra); - t1 = vectorizehigh(0xB0000000ul); // (bcount >> 58) + ((sph_u64)(etype) << 55); - TFBIG_KINIT_UI2(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); - TFBIG_4e_UI2(0); - TFBIG_4o_UI2(1); - TFBIG_4e_UI2(2); - TFBIG_4o_UI2(3); - TFBIG_4e_UI2(4); - TFBIG_4o_UI2(5); - TFBIG_4e_UI2(6); - TFBIG_4o_UI2(7); - TFBIG_4e_UI2(8); - TFBIG_4o_UI2(9); - TFBIG_4e_UI2(10); - TFBIG_4o_UI2(11); - TFBIG_4e_UI2(12); - TFBIG_4o_UI2(13); - TFBIG_4e_UI2(14); - TFBIG_4o_UI2(15); - TFBIG_4e_UI2(16); - TFBIG_4o_UI2(17); - TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); - - - t0 = vectorizelow(8); // extra - t1 = vectorizehigh(0xFF000000ul); // etype - - h0 = vectorize(c_PaddedMessage16[0]) ^ p[0]; - h1 = nounce2 ^ p[1]; - h2 = p[2]; - h3 = p[3]; - h4 = p[4]; - h5 = p[5]; - h6 = p[6]; - h7 = p[7]; - - h8 = h0 ^ h1 ^ p[2] ^ p[3] ^ p[4] ^ p[5] ^ p[6] ^ p[7] ^ vectorize(0x1BD11BDAA9FC1A22); - t2 = vectorize(0xFF00000000000008ull); - - // p[8] = { 0 }; - #pragma unroll 8 - for (int i = 0; i<8; i++) - p[i] = make_uint2(0, 0); - - TFBIG_4e_UI2(0); - TFBIG_4o_UI2(1); - TFBIG_4e_UI2(2); - TFBIG_4o_UI2(3); - TFBIG_4e_UI2(4); - TFBIG_4o_UI2(5); - TFBIG_4e_UI2(6); - TFBIG_4o_UI2(7); - TFBIG_4e_UI2(8); - TFBIG_4o_UI2(9); - TFBIG_4e_UI2(10); - TFBIG_4o_UI2(11); - TFBIG_4e_UI2(12); - TFBIG_4o_UI2(13); - TFBIG_4e_UI2(14); - TFBIG_4o_UI2(15); - TFBIG_4e_UI2(16); - TFBIG_4o_UI2(17); - TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); - - uint32_t *message = (uint32_t *)p; - - uint32_t W1[16]; - uint32_t W2[16]; - - uint32_t regs[8]; - uint32_t hash[8]; - - // Init with Hash-Table -#pragma unroll 8 - for (int k = 0; k < 8; k++) - { - hash[k] = regs[k] = sha256_hashTable[k]; - } - -#pragma unroll 16 - for (int k = 0; k<16; k++) - W1[k] = SWAB32(message[k]); - - // Progress W1 -#pragma unroll 16 - for (int j = 0; j<16; j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j] + W1[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - -#pragma unroll 7 - for (int k = 6; k >= 0; k--) regs[k + 1] = regs[k]; - regs[0] = T1 + T2; - regs[4] += T1; - } - - // Progress W2...W3 - - ////// PART 1 -#pragma unroll 2 - for (int j = 0; j<2; j++) - W2[j] = s1(W1[14 + j]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; -#pragma unroll 5 - for (int j = 2; j<7; j++) - W2[j] = s1(W2[j - 2]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; - -#pragma unroll 8 - for (int j = 7; j<15; j++) - W2[j] = s1(W2[j - 2]) + W2[j - 7] + s0(W1[1 + j]) + W1[j]; - - W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; - - // Round function -#pragma unroll 16 - for (int j = 0; j<16; j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 16] + W2[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - -#pragma unroll 7 - for (int l = 6; l >= 0; l--) regs[l + 1] = regs[l]; - regs[0] = T1 + T2; - regs[4] += T1; - } - - ////// PART 2 -#pragma unroll 2 - for (int j = 0; j<2; j++) - W1[j] = s1(W2[14 + j]) + W2[9 + j] + s0(W2[1 + j]) + W2[j]; - -#pragma unroll 5 - for (int j = 2; j<7; j++) - W1[j] = s1(W1[j - 2]) + W2[9 + j] + s0(W2[1 + j]) + W2[j]; - -#pragma unroll 8 - for (int j = 7; j<15; j++) - W1[j] = s1(W1[j - 2]) + W1[j - 7] + s0(W2[1 + j]) + W2[j]; - - W1[15] = s1(W1[13]) + W1[8] + s0(W1[0]) + W2[15]; - - // Round function -#pragma unroll 16 - for (int j = 0; j<16; j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 32] + W1[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - -#pragma unroll 7 - for (int l = 6; l >= 0; l--) regs[l + 1] = regs[l]; - regs[0] = T1 + T2; - regs[4] += T1; - } - - ////// PART 3 -#pragma unroll 2 - for (int j = 0; j<2; j++) - W2[j] = s1(W1[14 + j]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; - -#pragma unroll 5 - for (int j = 2; j<7; j++) - W2[j] = s1(W2[j - 2]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; - -#pragma unroll 8 - for (int j = 7; j<15; j++) - W2[j] = s1(W2[j - 2]) + W2[j - 7] + s0(W1[1 + j]) + W1[j]; - - W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; - - // Round function -#pragma unroll 16 - for (int j = 0; j<16; j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 48] + W2[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - -#pragma unroll 7 - for (int l = 6; l >= 0; l--) regs[l + 1] = regs[l]; - regs[0] = T1 + T2; - regs[4] += T1; - } - -#pragma unroll 8 - for (int k = 0; k<8; k++) - hash[k] += regs[k]; - - ///// - ///// Second Pass (ending) - ///// -#pragma unroll 8 - for (int k = 0; k<8; k++) - regs[k] = hash[k]; - - // Progress W1 - uint32_t T1, T2; -#pragma unroll - for (int j = 0; j<56; j++) - { - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_endingTable[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - -#pragma unroll 7 - for (int k = 6; k >= 0; k--) - regs[k + 1] = regs[k]; - regs[0] = T1 + T2; - regs[4] += T1; - } - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_endingTable[56]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - regs[7] = T1 + T2; - regs[3] += T1; - - T1 = regs[6] + S1(regs[3]) + Ch(regs[3], regs[4], regs[5]) + sha256_endingTable[57]; - T2 = S0(regs[7]) + Maj(regs[7], regs[0], regs[1]); - regs[6] = T1 + T2; - regs[2] += T1; - //************ - regs[1] += regs[5] + S1(regs[2]) + Ch(regs[2], regs[3], regs[4]) + sha256_endingTable[58]; - regs[0] += regs[4] + S1(regs[1]) + Ch(regs[1], regs[2], regs[3]) + sha256_endingTable[59]; - regs[7] += regs[3] + S1(regs[0]) + Ch(regs[0], regs[1], regs[2]) + sha256_endingTable[60]; - regs[6] += regs[2] + S1(regs[7]) + Ch(regs[7], regs[0], regs[1]) + sha256_endingTable[61]; - - uint64_t test = SWAB32(hash[7] + regs[7]); - test <<= 32; - test |= SWAB32(hash[6] + regs[6]); - if (test <= target) - { - uint32_t tmp = atomicExch(&(d_found[0]), startNounce + thread); - if (tmp != 0xffffffff) - d_found[1] = startNounce + thread; - } - } -} -__global__ -void skein512_gpu_hash_80_50(uint32_t threads, uint32_t startNounce, uint32_t *const __restrict__ d_found, uint64_t target) -{ - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint2 h0, h1, h2, h3, h4, h5, h6, h7, h8; - uint2 t0, t1, t2; - uint2 p[8]; - - h0 = precalcvalues[0]; - h1 = precalcvalues[1]; - h2 = precalcvalues[2]; - h3 = precalcvalues[3]; - h4 = precalcvalues[4]; - h5 = precalcvalues[5]; - h6 = precalcvalues[6]; - h7 = precalcvalues[7]; - t2 = precalcvalues[8]; - - const uint2 nounce2 = make_uint2(_LOWORD(c_PaddedMessage16[1]), cuda_swab32(startNounce + thread)); - - // skein_big_close -> etype = 0x160, ptr = 16, bcount = 1, extra = 16 - p[0] = vectorize(c_PaddedMessage16[0]); - p[1] = nounce2; - -#pragma unroll - for (int i = 2; i < 8; i++) - p[i] = make_uint2(0, 0); - - t0 = vectorizelow(0x50ull); // SPH_T64(bcount << 6) + (sph_u64)(extra); - t1 = vectorizehigh(0xB0000000ul); // (bcount >> 58) + ((sph_u64)(etype) << 55); - TFBIG_KINIT_UI2(h0, h1, h2, h3, h4, h5, h6, h7, h8, t0, t1, t2); - TFBIG_4e_UI2(0); - TFBIG_4o_UI2(1); - TFBIG_4e_UI2(2); - TFBIG_4o_UI2(3); - TFBIG_4e_UI2(4); - TFBIG_4o_UI2(5); - TFBIG_4e_UI2(6); - TFBIG_4o_UI2(7); - TFBIG_4e_UI2(8); - TFBIG_4o_UI2(9); - TFBIG_4e_UI2(10); - TFBIG_4o_UI2(11); - TFBIG_4e_UI2(12); - TFBIG_4o_UI2(13); - TFBIG_4e_UI2(14); - TFBIG_4o_UI2(15); - TFBIG_4e_UI2(16); - TFBIG_4o_UI2(17); - TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); - - - t0 = vectorizelow(8); // extra - t1 = vectorizehigh(0xFF000000ul); // etype - - h0 = vectorize(c_PaddedMessage16[0]) ^ p[0]; - h1 = nounce2 ^ p[1]; - h2 = p[2]; - h3 = p[3]; - h4 = p[4]; - h5 = p[5]; - h6 = p[6]; - h7 = p[7]; - - h8 = h0 ^ h1 ^ p[2] ^ p[3] ^ p[4] ^ p[5] ^ p[6] ^ p[7] ^ vectorize(0x1BD11BDAA9FC1A22); - t2 = vectorize(0xFF00000000000008ull); - - // p[8] = { 0 }; -#pragma unroll 8 - for (int i = 0; i<8; i++) - p[i] = make_uint2(0, 0); - - TFBIG_4e_UI2(0); - TFBIG_4o_UI2(1); - TFBIG_4e_UI2(2); - TFBIG_4o_UI2(3); - TFBIG_4e_UI2(4); - TFBIG_4o_UI2(5); - TFBIG_4e_UI2(6); - TFBIG_4o_UI2(7); - TFBIG_4e_UI2(8); - TFBIG_4o_UI2(9); - TFBIG_4e_UI2(10); - TFBIG_4o_UI2(11); - TFBIG_4e_UI2(12); - TFBIG_4o_UI2(13); - TFBIG_4e_UI2(14); - TFBIG_4o_UI2(15); - TFBIG_4e_UI2(16); - TFBIG_4o_UI2(17); - TFBIG_ADDKEY_UI2(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); - - uint32_t *message = (uint32_t *)p; - - uint32_t W1[16]; - uint32_t W2[16]; - - uint32_t regs[8]; - uint32_t hash[8]; - - // Init with Hash-Table -#pragma unroll 8 - for (int k = 0; k < 8; k++) - { - hash[k] = regs[k] = sha256_hashTable[k]; - } - -#pragma unroll 16 - for (int k = 0; k<16; k++) - W1[k] = SWAB32(message[k]); - - // Progress W1 -#pragma unroll 16 - for (int j = 0; j<16; j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j] + W1[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - -#pragma unroll 7 - for (int k = 6; k >= 0; k--) regs[k + 1] = regs[k]; - regs[0] = T1 + T2; - regs[4] += T1; - } - - // Progress W2...W3 - - ////// PART 1 -#pragma unroll 2 - for (int j = 0; j<2; j++) - W2[j] = s1(W1[14 + j]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; -#pragma unroll 5 - for (int j = 2; j<7; j++) - W2[j] = s1(W2[j - 2]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; - -#pragma unroll 8 - for (int j = 7; j<15; j++) - W2[j] = s1(W2[j - 2]) + W2[j - 7] + s0(W1[1 + j]) + W1[j]; - - W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; - - // Round function -#pragma unroll 16 - for (int j = 0; j<16; j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 16] + W2[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - -#pragma unroll 7 - for (int l = 6; l >= 0; l--) regs[l + 1] = regs[l]; - regs[0] = T1 + T2; - regs[4] += T1; - } - - ////// PART 2 -#pragma unroll 2 - for (int j = 0; j<2; j++) - W1[j] = s1(W2[14 + j]) + W2[9 + j] + s0(W2[1 + j]) + W2[j]; - -#pragma unroll 5 - for (int j = 2; j<7; j++) - W1[j] = s1(W1[j - 2]) + W2[9 + j] + s0(W2[1 + j]) + W2[j]; - -#pragma unroll 8 - for (int j = 7; j<15; j++) - W1[j] = s1(W1[j - 2]) + W1[j - 7] + s0(W2[1 + j]) + W2[j]; - - W1[15] = s1(W1[13]) + W1[8] + s0(W1[0]) + W2[15]; - - // Round function -#pragma unroll 16 - for (int j = 0; j<16; j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 32] + W1[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - -#pragma unroll 7 - for (int l = 6; l >= 0; l--) regs[l + 1] = regs[l]; - regs[0] = T1 + T2; - regs[4] += T1; - } - - ////// PART 3 -#pragma unroll 2 - for (int j = 0; j<2; j++) - W2[j] = s1(W1[14 + j]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; - -#pragma unroll 5 - for (int j = 2; j<7; j++) - W2[j] = s1(W2[j - 2]) + W1[9 + j] + s0(W1[1 + j]) + W1[j]; - -#pragma unroll 8 - for (int j = 7; j<15; j++) - W2[j] = s1(W2[j - 2]) + W2[j - 7] + s0(W1[1 + j]) + W1[j]; - - W2[15] = s1(W2[13]) + W2[8] + s0(W2[0]) + W1[15]; - - // Round function -#pragma unroll 16 - for (int j = 0; j<16; j++) - { - uint32_t T1, T2; - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_constantTable[j + 48] + W2[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - -#pragma unroll 7 - for (int l = 6; l >= 0; l--) regs[l + 1] = regs[l]; - regs[0] = T1 + T2; - regs[4] += T1; - } - -#pragma unroll 8 - for (int k = 0; k<8; k++) - hash[k] += regs[k]; - - ///// - ///// Second Pass (ending) - ///// -#pragma unroll 8 - for (int k = 0; k<8; k++) - regs[k] = hash[k]; - - // Progress W1 - uint32_t T1, T2; -#pragma unroll 1 - for (int j = 0; j<56; j++)//62 - { - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6]) + sha256_endingTable[j]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - -#pragma unroll 7 - for (int k = 6; k >= 0; k--) - regs[k + 1] = regs[k]; - regs[0] = T1 + T2; - regs[4] += T1; - } - T1 = regs[7] + S1(regs[4]) + Ch(regs[4], regs[5], regs[6])+sha256_endingTable[56]; - T2 = S0(regs[0]) + Maj(regs[0], regs[1], regs[2]); - regs[7] = T1 + T2; - regs[3] += T1; - - T1 = regs[6] + S1(regs[3]) + Ch(regs[3], regs[4], regs[5]) + sha256_endingTable[57]; - T2 = S0(regs[7]) + Maj(regs[7], regs[0], regs[1]); - regs[6] = T1 + T2; - regs[2] += T1; - //************ - regs[1] += regs[5] + S1(regs[2]) + Ch(regs[2], regs[3], regs[4]) + sha256_endingTable[58]; - regs[0] += regs[4] + S1(regs[1]) + Ch(regs[1], regs[2], regs[3]) + sha256_endingTable[59]; - regs[7] += regs[3] + S1(regs[0]) + Ch(regs[0], regs[1], regs[2]) + sha256_endingTable[60]; - regs[6] += regs[2] + S1(regs[7]) + Ch(regs[7], regs[0], regs[1]) + sha256_endingTable[61]; - - uint64_t test = SWAB32(hash[7] + regs[7]); - test <<= 32; - test|= SWAB32(hash[6] + regs[6]); - if (test <= target) - { - uint32_t tmp = atomicCAS(d_found, 0xffffffff, startNounce + thread); - if (tmp != 0xffffffff) - d_found[1] = startNounce + thread; - } - } -} - -static uint64_t PaddedMessage[16]; - -__host__ -static void precalc() -{ - uint64_t h0, h1, h2, h3, h4, h5, h6, h7, h8; - uint64_t t0, t1, t2; - - h0 = 0x4903ADFF749C51CEull; - h1 = 0x0D95DE399746DF03ull; - h2 = 0x8FD1934127C79BCEull; - h3 = 0x9A255629FF352CB1ull; - h4 = 0x5DB62599DF6CA7B0ull; - h5 = 0xEABE394CA9D5C3F4ull; - h6 = 0x991112C71A75B523ull; - h7 = 0xAE18A40B660FCC33ull; - h8 = h0 ^ h1 ^ h2 ^ h3 ^ h4 ^ h5 ^ h6 ^ h7 ^ SPH_C64(0x1BD11BDAA9FC1A22); - - t0 = 64; // ptr - t1 = 0x7000000000000000ull; - t2 = 0x7000000000000040ull; - - uint64_t p[8]; - for (int i = 0; i<8; i++) - p[i] = PaddedMessage[i]; - - TFBIG_4e_PRE(0); - TFBIG_4o_PRE(1); - TFBIG_4e_PRE(2); - TFBIG_4o_PRE(3); - TFBIG_4e_PRE(4); - TFBIG_4o_PRE(5); - TFBIG_4e_PRE(6); - TFBIG_4o_PRE(7); - TFBIG_4e_PRE(8); - TFBIG_4o_PRE(9); - TFBIG_4e_PRE(10); - TFBIG_4o_PRE(11); - TFBIG_4e_PRE(12); - TFBIG_4o_PRE(13); - TFBIG_4e_PRE(14); - TFBIG_4o_PRE(15); - TFBIG_4e_PRE(16); - TFBIG_4o_PRE(17); - TFBIG_ADDKEY_PRE(p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7], h, t, 18); - - uint64_t buffer[9]; - - buffer[0] = PaddedMessage[0] ^ p[0]; - buffer[1] = PaddedMessage[1] ^ p[1]; - buffer[2] = PaddedMessage[2] ^ p[2]; - buffer[3] = PaddedMessage[3] ^ p[3]; - buffer[4] = PaddedMessage[4] ^ p[4]; - buffer[5] = PaddedMessage[5] ^ p[5]; - buffer[6] = PaddedMessage[6] ^ p[6]; - buffer[7] = PaddedMessage[7] ^ p[7]; - buffer[8] = t2; - CUDA_SAFE_CALL(cudaMemcpyToSymbol(precalcvalues, buffer, sizeof(buffer), 0, cudaMemcpyHostToDevice)); - - uint32_t endingTable[] = { - 0x80000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, - 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000200, - 0x80000000, 0x01400000, 0x00205000, 0x00005088, 0x22000800, 0x22550014, 0x05089742, 0xa0000020, - 0x5a880000, 0x005c9400, 0x0016d49d, 0xfa801f00, 0xd33225d0, 0x11675959, 0xf6e6bfda, 0xb30c1549, - 0x08b2b050, 0x9d7c4c27, 0x0ce2a393, 0x88e6e1ea, 0xa52b4335, 0x67a16f49, 0xd732016f, 0x4eeb2e91, - 0x5dbf55e5, 0x8eee2335, 0xe2bc5ec2, 0xa83f4394, 0x45ad78f7, 0x36f3d0cd, 0xd99c05e8, 0xb0511dc7, - 0x69bc7ac4, 0xbd11375b, 0xe3ba71e5, 0x3b209ff2, 0x18feee17, 0xe25ad9e7, 0x13375046, 0x0515089d, - 0x4f0d0f04, 0x2627484e, 0x310128d2, 0xc668b434, 0x420841cc, 0x62d311b8, 0xe59ba771, 0x85a7a484 - }; - - uint32_t constantTable[64] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 - }; - for (int i = 0; i < 64; i++) - { - endingTable[i] = constantTable[i] + endingTable[i]; - } - CUDA_SAFE_CALL(cudaMemcpyToSymbol(sha256_endingTable, endingTable, sizeof(uint32_t) * 64, 0, cudaMemcpyHostToDevice)); - -} - - - -__host__ -void skein512_cpu_setBlock_80(uint32_t thr_id, void *pdata) -{ - memcpy(&PaddedMessage[0], pdata, 80); - - CUDA_SAFE_CALL( - cudaMemcpyToSymbol(c_PaddedMessage16, &PaddedMessage[8], 16, 0, cudaMemcpyHostToDevice) - ); - CUDA_SAFE_CALL(cudaMalloc(&(d_found[thr_id]), 3 * sizeof(uint32_t))); - - precalc(); -} - -__host__ -void skein512_cpu_hash_80_52(int thr_id, uint32_t threads, uint32_t startNounce, int swapu,uint64_t target, uint32_t *h_found) -{ - dim3 grid((threads + 1024 - 1) / 1024); - dim3 block(1024); - cudaMemset(d_found[thr_id], 0xffffffff, 2 * sizeof(uint32_t)); - skein512_gpu_hash_80_52 << < grid, block >> > (threads, startNounce, d_found[thr_id], target); - cudaMemcpy(h_found, d_found[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost); -} -__host__ -void skein512_cpu_hash_80_50(int thr_id, uint32_t threads, uint32_t startNounce, int swapu, uint64_t target, uint32_t *h_found) -{ - dim3 grid((threads + 256 - 1) / 256); - dim3 block(256); - cudaMemset(d_found[thr_id], 0xffffffff, 2 * sizeof(uint32_t)); - skein512_gpu_hash_80_50 << < grid, block >> > (threads, startNounce, d_found[thr_id], target); - cudaMemcpy(h_found, d_found[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost); -} - -#endif