From a4196b341d74b5abeba8239da8c7125ed4fe0f82 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Fri, 8 Jul 2016 15:23:02 +0200 Subject: [PATCH] neoscrypt: apply last VTC improvements rewrote almost properly ;) --- README.txt | 5 +- configure.ac | 2 +- neoscrypt/cuda_neoscrypt.cu | 1697 ++++++++++++++++++++++++----------- neoscrypt/neoscrypt.cpp | 41 +- quark/quarkcoin.cu | 11 +- 5 files changed, 1212 insertions(+), 544 deletions(-) diff --git a/README.txt b/README.txt index e1bce2e..4f2bb58 100644 --- a/README.txt +++ b/README.txt @@ -240,11 +240,12 @@ features. >>> RELEASE HISTORY <<< - June 2016 v1.8.0 + July 2016 v1.8.0 Pascal support with cuda 8 x11evo algo (XRE) - Lyra2v2 and Decred hashrate improvements + Lyra2v2, Neoscrypt and Decred improvements Enhance windows NVAPI clock and power limits + Led support for mining/shares activity on windows May 18th 2016 v1.7.6 Decred vote support diff --git a/configure.ac b/configure.ac index 62486f5..d0cc35a 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [1.8-dev], [], [ccminer], [http://github.com/tpruvot/ccminer]) +AC_INIT([ccminer], [1.8], [], [ccminer], [http://github.com/tpruvot/ccminer]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/neoscrypt/cuda_neoscrypt.cu b/neoscrypt/cuda_neoscrypt.cu index e7f4b21..43279b1 100644 --- a/neoscrypt/cuda_neoscrypt.cu +++ b/neoscrypt/cuda_neoscrypt.cu @@ -1,16 +1,38 @@ +// originally from djm34 - github.com/djm34/ccminer-sp-neoscrypt + #include #include -#include "cuda_helper.h" -#include "cuda_vectors.h" /* NOT COMPATIBLE WITH SM 3.0 !!! */ +#include +#include +#include "cuda_vectors.h" + +#include "miner.h" + +#ifdef __INTELLISENSE__ +#define __CUDA_ARCH__ 500 +#define atomicExch(p,x) x +#endif + +static __thread cudaStream_t stream[2]; + +__device__ __align__(16) uint2x4* W; +__device__ __align__(16) uint2x4* W2; +__device__ __align__(16) uint2x4* Tr; +__device__ __align__(16) uint2x4* Tr2; +__device__ __align__(16) uint2x4* Input; +__device__ __align__(16) uint2x4* B2; -static uint32_t *d_buffer[MAX_GPUS]; static uint32_t *d_NNonce[MAX_GPUS]; -__constant__ uint4* W; + __constant__ uint32_t pTarget[8]; __constant__ uint32_t key_init[16]; __constant__ uint32_t input_init[16]; -__constant__ uint32_t c_data[80]; +__constant__ uint32_t c_data[64]; + +#define BLOCK_SIZE 64U +#define BLAKE2S_BLOCK_SIZE 64U +#define BLAKE2S_OUT_SIZE 32U /// constants /// @@ -37,26 +59,99 @@ static const uint32_t BLAKE2S_SIGMA_host[10][16] = { { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, }; -static __constant__ uint32_t BLAKE2S_SIGMA[10][16]; +__constant__ uint32_t BLAKE2S_SIGMA[10][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, +}; -#define FASTKDF_BUFFER_SIZE 256U +#define SALSA(a,b,c,d) { \ + t =a+d; t=rotateL(t, 7); b^=t; \ + t =b+a; t=rotateL(t, 9); c^=t; \ + t =c+b; t=rotateL(t, 13); d^=t; \ + t =d+c; t=rotateL(t, 18); a^=t; \ +} -// Blake2S +#define SALSA_CORE(state) { \ + SALSA(state.s0, state.s4, state.s8, 
state.sc); \ + SALSA(state.s5, state.s9, state.sd, state.s1); \ + SALSA(state.sa, state.se, state.s2, state.s6); \ + SALSA(state.sf, state.s3, state.s7, state.sb); \ + SALSA(state.s0, state.s1, state.s2, state.s3); \ + SALSA(state.s5, state.s6, state.s7, state.s4); \ + SALSA(state.sa, state.sb, state.s8, state.s9); \ + SALSA(state.sf, state.sc, state.sd, state.se); \ +} -#define BLAKE2S_BLOCK_SIZE 64U -#define BLAKE2S_OUT_SIZE 32U -#define BLAKE2S_KEY_SIZE 32U +__device__ __forceinline__ +static void shift256R4(uint32_t * ret, const uint8 &vec4, uint32_t shift2) +{ +#if __CUDA_ARCH__ >= 320 + uint32_t shift = 32 - shift2; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[0]) : "r"(0), "r"(vec4.s0), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[1]) : "r"(vec4.s0), "r"(vec4.s1), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[2]) : "r"(vec4.s1), "r"(vec4.s2), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[3]) : "r"(vec4.s2), "r"(vec4.s3), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[4]) : "r"(vec4.s3), "r"(vec4.s4), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[5]) : "r"(vec4.s4), "r"(vec4.s5), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[6]) : "r"(vec4.s5), "r"(vec4.s6), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[7]) : "r"(vec4.s6), "r"(vec4.s7), "r"(shift)); + asm("shr.b32 %0, %1, %2;" : "=r"(ret[8]) : "r"(vec4.s7), "r"(shift)); +#else + // to check + shift256R(ret, vec4, shift2); +#endif +} #if __CUDA_ARCH__ >= 500 -#define BLAKE_G(idx0, idx1, a, b, c, d, key) { \ - idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ + +#define CHACHA_STEP(a,b,c,d) { \ a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateL(b^c, 12); \ + a += b; d = __byte_perm(d^a, 0, 0x2103); \ + c += d; b = rotateL(b^c, 7); \ +} + +#else + +#define CHACHA_STEP(a,b,c,d) { \ + a += b; d = rotateL(d^a, 16); \ + c += d; b = rotateL(b^c, 12); \ + a += b; d = rotateL(d^a, 8); \ + c += d; b = rotateL(b^c, 7); \ +} + +#endif + +#define CHACHA_CORE_PARALLEL(state) { \ + CHACHA_STEP(state.lo.s0, state.lo.s4, state.hi.s0, state.hi.s4); \ + CHACHA_STEP(state.lo.s1, state.lo.s5, state.hi.s1, state.hi.s5); \ + CHACHA_STEP(state.lo.s2, state.lo.s6, state.hi.s2, state.hi.s6); \ + CHACHA_STEP(state.lo.s3, state.lo.s7, state.hi.s3, state.hi.s7); \ + CHACHA_STEP(state.lo.s0, state.lo.s5, state.hi.s2, state.hi.s7); \ + CHACHA_STEP(state.lo.s1, state.lo.s6, state.hi.s3, state.hi.s4); \ + CHACHA_STEP(state.lo.s2, state.lo.s7, state.hi.s0, state.hi.s5); \ + CHACHA_STEP(state.lo.s3, state.lo.s4, state.hi.s1, state.hi.s6); \ +} + +#if __CUDA_ARCH__ < 500 + +#define BLAKE(a, b, c, d, key1, key2) { \ + a += key1; \ + a += b; d = rotateL(d^a, 16); \ c += d; b = rotateR(b^c, 12); \ - idx = BLAKE2S_SIGMA[idx0][idx1+1]; a += key[idx]; \ - a += b; d = __byte_perm(d^a, 0, 0x0321); \ + a += key2; \ + a += b; d = rotateR(d^a, 8); \ c += d; b = rotateR(b^c, 7); \ } -#else + #define BLAKE_G(idx0, idx1, a, b, c, d, key) { \ idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ a += b; d = rotateL(d^a, 16); \ @@ -65,39 +160,41 @@ static __constant__ uint32_t BLAKE2S_SIGMA[10][16]; a += b; d = rotateR(d^a, 8); \ c += d; b = rotateR(b^c, 7); \ } -#endif -#if __CUDA_ARCH__ >= 500 #define BLAKE_G_PRE(idx0, idx1, a, b, c, d, key) { \ a += key[idx0]; \ - a += b; d = __byte_perm(d^a, 0, 0x1032); \ + a += b; d = rotateL(d^a, 16); \ c += d; b = rotateR(b^c, 12); \ a += key[idx1]; \ - a += b; d = __byte_perm(d^a, 0, 0x0321); \ + a += b; 
d = rotateR(d^a, 8); \ c += d; b = rotateR(b^c, 7); \ } -#else -#define BLAKE_G_PRE(idx0, idx1, a, b, c, d, key) { \ + +#define BLAKE_G_PRE0(idx0, idx1, a, b, c, d, key) { \ + a += b; d = rotateL(d^a, 16); \ + c += d; b = rotateR(b^c, 12); \ + a += b; d = rotateR(d^a, 8); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE1(idx0, idx1, a, b, c, d, key) { \ a += key[idx0]; \ a += b; d = rotateL(d^a, 16); \ c += d; b = rotateR(b^c, 12); \ - a += key[idx1]; \ a += b; d = rotateR(d^a, 8); \ c += d; b = rotateR(b^c, 7); \ } -#endif -#define BLAKE_Ghost(idx0, idx1, a, b, c, d, key) { \ - idx = BLAKE2S_SIGMA_host[idx0][idx1]; a += key[idx]; \ - a += b; d = ROTR32(d^a,16); \ - c += d; b = ROTR32(b^c, 12); \ - idx = BLAKE2S_SIGMA_host[idx0][idx1+1]; a += key[idx]; \ - a += b; d = ROTR32(d^a,8); \ - c += d; b = ROTR32(b^c, 7); \ +#define BLAKE_G_PRE2(idx0, idx1, a, b, c, d, key) { \ + a += b; d = rotateL(d^a, 16); \ + c += d; b = rotateR(b^c, 12); \ + a += key[idx1]; \ + a += b; d = rotateR(d^a, 8); \ + c += d; b = rotateR(b^c, 7); \ } static __forceinline__ __device__ -void Blake2S(uint32_t * inout, const uint32_t * TheKey) +void Blake2S(uint32_t *out, const uint32_t* const __restrict__ inout, const uint32_t * const __restrict__ TheKey) { uint16 V; uint32_t idx; @@ -112,122 +209,98 @@ void Blake2S(uint32_t * inout, const uint32_t * TheKey) V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; -#if 0 - for (int x = 0; x < 10; ++x) - { - BLAKE_G(x, 0x0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G(x, 0x2, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G(x, 0x4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G(x, 0x6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G(x, 0x8, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G(x, 0xA, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G(x, 0xC, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G(x, 0xE, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - } -#else - // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - BLAKE_G_PRE(0x0, 0x1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x2, 0x3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0x4, 0x5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0x6, 0x7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0x8, 0x9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0xA, 0xB, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0xC, 0xD, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0xE, 0xF, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - BLAKE_G_PRE(0xE, 0xA, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x4, 0x8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0x9, 0xF, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0xD, 0x6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0x1, 0xC, V.lo.s0, V.lo.s5, V.hi.s2, 
V.hi.s7, TheKey); - BLAKE_G_PRE(0x0, 0x2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0xB, 0x7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0x5, 0x3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE0(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - BLAKE_G_PRE(0xB, 0x8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0xC, 0x0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0x5, 0x2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0xF, 0xD, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0xA, 0xE, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0x3, 0x6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0x7, 0x1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0x9, 0x4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE0(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - BLAKE_G_PRE(0x7, 0x9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x3, 0x1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0xD, 0xC, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0xB, 0xE, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0x2, 0x6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0x5, 0xA, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0x4, 0x0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0xF, 0x8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE1(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE1(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - BLAKE_G_PRE(0x9, 0x0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x5, 0x7, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0x2, 0x4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0xA, 0xF, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0xE, 0x1, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0xB, 0xC, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0x6, 0x8, 
V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0x3, 0xD, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE2(9, 0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(5, 7, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(10, 15, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(14, 1, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(11, 12, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 8, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(3, 13, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - BLAKE_G_PRE(0x2, 0xC, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x6, 0xA, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0x0, 0xB, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0x8, 0x3, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0x4, 0xD, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0x7, 0x5, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0xF, 0xE, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0x1, 0x9, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE1(2, 12, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 10, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE1(0, 11, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(8, 3, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(4, 13, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(7, 5, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(15, 14, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(1, 9, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - BLAKE_G_PRE(0xC, 0x5, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x1, 0xF, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0xE, 0xD, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0x4, 0xA, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0x0, 0x7, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0x6, 0x3, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0x9, 0x2, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0x8, 0xB, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE2(12, 5, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(1, 15, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 13, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(4, 10, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 7, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(6, 3, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(9, 2, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(8, 11, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - BLAKE_G_PRE(0xD, 0xB, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x7, 0xE, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0xC, 0x1, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0x3, 0x9, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0x5, 0x0, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0xF, 0x4, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0x8, 0x6, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0x2, 0xA, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, 
TheKey); - + BLAKE_G_PRE0(13, 11, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(7, 14, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(12, 1, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(3, 9, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(5, 0, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(15, 4, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 6, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 10, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - BLAKE_G_PRE(0x6, 0xF, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0xE, 0x9, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0xB, 0x3, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0x0, 0x8, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0xC, 0x2, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0xD, 0x7, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0x1, 0x4, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0xA, 0x5, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE1(6, 15, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE0(14, 9, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(11, 3, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(0, 8, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(12, 2, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(13, 7, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(1, 4, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(10, 5, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, - BLAKE_G_PRE(0xA, 0x2, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x8, 0x4, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0x7, 0x6, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0x1, 0x5, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0xF, 0xB, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0x9, 0xE, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0x3, 0xC, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0xD, 0x0, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); -#endif + BLAKE_G_PRE2(10, 2, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 4, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(7, 6, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(1, 5, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(15, 11, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(9, 14, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(3, 12, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(13, 0, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - V.lo ^= V.hi; - V.lo ^= tmpblock; + V.lo ^= V.hi ^ tmpblock; V.hi = BLAKE2S_IV_Vec; tmpblock = V.lo; @@ -235,86 +308,121 @@ void Blake2S(uint32_t * inout, const uint32_t * TheKey) V.hi.s4 ^= 128; V.hi.s6 = ~V.hi.s6; -#if 0 - for (int x = 0; x < 10; ++x) - { - BLAKE_G(x, 0x0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); - BLAKE_G(x, 0x2, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); - BLAKE_G(x, 0x4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); - BLAKE_G(x, 0x6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); - BLAKE_G(x, 0x8, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); - BLAKE_G(x, 0xA, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); - BLAKE_G(x, 0xC, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); - BLAKE_G(x, 
0xE, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); - } -#else - // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - BLAKE_G_PRE(0x0, 0x1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); - BLAKE_G_PRE(0x2, 0x3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); - BLAKE_G_PRE(0x4, 0x5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); - BLAKE_G_PRE(0x6, 0x7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); - BLAKE_G_PRE(0x8, 0x9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); - BLAKE_G_PRE(0xA, 0xB, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); - BLAKE_G_PRE(0xC, 0xD, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); - BLAKE_G_PRE(0xE, 0xF, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); - + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - BLAKE_G_PRE(0xE, 0xA, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); - BLAKE_G_PRE(0x4, 0x8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); - BLAKE_G_PRE(0x9, 0xF, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); - BLAKE_G_PRE(0xD, 0x6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); - BLAKE_G_PRE(0x1, 0xC, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); - BLAKE_G_PRE(0x0, 0x2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); - BLAKE_G_PRE(0xB, 0x7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); - BLAKE_G_PRE(0x5, 0x3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); - + BLAKE_G_PRE(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - BLAKE_G_PRE(0xB, 0x8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); - BLAKE_G_PRE(0xC, 0x0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); - BLAKE_G_PRE(0x5, 0x2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); - BLAKE_G_PRE(0xF, 0xD, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); - BLAKE_G_PRE(0xA, 0xE, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); - BLAKE_G_PRE(0x3, 0x6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); - BLAKE_G_PRE(0x7, 0x1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); - BLAKE_G_PRE(0x9, 0x4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); - + BLAKE_G_PRE(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 
15, 8 }, - BLAKE_G_PRE(0x7, 0x9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); - BLAKE_G_PRE(0x3, 0x1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); - BLAKE_G_PRE(0xD, 0xC, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); - BLAKE_G_PRE(0xB, 0xE, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); - BLAKE_G_PRE(0x2, 0x6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); - BLAKE_G_PRE(0x5, 0xA, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); - BLAKE_G_PRE(0x4, 0x0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); - BLAKE_G_PRE(0xF, 0x8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); - - for (int x = 4; x < 10; ++x) + BLAKE_G_PRE(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + + for(int x = 4; x < 10; ++x) { - BLAKE_G(x, 0x0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); - BLAKE_G(x, 0x2, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); - BLAKE_G(x, 0x4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); - BLAKE_G(x, 0x6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); - BLAKE_G(x, 0x8, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); - BLAKE_G(x, 0xA, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); - BLAKE_G(x, 0xC, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); - BLAKE_G(x, 0xE, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + BLAKE_G(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); } -#endif V.lo ^= V.hi ^ tmpblock; - ((uint8*)inout)[0]=V.lo; + ((uint8*)out)[0] = V.lo; +} +#endif + +#if __CUDA_ARCH__ >= 500 + +#define BLAKE_G(idx0, idx1, a, b, c, d, key) { \ + idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + idx = BLAKE2S_SIGMA[idx0][idx1+1]; a += key[idx]; \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE(a, b, c, d, key1,key2) { \ + a += key1; \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += key2; \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE(idx0,idx1, a, b, c, d, key) { \ + a += key[idx0]; \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += key[idx1]; \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ } -static __forceinline__ __host__ -void Blake2Shost(uint32_t * inout, const uint32_t * inkey) +#define BLAKE_G_PRE0(idx0,idx1, a, b, c, d, key) { \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE1(idx0,idx1, a, b, c, d, key) { \ + a += key[idx0]; \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += b; d = 
__byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE2(idx0,idx1, a, b, c, d, key) { \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += key[idx1]; \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +static __forceinline__ __device__ +void Blake2S_v2(uint32_t *out, const uint32_t* __restrict__ inout, const uint32_t * __restrict__ TheKey) { uint16 V; - uint32_t idx; uint8 tmpblock; - V.hi = BLAKE2S_IV_Vechost; - V.lo = BLAKE2S_IV_Vechost; + V.hi = BLAKE2S_IV_Vec; + V.lo = BLAKE2S_IV_Vec; V.lo.s0 ^= 0x01012020; // Copy input block for later @@ -322,469 +430,1020 @@ void Blake2Shost(uint32_t * inout, const uint32_t * inkey) V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; - for (int x = 0; x < 10; ++x) - { - BLAKE_Ghost(x, 0x0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inkey); - BLAKE_Ghost(x, 0x2, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inkey); - BLAKE_Ghost(x, 0x4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inkey); - BLAKE_Ghost(x, 0x6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inkey); - BLAKE_Ghost(x, 0x8, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inkey); - BLAKE_Ghost(x, 0xA, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inkey); - BLAKE_Ghost(x, 0xC, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inkey); - BLAKE_Ghost(x, 0xE, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inkey); - } + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + BLAKE_G_PRE0(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + BLAKE_G_PRE0(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + BLAKE_G_PRE1(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE1(5, 10, V.lo.s1, V.lo.s6, 
V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + BLAKE_G_PRE2(9, 0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(5, 7, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(10, 15, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(14, 1, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(11, 12, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 8, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(3, 13, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + BLAKE_G_PRE1(2, 12, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 10, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE1(0, 11, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(8, 3, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(4, 13, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(7, 5, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(15, 14, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(1, 9, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + BLAKE_G_PRE2(12, 5, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(1, 15, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 13, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(4, 10, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 7, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(6, 3, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(9, 2, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(8, 11, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + BLAKE_G_PRE0(13, 11, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(7, 14, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(12, 1, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(3, 9, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(5, 0, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(15, 4, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 6, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 10, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + BLAKE_G_PRE1(6, 15, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE0(14, 9, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(11, 3, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(0, 8, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(12, 2, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(13, 7, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(1, 4, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(10, 5, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + BLAKE_G_PRE2(10, 2, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 4, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(7, 6, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(1, 5, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(15, 11, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(9, 14, 
V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(3, 12, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(13, 0, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); V.lo ^= V.hi; V.lo ^= tmpblock; - V.hi = BLAKE2S_IV_Vechost; + V.hi = BLAKE2S_IV_Vec; tmpblock = V.lo; V.hi.s4 ^= 128; V.hi.s6 = ~V.hi.s6; - for (int x = 0; x < 10; ++x) - { - BLAKE_Ghost(x, 0x0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); - BLAKE_Ghost(x, 0x2, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); - BLAKE_Ghost(x, 0x4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); - BLAKE_Ghost(x, 0x6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); - BLAKE_Ghost(x, 0x8, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); - BLAKE_Ghost(x, 0xA, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); - BLAKE_Ghost(x, 0xC, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); - BLAKE_Ghost(x, 0xE, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + BLAKE_G_PRE(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + BLAKE_G_PRE(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + BLAKE_G_PRE(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[9], inout[0]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[5], inout[7]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[2], inout[4]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[10], inout[15]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[14], inout[1]); + BLAKE(V.lo.s1, 
V.lo.s6, V.hi.s3, V.hi.s4, inout[11], inout[12]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[6], inout[8]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[3], inout[13]); + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[2], inout[12]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[6], inout[10]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[0], inout[11]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[8], inout[3]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[4], inout[13]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[7], inout[5]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[15], inout[14]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[1], inout[9]); + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[12], inout[5]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[1], inout[15]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[14], inout[13]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[4], inout[10]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[0], inout[7]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[6], inout[3]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[9], inout[2]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[8], inout[11]); + // 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10, + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[13], inout[11]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[7], inout[14]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[12], inout[1]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[3], inout[9]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[5], inout[0]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[15], inout[4]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[8], inout[6]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[2], inout[10]); + // 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5, + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[6], inout[15]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[14], inout[9]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[11], inout[3]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[0], inout[8]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[12], inout[2]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[13], inout[7]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[1], inout[4]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[10], inout[5]); + // 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0, + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[10], inout[2]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[8], inout[4]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[7], inout[6]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[1], inout[5]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[15], inout[11]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[9], inout[14]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[3], inout[12]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[13], inout[0]); + + V.lo ^= V.hi; + V.lo ^= tmpblock; + + ((uint8*)out)[0] = V.lo; +} +#endif /* __CUDA_ARCH__ >= 500 */ + + +static __forceinline__ __device__ +uint16 salsa_small_scalar_rnd(const uint16 &X) +{ + uint16 state = X; + uint32_t t; + + #pragma unroll 1 + for(int i = 0; i < 10; i++) { + SALSA_CORE(state); } - V.lo ^= V.hi ^ tmpblock; + return(X + state); +} - ((uint8*)inout)[0] = V.lo; +static __device__ __forceinline__ +uint16 chacha_small_parallel_rnd(const uint16 &X) +{ + uint16 st = X; + + #pragma nounroll + for(int i = 0; i < 10; i++) { + 
CHACHA_CORE_PARALLEL(st); + } + return(X + st); } +static __device__ __forceinline__ +void neoscrypt_chacha(uint16 *XV) +{ + uint16 temp; + + XV[0] = chacha_small_parallel_rnd(XV[0] ^ XV[3]); + temp = chacha_small_parallel_rnd(XV[1] ^ XV[0]); + XV[1] = chacha_small_parallel_rnd(XV[2] ^ temp); + XV[3] = chacha_small_parallel_rnd(XV[3] ^ XV[1]); + XV[2] = temp; +} + +static __device__ __forceinline__ +void neoscrypt_salsa(uint16 *XV) +{ + uint16 temp; + + XV[0] = salsa_small_scalar_rnd(XV[0] ^ XV[3]); + temp = salsa_small_scalar_rnd(XV[1] ^ XV[0]); + XV[1] = salsa_small_scalar_rnd(XV[2] ^ temp); + XV[3] = salsa_small_scalar_rnd(XV[3] ^ XV[1]); + XV[2] = temp; +} + + +#if __CUDA_ARCH__ < 500 static __forceinline__ __device__ -void fastkdf256(const uint32_t* password, uint8_t* output) +void fastkdf256_v1(int thread, const uint32_t nonce, const uint32_t * const __restrict__ s_data) { - uint8_t bufidx = 0; + uint2x4 output[8]; + uint8_t bufidx; uchar4 bufhelper; - uint8_t A[320],B[288]; - - ((uintx64*)A)[0] = ((uintx64*)password)[0]; - ((uint816 *)A)[4] = ((uint816 *)password)[0]; + uint32_t B[64]; + uint32_t qbuf, rbuf, bitbuf; + uint32_t input[BLAKE2S_BLOCK_SIZE / 4]; + uint32_t key[BLAKE2S_BLOCK_SIZE / 4] = {0}; - ((uintx64*)B)[0] = ((uintx64*)password)[0]; - ((uint48 *)B)[8] = ((uint48 *)password)[0]; + const uint32_t data18 = s_data[18]; + const uint32_t data20 = s_data[0]; - uint32_t input[BLAKE2S_BLOCK_SIZE/4]; uint32_t key[BLAKE2S_BLOCK_SIZE / 4] = { 0 }; + ((uintx64*)(B))[0] = ((uintx64*)s_data)[0]; + ((uint32_t*)B)[19] = nonce; + ((uint32_t*)B)[39] = nonce; + ((uint32_t*)B)[59] = nonce; ((uint816*)input)[0] = ((uint816*)input_init)[0]; ((uint48*)key)[0] = ((uint48*)key_init)[0]; - for (int i = 0; i < 32; ++i) + #pragma unroll 1 + for(int i = 0; i < 31; i++) { bufhelper = ((uchar4*)input)[0]; - for (int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + { bufhelper += ((uchar4*)input)[x]; + } bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; - int qbuf = bufidx/4; - int rbuf = bufidx&3; - int bitbuf = rbuf << 3; + qbuf = bufidx / 4; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + uint32_t shifted[9]; - shift256R(shifted, ((uint8*)input)[0], bitbuf); + shift256R4(shifted, ((uint8*)input)[0], bitbuf); - for (int k = 0; k < 9; ++k) { - ((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k]; + //#pragma unroll + uint32_t temp[9]; + + for(int k = 0; k < 9; ++k) + { + uint32_t indice = (k + qbuf) & 0x3f; + temp[k] = B[indice] ^ shifted[k]; + B[indice] = temp[k]; + } +#if __CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__) + uint32_t a = s_data[qbuf & 0x3f], b; + //#pragma unroll + for(int k = 0; k<16; k+=2) + { + b = s_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = s_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); } - if (bufidx < BLAKE2S_KEY_SIZE) {((uint8*)B)[8] = ((uint8*)B)[0];} - else if (bufidx > FASTKDF_BUFFER_SIZE-BLAKE2S_OUT_SIZE) {((uint8*)B)[0] = ((uint8*)B)[8];} - - if (i<31) { - for (int k = 0; k = 320 + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(((uint32_t*)output)[i]) : "r"(B[(qbuf + i) & 0x3f]), "r"(B[(qbuf + i + 1) & 0x3f4]), "r"(bitbuf)); +#endif + ((ulonglong4*)output)[0] ^= ((ulonglong4*)input)[0]; + + ((uintx64*)output)[0] ^= ((uintx64*)s_data)[0]; + ((uint32_t*)output)[19] ^= nonce; + ((uint32_t*)output)[39] ^= nonce; + ((uint32_t*)output)[59] ^= nonce; + + for(int i = 0; i<8; 
i++) + (Input + 8 * thread)[i] = output[i]; +} +#endif + +#if __CUDA_ARCH__ >= 500 +static __forceinline__ __device__ +void fastkdf256_v2(int thread, const uint32_t nonce, const uint32_t* const __restrict__ s_data) //, uint2x4 * output) +{ + uint2x4 output[8]; + uint8_t bufidx; + uchar4 bufhelper; + const uint32_t data18 = s_data[18]; + const uint32_t data20 = s_data[0]; + uint32_t input[16]; + uint32_t key[16] = {0}; + uint32_t qbuf, rbuf, bitbuf; + +#define Bshift 16*thread + + uint32_t *const B = (uint32_t*)&B2[Bshift]; + ((uintx64*)(B))[0] = ((uintx64*)s_data)[0]; + + B[19] = nonce; + B[39] = nonce; + B[59] = nonce; + + ((ulonglong4*)input)[0] = ((ulonglong4*)input_init)[0]; + ((uint2x4*)key)[0] = ((uint2x4*)key_init)[0]; + + #pragma unroll 1 + for(int i = 0; i < 31; i++) + { + bufhelper = ((uchar4*)input)[0]; + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + bufhelper += ((uchar4*)input)[x]; + } + bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; + + qbuf = bufidx / 4; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + uint32_t shifted[9]; + + shift256R4(shifted, ((uint8*)input)[0], bitbuf); + + uint32_t temp[9]; + + for(int k = 0; k < 9; ++k) + temp[k] = __ldg(&B[(k + qbuf) & 0x3f]) ^ shifted[k]; + + uint32_t a = s_data[qbuf & 0x3f], b; + + //#pragma unroll + for(int k = 0; k<16; k+=2) + { + b = s_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = s_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19 - qbuf % 20; + if(noncepos <= 16 && qbuf<60) + { + if(noncepos != 0) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if(noncepos != 16) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + + Blake2S_v2(input, input, key); + + for(int k = 0; k < 9; k++) + B[(k + qbuf) & 0x3f] = temp[k]; } - for (int i = qleft*4+rleft; i < (qleft+1)*4; ++i) { - ((uint8_t *)output)[i] = ((uint8_t *)B)[i - left] ^ ((uint8_t *)A)[i]; + + bufhelper = ((uchar4*)input)[0]; + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + bufhelper += ((uchar4*)input)[x]; } - for (int i = qleft+1; i < FASTKDF_BUFFER_SIZE/4; ++i) { - ((uchar4 *)output)[i] = make_uchar4(B[4*i - left],B[4*i+1-left], - B[4*i+2-left],B[4*i+3-left]) ^ ((uchar4 *)A)[i]; + bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; + + qbuf = bufidx / 4; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + for(int i = 0; i<64; i++) + { + const uint32_t a = (qbuf + i) & 0x3f, b = (qbuf + i + 1) & 0x3f; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : 
"=r"(((uint32_t*)output)[i]) : "r"(__ldg(&B[a])), "r"(__ldg(&B[b])), "r"(bitbuf)); } + + output[0] ^= ((uint2x4*)input)[0]; + for(int i = 0; i<8; i++) + output[i] ^= ((uint2x4*)s_data)[i]; + + ((uint32_t*)output)[19] ^= nonce; + ((uint32_t*)output)[39] ^= nonce; + ((uint32_t*)output)[59] ^= nonce;; + ((ulonglong16 *)(Input + 8 * thread))[0] = ((ulonglong16*)output)[0]; } +#endif +#if __CUDA_ARCH__ < 500 static __forceinline__ __device__ -void fastkdf32(const uint32_t * password, const uint32_t * salt, uint32_t * output) +void fastkdf32_v1(int thread, const uint32_t nonce, const uint32_t * const __restrict__ salt, const uint32_t *const __restrict__ s_data, uint32_t &output) { - uint8_t bufidx = 0; + uint8_t bufidx; uchar4 bufhelper; + uint32_t temp[9]; - uint8_t A[320]; - uint8_t B[288]; - - // Initialize the password buffer - ((uintx64*)A)[0] = ((uintx64*)password)[0]; - ((uint816*)A)[4] = ((uint816*)password)[0]; - ((uintx64*)B)[0] = ((uintx64*)salt)[0]; - ((uintx64*)B)[1] = ((uintx64*)salt)[0]; +#define Bshift 16*thread - uint32_t input[BLAKE2S_BLOCK_SIZE/4]; - uint32_t key[BLAKE2S_BLOCK_SIZE/4] = { 0 }; + uint32_t* const B0 = (uint32_t*)&B2[Bshift]; + const uint32_t cdata7 = s_data[7]; + const uint32_t data18 = s_data[18]; + const uint32_t data20 = s_data[0]; - ((uint816*)input)[0] = ((uint816*)password)[0]; + ((uintx64*)B0)[0] = ((uintx64*)salt)[0]; + uint32_t input[BLAKE2S_BLOCK_SIZE / 4]; uint32_t key[BLAKE2S_BLOCK_SIZE / 4] = {0}; + ((uint816*)input)[0] = ((uint816*)s_data)[0]; ((uint48*)key)[0] = ((uint48*)salt)[0]; + uint32_t qbuf, rbuf, bitbuf; - for (int i = 0; i < 32; ++i) + #pragma nounroll + for(int i = 0; i < 31; i++) { - Blake2S((uint32_t*)input, key); + Blake2S(input, input, key); bufidx = 0; bufhelper = ((uchar4*)input)[0]; - - for (int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + { bufhelper += ((uchar4*)input)[x]; - + } bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; - int qbuf = bufidx / 4; - int rbuf = bufidx & 3; - int bitbuf = rbuf << 3; + qbuf = bufidx / 4; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; uint32_t shifted[9]; - shift256R(shifted, ((uint8*)input)[0], bitbuf); + shift256R4(shifted, ((uint8*)input)[0], bitbuf); - for (int k = 0; k < 9; ++k) { - ((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k]; + for(int k = 0; k < 9; k++) { + temp[k] = B0[(k + qbuf) & 0x3f]; } - if (i<31) { - if (bufidx < BLAKE2S_KEY_SIZE) {((uint8*)B)[8] = ((uint8*)B)[0];} - else if (bufidx > FASTKDF_BUFFER_SIZE - BLAKE2S_OUT_SIZE) {((uint8*)B)[0] = ((uint8*)B)[8];} - - for (uint8_t k = 0; k < BLAKE2S_BLOCK_SIZE/4; k++) { - ((uchar4*)(input))[k] = make_uchar4( - (A + bufidx)[4 * k], (A + bufidx)[4 * k + 1], - (A + bufidx)[4 * k + 2], (A + bufidx)[4 * k + 3] - ); - } - for (uint8_t k = 0; k < BLAKE2S_KEY_SIZE / 4; k++) { - ((uchar4*)(key))[k] = make_uchar4( - (B + bufidx)[4 * k], (B + bufidx)[4 * k + 1], - (B + bufidx)[4 * k + 2], (B + bufidx)[4 * k + 3] - ); - } + ((uint2x4*)temp)[0] ^= ((uint2x4*)shifted)[0]; + temp[8] ^= shifted[8]; + +#if __CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__) + uint32_t a = s_data[qbuf & 0x3f], b; + //#pragma unroll + for(int k = 0; k<16; k+=2) + { + b = s_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = s_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); } - } - uchar4 unfucked[1]; - unfucked[0] = make_uchar4(B[28 + bufidx], B[29 + bufidx],B[30 + 
bufidx], B[31 + bufidx]); - ((uint32_t*)output)[7] = ((uint32_t*)unfucked)[0] ^ ((uint32_t*)A)[7]; -} + const uint32_t noncepos = 19 - qbuf % 20; + if(noncepos <= 16 && qbuf < 60) + { + if(noncepos != 0) asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if(noncepos != 16) asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); +#else +//#error SM 3.0 code missing here +#endif + for(int k = 0; k < 9; k++) { + B0[(k + qbuf) & 0x3f] = temp[k]; + } + } -#define SALSA(a,b,c,d) { \ - t =a+d; b^=rotateL(t, 7); \ - t =b+a; c^=rotateL(t, 9); \ - t =c+b; d^=rotateL(t, 13); \ - t =d+c; a^=rotateL(t, 18); \ -} + Blake2S(input, input, key); -#define SALSA_CORE(state) { \ - SALSA(state.s0,state.s4,state.s8,state.sc); \ - SALSA(state.s5,state.s9,state.sd,state.s1); \ - SALSA(state.sa,state.se,state.s2,state.s6); \ - SALSA(state.sf,state.s3,state.s7,state.sb); \ - SALSA(state.s0,state.s1,state.s2,state.s3); \ - SALSA(state.s5,state.s6,state.s7,state.s4); \ - SALSA(state.sa,state.sb,state.s8,state.s9); \ - SALSA(state.sf,state.sc,state.sd,state.se); \ -} + bufidx = 0; + bufhelper = ((uchar4*)input)[0]; + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) { + bufhelper += ((uchar4*)input)[x]; + } -#if __CUDA_ARCH__ >=500 -#define CHACHA_STEP(a,b,c,d) { \ - a += b; d = __byte_perm(d^a,0,0x1032); \ - c += d; b = rotateL(b^c, 12); \ - a += b; d = __byte_perm(d^a,0,0x2103); \ - c += d; b = rotateL(b^c, 7); \ -} + bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; + qbuf = bufidx / 4; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + for(int k = 7; k < 9; k++) { + temp[k] = B0[(k + qbuf) & 0x3f]; + } +#if __CUDA_ARCH__ >= 320 + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(output) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); #else -#define CHACHA_STEP(a,b,c,d) { \ - a += b; d = rotateL(d^a,16); \ - c += d; b = rotateL(b^c, 12); \ - a += b; d = rotateL(d^a,8); \ - c += d; b = rotateL(b^c, 7); \ + output = (MAKE_ULONGLONG(temp[7], temp[8]) >> bitbuf); // to check maybe 7/8 reversed +#endif + output ^= input[7] ^ cdata7; } #endif -#define CHACHA_CORE_PARALLEL(state) { \ - CHACHA_STEP(state.lo.s0, state.lo.s4, state.hi.s0, state.hi.s4); \ - CHACHA_STEP(state.lo.s1, state.lo.s5, state.hi.s1, state.hi.s5); \ - CHACHA_STEP(state.lo.s2, state.lo.s6, state.hi.s2, state.hi.s6); \ - CHACHA_STEP(state.lo.s3, state.lo.s7, state.hi.s3, state.hi.s7); \ - CHACHA_STEP(state.lo.s0, state.lo.s5, state.hi.s2, state.hi.s7); \ - CHACHA_STEP(state.lo.s1, state.lo.s6, state.hi.s3, state.hi.s4); \ - CHACHA_STEP(state.lo.s2, state.lo.s7, state.hi.s0, state.hi.s5); \ - CHACHA_STEP(state.lo.s3, state.lo.s4, state.hi.s1, state.hi.s6); \ -} +#if 
__CUDA_ARCH__ >= 500 +static __forceinline__ __device__ +void fastkdf32_v3(int thread, const uint32_t nonce, const uint32_t* __restrict__ salt, const uint32_t* __restrict__ s_data, uint32_t &output) +{ + uint32_t temp[9]; + uint8_t bufidx; + uchar4 bufhelper; +#define Bshift 16*thread -static __forceinline__ __device__ uint16 salsa_small_scalar_rnd(const uint16 &X) -{ - uint16 state = X; - uint32_t t; + uint32_t*const B0 = (uint32_t*)&B2[Bshift]; + const uint32_t cdata7 = s_data[7]; + const uint32_t data18 = s_data[18]; + const uint32_t data20 = s_data[0]; - for (int i = 0; i < 10; ++i) { SALSA_CORE(state);} + ((uintx64*)B0)[0] = ((uintx64*)salt)[0]; + uint32_t input[BLAKE2S_BLOCK_SIZE / 4]; uint32_t key[BLAKE2S_BLOCK_SIZE / 4] = {0}; + ((uint816*)input)[0] = ((uint816*)s_data)[0]; + ((uint48*)key)[0] = ((uint48*)salt)[0]; + uint32_t qbuf, rbuf, bitbuf; - return(X + state); + #pragma nounroll + for(int i = 0; i < 31; i++) + { + Blake2S_v2(input, input, key); + + bufidx = 0; + bufhelper = ((uchar4*)input)[0]; + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + bufhelper += ((uchar4*)input)[x]; + } + bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; + qbuf = bufidx / 4; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + uint32_t shifted[9]; + + shift256R4(shifted, ((uint8*)input)[0], bitbuf); + + for(int k = 0; k < 9; k++) + { + temp[k] = __ldg(&B0[(k + qbuf) & 0x3f]); + } + + ((uint2x4*)temp)[0] ^= ((uint2x4*)shifted)[0]; + temp[8] ^= shifted[8]; + + uint32_t a = s_data[qbuf & 0x3f], b; + //#pragma unroll + for(int k = 0; k<16; k+=2) + { + b = s_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = s_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19 - qbuf % 20; + if(noncepos <= 16 && qbuf<60) + { + if(noncepos != 0) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if(noncepos != 16) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + + for(int k = 0; k < 9; k++) + { + B0[(k + qbuf) & 0x3f] = temp[k]; + } + } + + Blake2S_v2(input, input, key); + + bufidx = 0; + bufhelper = ((uchar4*)input)[0]; + for(int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + bufhelper += ((uchar4*)input)[x]; + } + bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; + qbuf = bufidx / 4; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + temp[7] = __ldg(&B0[(qbuf + 7) & 0x3f]); + temp[8] = __ldg(&B0[(qbuf + 8) & 0x3f]); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(output) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + output ^= 
input[7] ^ cdata7; } +#endif -static __device__ __forceinline__ uint16 chacha_small_parallel_rnd(const uint16 &X) -{ - uint16 st = X; - for (int i = 0; i < 10; ++i) {CHACHA_CORE_PARALLEL(st);} - return(X + st); +#define BLAKE_Ghost(idx0, idx1, a, b, c, d, key) { \ + idx = BLAKE2S_SIGMA_host[idx0][idx1]; a += key[idx]; \ + a += b; d = ROTR32(d^a,16); \ + c += d; b = ROTR32(b^c, 12); \ + idx = BLAKE2S_SIGMA_host[idx0][idx1+1]; a += key[idx]; \ + a += b; d = ROTR32(d^a,8); \ + c += d; b = ROTR32(b^c, 7); \ } -static __device__ __forceinline__ void neoscrypt_chacha(uint16 *XV) +static void Blake2Shost(uint32_t * inout, const uint32_t * inkey) { - XV[0] ^= XV[3]; - uint16 temp; + uint16 V; + uint32_t idx; + uint8 tmpblock; - XV[0] = chacha_small_parallel_rnd(XV[0]); XV[1] ^= XV[0]; - temp = chacha_small_parallel_rnd(XV[1]); XV[2] ^= temp; - XV[1] = chacha_small_parallel_rnd(XV[2]); XV[3] ^= XV[1]; - XV[3] = chacha_small_parallel_rnd(XV[3]); - XV[2] = temp; + V.hi = BLAKE2S_IV_Vechost; + V.lo = BLAKE2S_IV_Vechost; + V.lo.s0 ^= 0x01012020; + + // Copy input block for later + tmpblock = V.lo; + + V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; + + for(int x = 0; x < 10; ++x) + { + BLAKE_Ghost(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inkey); + BLAKE_Ghost(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inkey); + BLAKE_Ghost(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inkey); + BLAKE_Ghost(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inkey); + BLAKE_Ghost(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inkey); + BLAKE_Ghost(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inkey); + BLAKE_Ghost(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inkey); + BLAKE_Ghost(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inkey); + } + + V.lo ^= V.hi; + V.lo ^= tmpblock; + + V.hi = BLAKE2S_IV_Vechost; + tmpblock = V.lo; + + V.hi.s4 ^= 128; + V.hi.s6 = ~V.hi.s6; + + for(int x = 0; x < 10; ++x) + { + BLAKE_Ghost(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_Ghost(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_Ghost(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_Ghost(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_Ghost(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_Ghost(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_Ghost(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_Ghost(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + } + + V.lo ^= V.hi ^ tmpblock; + + ((uint8*)inout)[0] = V.lo; } -static __device__ __forceinline__ void neoscrypt_salsa(uint16 *XV) + +#define SHIFT 128 +#define TPB 128 +#define TPB2 64 + +__global__ +__launch_bounds__(TPB2, 1) +void neoscrypt_gpu_hash_start(int stratum, int threads, uint32_t startNonce) { - XV[0] ^= XV[3]; - uint16 temp; + __shared__ uint32_t s_data[64]; - XV[0] = salsa_small_scalar_rnd(XV[0]); XV[1] ^= XV[0]; - temp = salsa_small_scalar_rnd(XV[1]); XV[2] ^= temp; - XV[1] = salsa_small_scalar_rnd(XV[2]); XV[3] ^= XV[1]; - XV[3] = salsa_small_scalar_rnd(XV[3]); - XV[2] = temp; -} +#if TPB2<64 +#error TPB2 too low +#else +#if TPB2>64 + if(threadIdx.x<64) +#endif +#endif + s_data[threadIdx.x] = c_data[threadIdx.x]; + __syncthreads(); + const int thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t nonce = startNonce + thread; + const uint32_t ZNonce = (stratum) ? cuda_swab32(nonce) : nonce; //freaking morons !!! 
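	// --- Added note, not part of the patch ---------------------------------
	// ZNonce: when stratum work is scanned the per-thread nonce is byte-swapped
	// (cuda_swab32) before FastKDF consumes it, matching the CPU-side
	// be32enc(&endiandata[19], ...) used for validation; getwork data uses the
	// nonce as-is.
	//
	// The FastKDF routines in this file index their 256-byte ring buffer with
	// bufidx = (sum of the 32 Blake2s output bytes) mod 256, split into a word
	// index (qbuf = bufidx/4), a byte remainder (rbuf = bufidx & 3) and a bit
	// shift (bitbuf = rbuf*8). The shf.r.clamp.b32 funnel shifts then read a
	// 32-bit value that straddles two buffer words. A plain-C reference of that
	// read (funnel_read is a hypothetical helper, shown only to document the
	// PTX, not code from the patch):
	//
	//   static uint32_t funnel_read(const uint32_t *buf, uint32_t bufidx)
	//   {
	//       uint32_t qbuf   = bufidx / 4;          // 32-bit word index
	//       uint32_t bitbuf = (bufidx & 3) * 8;    // bit offset inside the word
	//       uint64_t pair   = ((uint64_t)buf[(qbuf + 1) & 0x3f] << 32)
	//                       |  (uint64_t)buf[qbuf & 0x3f];    // wrap at 64 words
	//       return (uint32_t)(pair >> bitbuf);     // == shf.r.clamp.b32
	//   }
	// ------------------------------------------------------------------------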
-#define SHIFT 130 +#if __CUDA_ARCH__ < 500 + fastkdf256_v1(thread, ZNonce, s_data); +#else + fastkdf256_v2(thread, ZNonce, s_data); +#endif +} -__global__ __launch_bounds__(128, 1) -void neoscrypt_gpu_hash_k0(uint32_t threads, uint32_t startNonce, int stratum) +__global__ +__launch_bounds__(TPB, 1) +void neoscrypt_gpu_hash_chacha1_stream1(int threads, uint32_t startNonce) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); -// if (thread < threads) + int thread = (blockDim.x * blockIdx.x + threadIdx.x); + const int shift = SHIFT * 8 * thread; + const unsigned int shiftTr = 8 * thread; + + uint2x4 X[8]; + for(int i = 0; i<8; i++) + X[i] = __ldg4(&(Input + shiftTr)[i]); + + #pragma nounroll + for(int i = 0; i < 128; i++) { - uint32_t data[80]; - uint16 X[4]; - uint32_t shift = thread * SHIFT * 16; - const uint32_t nonce = startNonce + thread; - - for (int i = 0; i<20; i++) { - ((uint4*)data)[i] = ((uint4 *)c_data)[i]; - } //ld.local.v4 - data[19] = (stratum) ? cuda_swab32(nonce) : nonce; //freaking morons !!! - data[39] = data[19]; - data[59] = data[19]; - - fastkdf256(data, (uint8_t*)X); - - ((uintx64 *)(W + shift))[0] = ((uintx64 *)X)[0]; -// ((ulonglong16 *)(W + shift))[0] = ((ulonglong16 *)X)[0]; + uint32_t offset = shift + i * 8; + for(int j = 0; j<8; j++) + (W + offset)[j] = X[j]; + neoscrypt_chacha((uint16*)X); } + + for(int i = 0; i<8; i++) + (Tr + shiftTr)[i] = X[i]; } -__global__ __launch_bounds__(128, 1) -void neoscrypt_gpu_hash_k01(uint32_t threads, uint32_t startNonce) +__global__ +__launch_bounds__(TPB, 1) +void neoscrypt_gpu_hash_chacha2_stream1(int threads, uint32_t startNonce) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); -// if (thread < threads) + const int thread = (blockDim.x * blockIdx.x + threadIdx.x); + const int shift = SHIFT * 8 * thread; + const int shiftTr = 8 * thread; + + uint2x4 X[8]; + #pragma unroll + for(int i = 0; i<8; i++) + X[i] = __ldg4(&(Tr + shiftTr)[i]); + + #pragma nounroll + for(int t = 0; t < 128; t++) { - uint16 X[4]; - uint32_t shift = thread * SHIFT * 16; - ((uintx64 *)X)[0]= ldg256(&(W + shift)[0]); + int idx = (X[6].x.x & 0x7F) << 3; - //#pragma unroll - for (int i = 0; i < 128; ++i) - { - neoscrypt_chacha(X); - ((ulonglong16 *)(W + shift))[i+1] = ((ulonglong16 *)X)[0]; -// ((uintx64 *)(W + shift))[i + 1] = ((uintx64 *)X)[0]; - } + for(int j = 0; j<8; j++) + X[j] ^= __ldg4(&(W + shift + idx)[j]); + neoscrypt_chacha((uint16*)X); } + #pragma unroll + for(int i = 0; i<8; i++) + (Tr + shiftTr)[i] = X[i]; // best checked } -__global__ __launch_bounds__(128, 1) -void neoscrypt_gpu_hash_k2(uint32_t threads, uint32_t startNonce) +__global__ +__launch_bounds__(TPB, 1) +void neoscrypt_gpu_hash_salsa1_stream1(int threads, uint32_t startNonce) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); -// if (thread < threads) - { - uint16 X[4]; - uint32_t shift = thread * SHIFT * 16; - ((uintx64 *)X)[0] = ldg256(&(W + shift)[2048]); + const int thread = (blockDim.x * blockIdx.x + threadIdx.x); + const int shift = SHIFT * 8 * thread; + const int shiftTr = 8 * thread; - for (int t = 0; t < 128; t++) - { - int idx = X[3].lo.s0 & 0x7F; - ((uintx64 *)X)[0] ^= ldg256(&(W + shift)[idx << 4]); - neoscrypt_chacha(X); - - } - ((uintx64 *)(W + shift))[129] = ((uintx64*)X)[0]; // best checked + uint2x4 Z[8]; + #pragma unroll + for(int i = 0; i<8; i++) + Z[i] = __ldg4(&(Input + shiftTr)[i]); + #pragma nounroll + for(int i = 0; i < 128; i++) + { + for(int j = 0; j<8; j++) + (W2 + shift + i * 8)[j] = Z[j]; + neoscrypt_salsa((uint16*)Z); 
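		// Added note, not part of the patch: each of the 128 iterations of this
		// loop first stores the current 256-byte state Z (8 x uint2x4) into this
		// thread's slice of the W2 scratchpad at element offset shift + i*8, then
		// mixes it with neoscrypt_salsa (the Salsa20/20 block mix). The second
		// salsa kernel below indexes back into W2 with (X[6].x.x & 0x7F) << 3 to
		// realise the sequential-memory-hard phase.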
} + #pragma unroll + for(int i = 0; i<8; i++) + (Tr2 + shiftTr)[i] = Z[i]; } -__global__ __launch_bounds__(128, 1) -void neoscrypt_gpu_hash_k3(uint32_t threads, uint32_t startNonce) +__global__ +__launch_bounds__(TPB, 1) +void neoscrypt_gpu_hash_salsa2_stream1(int threads, uint32_t startNonce) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); -// if (thread < threads) - { - uint32_t shift = thread * SHIFT * 16; - uint16 Z[4]; + const int thread = (blockDim.x * blockIdx.x + threadIdx.x); + const int shift = SHIFT * 8 * thread; + const int shiftTr = 8 * thread; - ((uintx64*)Z)[0] = ldg256(&(W + shift)[0]); + uint2x4 X[8]; + #pragma unroll + for(int i = 0; i<8; i++) + X[i] = __ldg4(&(Tr2 + shiftTr)[i]); - //#pragma unroll - for (int i = 0; i < 128; ++i) { - neoscrypt_salsa(Z); - ((ulonglong16 *)(W + shift))[i+1] = ((ulonglong16 *)Z)[0]; -// ((uintx64 *)(W + shift))[i + 1] = ((uintx64 *)Z)[0]; - } + #pragma nounroll + for(int t = 0; t < 128; t++) + { + int idx = (X[6].x.x & 0x7F) << 3; + + for(int j = 0; j<8; j++) + X[j] ^= __ldg4(&(W2 + shift + idx)[j]); + neoscrypt_salsa((uint16*)X); } + #pragma unroll + for(int i = 0; i<8; i++) + (Tr2 + shiftTr)[i] = X[i]; // best checked } -__global__ __launch_bounds__(128, 1) -void neoscrypt_gpu_hash_k4(uint32_t threads, uint32_t startNonce, uint32_t *nonceRes, int stratum) +__global__ +__launch_bounds__(TPB2, 8) +void neoscrypt_gpu_hash_ending(int stratum, int threads, uint32_t startNonce, uint32_t *nonceVector) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - const uint32_t nonce = startNonce + thread; + __shared__ uint32_t s_data[64]; - uint32_t shift = thread * SHIFT * 16; - uint16 Z[4]; - uint32_t outbuf[8]; - uint32_t data[80]; +#if TPB2<64 +#error TPB2 too low +#elif TPB2>64 + if(threadIdx.x<64) +#endif + s_data[threadIdx.x] = c_data[threadIdx.x]; + __syncthreads(); + const int thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t nonce = startNonce + thread; - for (int i=0; i<20; i++) { - ((uint4*)data)[i] = ((uint4 *)c_data)[i]; - } + const int shiftTr = 8 * thread; + uint2x4 Z[8]; + uint32_t outbuf; - data[19] = (stratum) ? cuda_swab32(nonce) : nonce; - data[39] = data[19]; - data[59] = data[19]; - ((uintx64 *)Z)[0] = ldg256(&(W + shift)[2048]); - for (int t = 0; t < 128; t++) - { - int idx = Z[3].lo.s0 & 0x7F; - ((uintx64 *)Z)[0] ^= ldg256(&(W + shift)[idx << 4]); - neoscrypt_salsa(Z); - } - ((uintx64 *)Z)[0] ^= ldg256(&(W + shift)[2064]); - fastkdf32(data, (uint32_t*)Z, outbuf); -#if __CUDA_ARCH__ < 320 - // workaround required when using SM 3.0 shift256R() func (tested on SM 5.0) - if (thread == 0) - printf("", outbuf[7]); + const uint32_t ZNonce = (stratum) ? 
cuda_swab32(nonce) : nonce; + + #pragma unroll + for(int i = 0; i<8; i++) + Z[i] = __ldg4(&(Tr2 + shiftTr)[i]) ^ __ldg4(&(Tr + shiftTr)[i]); + +#if __CUDA_ARCH__ < 500 + fastkdf32_v1(thread, ZNonce, (uint32_t*)Z, s_data, outbuf); +#else + fastkdf32_v3(thread, ZNonce, (uint32_t*)Z, s_data, outbuf); #endif - if (outbuf[7] <= pTarget[7]) { - atomicMin(nonceRes, nonce); // init val is UINT32_MAX - } + + if(outbuf <= pTarget[7]) + { + uint32_t tmp = atomicExch(nonceVector, nonce); + if(tmp != UINT32_MAX) + nonceVector[1] = tmp; } } +static __thread uint32_t *hash1 = NULL; +static __thread uint32_t *hash2 = NULL; // 2 streams +static __thread uint32_t *Trans1 = NULL; +static __thread uint32_t *Trans2 = NULL; // 2 streams +static __thread uint32_t *Trans3 = NULL; // 2 streams +static __thread uint32_t *Bhash = NULL; + __host__ -void neoscrypt_cpu_init(int thr_id, uint32_t threads) +void neoscrypt_init_2stream(int thr_id, uint32_t threads) { - cuda_get_arch(thr_id); - cudaMalloc(&d_NNonce[thr_id], sizeof(uint32_t)); - CUDA_SAFE_CALL(cudaMalloc(&d_buffer[thr_id], (size_t) 256 * SHIFT * threads)); - cudaMemcpyToSymbol(W, &d_buffer[thr_id], sizeof(uint4*), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(BLAKE2S_SIGMA, BLAKE2S_SIGMA_host, sizeof(BLAKE2S_SIGMA_host), 0, cudaMemcpyHostToDevice); + CUDA_SAFE_CALL(cudaStreamCreate(&stream[0])); + CUDA_SAFE_CALL(cudaStreamCreate(&stream[1])); + + CUDA_SAFE_CALL(cudaMalloc(&d_NNonce[thr_id], 2 * sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMalloc(&hash1, 32 * 128 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&hash2, 32 * 128 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&Trans1, 32 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&Trans2, 32 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&Trans3, 32 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&Bhash, 128 * sizeof(uint32_t) * threads)); + + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(B2, &Bhash, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice, stream[0])); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(W, &hash1, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice, stream[0])); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(W2, &hash2, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice, stream[0])); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(Tr, &Trans1, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice, stream[0])); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(Tr2, &Trans2, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice, stream[0])); + CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(Input, &Trans3, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice, stream[0])); } __host__ -void neoscrypt_cpu_free(int thr_id) +void neoscrypt_free_2stream(int thr_id) { cudaFree(d_NNonce[thr_id]); - cudaFree(d_buffer[thr_id]); + + cudaFree(hash1); + cudaFree(hash2); + cudaFree(Trans1); + cudaFree(Trans2); + cudaFree(Trans3); + cudaFree(Bhash); + + cudaStreamDestroy(stream[0]); + CUDA_SAFE_CALL(cudaStreamDestroy(stream[1])); } __host__ -uint32_t neoscrypt_cpu_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, int have_stratum, int order) +void neoscrypt_hash_k4_2stream(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *result, bool stratum) { - uint32_t result[MAX_GPUS]; - memset(result, 0xff, sizeof(result)); - cudaMemset(d_NNonce[thr_id], 0xff, sizeof(uint32_t)); + CUDA_SAFE_CALL(cudaMemsetAsync(d_NNonce[thr_id], 0xff, 2 * sizeof(uint32_t), stream[1])); + + const int threadsperblock = TPB; - const uint32_t threadsperblock = 128; dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 
block(threadsperblock); - neoscrypt_gpu_hash_k0 <<< grid, block >>>(threads, startNounce, have_stratum); - neoscrypt_gpu_hash_k01 <<< grid, block >>>(threads, startNounce); - neoscrypt_gpu_hash_k2 <<< grid, block >>>(threads, startNounce); - neoscrypt_gpu_hash_k3 <<< grid, block >>>(threads, startNounce); - neoscrypt_gpu_hash_k4 <<< grid, block >>>(threads, startNounce, d_NNonce[thr_id], have_stratum); + const int threadsperblock2 = TPB2; + dim3 grid2((threads + threadsperblock2 - 1) / threadsperblock2); + dim3 block2(threadsperblock2); + + neoscrypt_gpu_hash_start <<>> (stratum, threads, startNounce); //fastkdf + + CUDA_SAFE_CALL(cudaStreamSynchronize(stream[0])); + + neoscrypt_gpu_hash_salsa1_stream1 <<>> (threads, startNounce); + neoscrypt_gpu_hash_chacha1_stream1 <<>> (threads, startNounce); - MyStreamSynchronize(NULL, order, thr_id); - cudaMemcpy(&result[thr_id], d_NNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + neoscrypt_gpu_hash_salsa2_stream1 <<>> (threads, startNounce); + neoscrypt_gpu_hash_chacha2_stream1 <<>> (threads, startNounce); - return result[thr_id]; + CUDA_SAFE_CALL(cudaDeviceSynchronize()); + + neoscrypt_gpu_hash_ending <<>> (stratum, threads, startNounce, d_NNonce[thr_id]); //fastkdf+end + + CUDA_SAFE_CALL(cudaMemcpy(result, d_NNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); } __host__ void neoscrypt_setBlockTarget(uint32_t* pdata, const void *target) { - unsigned char PaddedMessage[80*4]; //bring balance to the force - uint32_t input[16], key[16] = { 0 }; + uint32_t PaddedMessage[64]; + uint32_t input[16], key[16] = {0}; + + for(int i = 0; i < 19; i++) + { + PaddedMessage[i ] = pdata[i]; + PaddedMessage[i + 20] = pdata[i]; + PaddedMessage[i + 40] = pdata[i]; + } + for(int i = 0; i<4; i++) + PaddedMessage[i + 60] = pdata[i]; - memcpy(PaddedMessage, pdata, 80); - memcpy(PaddedMessage + 80, pdata, 80); - memcpy(PaddedMessage + 160, pdata, 80); - memcpy(PaddedMessage + 240, pdata, 80); + PaddedMessage[19] = 0; + PaddedMessage[39] = 0; + PaddedMessage[59] = 0; ((uint16*)input)[0] = ((uint16*)pdata)[0]; ((uint8*)key)[0] = ((uint8*)pdata)[0]; - Blake2Shost(input,key); + Blake2Shost(input, key); cudaMemcpyToSymbol(pTarget, target, 32, 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(input_init, input, sizeof(input), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(key_init, key, sizeof(key), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(input_init, input, 64, 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(key_init, key, 64, 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(c_data, PaddedMessage, 80*4, 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_data, PaddedMessage, 64 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice); + CUDA_SAFE_CALL(cudaGetLastError()); } - diff --git a/neoscrypt/neoscrypt.cpp b/neoscrypt/neoscrypt.cpp index b0cb1cb..b710cdc 100644 --- a/neoscrypt/neoscrypt.cpp +++ b/neoscrypt/neoscrypt.cpp @@ -3,9 +3,10 @@ #include "neoscrypt/neoscrypt.h" extern void neoscrypt_setBlockTarget(uint32_t * data, const void *ptarget); -extern void neoscrypt_cpu_init(int thr_id, uint32_t threads); -extern void neoscrypt_cpu_free(int thr_id); -extern uint32_t neoscrypt_cpu_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, int have_stratum, int order); + +extern void neoscrypt_init_2stream(int thr_id, uint32_t threads); +extern void neoscrypt_free_2stream(int thr_id); +extern void neoscrypt_hash_k4_2stream(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *result, bool stratum); static bool init[MAX_GPUS] = { 0 }; @@ -35,12 
+36,12 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign cudaGetLastError(); // reset errors if device is not "reset" if (device_sm[dev_id] <= 300) { - applog(LOG_ERR, "Sorry neoscrypt is not supported on SM 3.0 devices"); + gpulog(LOG_ERR, thr_id, "Sorry neoscrypt is not supported on SM 3.0 devices"); proper_exit(EXIT_CODE_CUDA_ERROR); } - applog(LOG_INFO, "GPU #%d: Using %d cuda threads", dev_id, throughput); - neoscrypt_cpu_init(thr_id, throughput); + gpulog(LOG_INFO, thr_id, "Using %d cuda threads", throughput); + neoscrypt_init_2stream(thr_id, throughput); init[thr_id] = true; } @@ -56,26 +57,28 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign neoscrypt_setBlockTarget(endiandata,ptarget); do { - uint32_t foundNonce = neoscrypt_cpu_hash_k4(thr_id, throughput, pdata[19], have_stratum, 0); - if (foundNonce != UINT32_MAX) - { - uint32_t _ALIGN(64) vhash64[8]; + uint32_t foundNonces[2] = { UINT32_MAX, UINT32_MAX }; + neoscrypt_hash_k4_2stream(thr_id, throughput, pdata[19], foundNonces, have_stratum); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce + throughput; + + if (foundNonces[0] != UINT32_MAX) + { + uint32_t _ALIGN(64) vhash[8]; if (have_stratum) { - be32enc(&endiandata[19], foundNonce); + be32enc(&endiandata[19], foundNonces[0]); } else { - endiandata[19] = foundNonce; + endiandata[19] = foundNonces[0]; } - neoscrypt((uchar*)vhash64, (uchar*) endiandata, 0x80000620U); + neoscrypt((uchar*)vhash, (uchar*) endiandata, 0x80000620U); - if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) { - work_set_target_ratio(work, vhash64); - pdata[19] = foundNonce; + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work_set_target_ratio(work, vhash); + pdata[19] = foundNonces[0]; return 1; } else { - gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce); + gpulog(LOG_WARNING, thr_id, "nonce %08x does not validate on CPU!", foundNonces[0]); } } @@ -100,7 +103,7 @@ void free_neoscrypt(int thr_id) cudaThreadSynchronize(); - neoscrypt_cpu_free(thr_id); + neoscrypt_free_2stream(thr_id); init[thr_id] = false; cudaDeviceSynchronize(); diff --git a/quark/quarkcoin.cu b/quark/quarkcoin.cu index 684a50b..91a0e11 100644 --- a/quark/quarkcoin.cu +++ b/quark/quarkcoin.cu @@ -294,6 +294,7 @@ extern "C" int scanhash_quark(int thr_id, struct work* work, uint32_t max_nonce, // cleanup extern "C" void free_quark(int thr_id) { + int dev_id = device_map[thr_id]; if (!init[thr_id]) return; @@ -301,9 +302,13 @@ extern "C" void free_quark(int thr_id) cudaFree(d_hash[thr_id]); - cudaFree(d_branch1Nonces[thr_id]); - cudaFree(d_branch2Nonces[thr_id]); - cudaFree(d_branch3Nonces[thr_id]); + if (cuda_arch[dev_id] >= 300) { + cudaFree(d_branch1Nonces[thr_id]); + cudaFree(d_branch2Nonces[thr_id]); + cudaFree(d_branch3Nonces[thr_id]); + } else { + cudaFree(d_hash_br2[thr_id]); + } quark_blake512_cpu_free(thr_id); quark_groestl512_cpu_free(thr_id);
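
For reference, a condensed sketch of how the new two-stream, two-result neoscrypt
host interface introduced above is driven. The extern declarations come from the
hunks in this patch; the wrapper neoscrypt_scan_sketch, its arguments and the
handling of the second returned nonce are illustrative assumptions, not code from
ccminer (the hunk above only validates foundNonces[0]).

    #include <cstdint>

    extern void neoscrypt_setBlockTarget(uint32_t *data, const void *ptarget);
    extern void neoscrypt_init_2stream(int thr_id, uint32_t threads);
    extern void neoscrypt_free_2stream(int thr_id);
    extern void neoscrypt_hash_k4_2stream(int thr_id, uint32_t threads,
                                          uint32_t startNounce, uint32_t *result,
                                          bool stratum);

    // Hypothetical driver loop: init once, set the work, scan nonce ranges,
    // then release the per-thread buffers and streams.
    static void neoscrypt_scan_sketch(int thr_id, uint32_t *endiandata,
                                      const uint32_t *ptarget, uint32_t throughput,
                                      uint32_t first_nonce, uint32_t max_nonce,
                                      bool stratum)
    {
        neoscrypt_init_2stream(thr_id, throughput);      // once per GPU thread
        neoscrypt_setBlockTarget(endiandata, ptarget);   // once per work item

        for (uint32_t nonce = first_nonce; nonce < max_nonce; nonce += throughput)
        {
            uint32_t found[2] = { UINT32_MAX, UINT32_MAX };
            neoscrypt_hash_k4_2stream(thr_id, throughput, nonce, found, stratum);

            // d_NNonce can hold two candidates: atomicExch() keeps the latest
            // nonce in slot 0 and the displaced value is written to slot 1.
            // Each entry that is not 0xffffffff would be re-hashed on the CPU
            // and checked against ptarget before being submitted.
            for (int i = 0; i < 2; i++)
            {
                if (found[i] != UINT32_MAX)
                {
                    /* CPU-validate found[i] here, as scanhash_neoscrypt does
                       for foundNonces[0] in the hunk above. */
                }
            }
        }

        neoscrypt_free_2stream(thr_id);                  // on shutdown
    }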