From f262850270c377690c4348c3f600954c48dc3a99 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 16 Jul 2016 18:54:00 +0200 Subject: [PATCH] nanashi r10 with proper utf8 --- Algo256/cuda_blake256.cu | 661 +++++++++-- Algo256/cuda_bmw256.cu | 304 +++-- Algo256/cuda_cubehash256.cu | 482 ++++---- Algo256/cuda_skein256.cu | 451 ++++++-- ccminer.cpp | 582 +++++++++- ccminer.vcxproj | 35 +- ccminer.vcxproj.filters | 8 +- configure.ac | 2 +- cuda_helper.h | 30 +- lyra2/cuda_lyra2.cu | 627 +++++++--- lyra2/cuda_lyra2_sm2.cuh | 7 +- lyra2/cuda_lyra2_sm5.cuh | 701 ++++++++++++ lyra2/cuda_lyra2v2.cu | 656 +++++++---- lyra2/cuda_lyra2v2_sm3.cuh | 338 ------ lyra2/lyra2RE.cu | 63 +- lyra2/lyra2REv2.cu | 180 ++- miner.h | 4 + neoscrypt/cuda_neoscrypt.cu | 1834 +++++++++++++++++++++--------- neoscrypt/cuda_vectors.h | 4 +- neoscrypt/neoscrypt.cpp | 80 +- nvml.cpp | 4 +- quark/cuda_quark_blake512_sp.cuh | 7 +- util.cpp | 2 +- 23 files changed, 5150 insertions(+), 1912 deletions(-) create mode 100644 lyra2/cuda_lyra2_sm5.cuh delete mode 100644 lyra2/cuda_lyra2v2_sm3.cuh diff --git a/Algo256/cuda_blake256.cu b/Algo256/cuda_blake256.cu index c3326e6..78c038a 100644 --- a/Algo256/cuda_blake256.cu +++ b/Algo256/cuda_blake256.cu @@ -8,17 +8,28 @@ extern "C" { } #include "cuda_helper.h" - #include -static __device__ uint64_t cuda_swab32ll(uint64_t x) { - return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x))); +#define UINT2(x,y) make_uint2(x,y) + +__device__ __inline__ uint2 ROR8(const uint2 a) +{ + uint2 result; + result.x = __byte_perm(a.y, a.x, 0x0765); + result.y = __byte_perm(a.x, a.y, 0x0765); + + return result; } -__constant__ static uint32_t c_data[3+1]; -__constant__ static uint32_t sigma[16][16]; -static uint32_t c_sigma[16][16] = { +//static __device__ uint64_t cuda_swab32ll(uint64_t x) { +// return MAKE_ULONGLONG(cuda_swab32(_LOWORD(x)), cuda_swab32(_HIWORD(x))); +//} + +__constant__ static uint32_t c_data[3]; + +//__constant__ static uint8_t sigma[16][16]; +static uint8_t c_sigma[16][16] = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, @@ -46,7 +57,7 @@ static const uint32_t c_IV256[8] = { __device__ __constant__ static uint32_t cpu_h[8]; -__device__ __constant__ static uint32_t u256[16]; +//__device__ __constant__ static uint32_t u256[16]; static const uint32_t c_u256[16] = { 0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344, @@ -59,24 +70,22 @@ static const uint32_t c_u256[16] = { }; #define GS2(a,b,c,d,x) { \ - const uint32_t idx1 = sigma[r][x]; \ - const uint32_t idx2 = sigma[r][x+1]; \ + const uint8_t idx1 = sigma[r][x]; \ + const uint8_t idx2 = sigma[r][x+1]; \ v[a] += (m[idx1] ^ u256[idx2]) + v[b]; \ - v[d] = SPH_ROTL32(v[d] ^ v[a], 16); \ + v[d] = __byte_perm(v[d] ^ v[a],0, 0x1032); \ v[c] += v[d]; \ v[b] = SPH_ROTR32(v[b] ^ v[c], 12); \ \ v[a] += (m[idx2] ^ u256[idx1]) + v[b]; \ - v[d] = SPH_ROTR32(v[d] ^ v[a], 8); \ + v[d] = __byte_perm(v[d] ^ v[a],0, 0x0321); \ v[c] += v[d]; \ v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \ } -//#define ROTL32(x, n) ((x) << (n)) | ((x) >> (32 - (n))) -//#define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) #define hostGS(a,b,c,d,x) { \ - const uint32_t idx1 = c_sigma[r][x]; \ - const uint32_t idx2 = c_sigma[r][x+1]; \ + const uint8_t idx1 = c_sigma[r][x]; \ + const uint8_t idx2 = c_sigma[r][x+1]; \ v[a] += (m[idx1] ^ c_u256[idx2]) + v[b]; \ v[d] = ROTR32(v[d] ^ v[a], 16); \ v[c] += v[d]; \ @@ -86,14 +95,47 @@ 
static const uint32_t c_u256[16] = { v[d] = ROTR32(v[d] ^ v[a], 8); \ v[c] += v[d]; \ v[b] = ROTR32(v[b] ^ v[c], 7); \ - } + } -/* Second part (64-80) msg never change, store it */ -__device__ __constant__ static const uint32_t c_Padding[16] = { - 0, 0, 0, 0, - 0x80000000, 0, 0, 0, - 0, 0, 0, 0, - 0, 1, 0, 640, +#define GSPREC(a,b,c,d,x,y) { \ + v[a] += (m[x] ^ u256[y]) + v[b]; \ + v[d] = __byte_perm(v[d] ^ v[a],0, 0x1032); \ + v[c] += v[d]; \ + v[b] = SPH_ROTR32(v[b] ^ v[c], 12); \ + v[a] += (m[y] ^ u256[x]) + v[b]; \ + v[d] = __byte_perm(v[d] ^ v[a],0, 0x0321); \ + v[c] += v[d]; \ + v[b] = SPH_ROTR32(v[b] ^ v[c], 7); \ + } + +__constant__ uint64_t keccak_round_constants[24] = { + 0x0000000000000001ull, 0x0000000000008082ull, + 0x800000000000808aull, 0x8000000080008000ull, + 0x000000000000808bull, 0x0000000080000001ull, + 0x8000000080008081ull, 0x8000000000008009ull, + 0x000000000000008aull, 0x0000000000000088ull, + 0x0000000080008009ull, 0x000000008000000aull, + 0x000000008000808bull, 0x800000000000008bull, + 0x8000000000008089ull, 0x8000000000008003ull, + 0x8000000000008002ull, 0x8000000000000080ull, + 0x000000000000800aull, 0x800000008000000aull, + 0x8000000080008081ull, 0x8000000000008080ull, + 0x0000000080000001ull, 0x8000000080008008ull +}; + +__constant__ uint2 keccak_round_constants35[24] = { + { 0x00000001ul, 0x00000000 }, { 0x00008082ul, 0x00000000 }, + { 0x0000808aul, 0x80000000 }, { 0x80008000ul, 0x80000000 }, + { 0x0000808bul, 0x00000000 }, { 0x80000001ul, 0x00000000 }, + { 0x80008081ul, 0x80000000 }, { 0x00008009ul, 0x80000000 }, + { 0x0000008aul, 0x00000000 }, { 0x00000088ul, 0x00000000 }, + { 0x80008009ul, 0x00000000 }, { 0x8000000aul, 0x00000000 }, + { 0x8000808bul, 0x00000000 }, { 0x0000008bul, 0x80000000 }, + { 0x00008089ul, 0x80000000 }, { 0x00008003ul, 0x80000000 }, + { 0x00008002ul, 0x80000000 }, { 0x00000080ul, 0x80000000 }, + { 0x0000800aul, 0x00000000 }, { 0x8000000aul, 0x80000000 }, + { 0x80008081ul, 0x80000000 }, { 0x00008080ul, 0x80000000 }, + { 0x80000001ul, 0x00000000 }, { 0x80008008ul, 0x80000000 } }; __host__ __forceinline__ @@ -132,116 +174,545 @@ static void blake256_compress1st(uint32_t *h, const uint32_t *block, const uint3 hostGS(3, 4, 0x9, 0xE, 0xE); } - for (int i = 0; i < 16; i++) { - int j = i & 7; - h[j] ^= v[i]; - } + h[0] ^= v[0] ^ v[8]; + h[1] ^= v[1] ^ v[9]; + h[2] ^= v[2] ^ v[10]; + h[3] ^= v[3] ^ v[11]; + h[4] ^= v[4] ^ v[12]; + h[5] ^= v[5] ^ v[13]; + h[6] ^= v[6] ^ v[14]; + h[7] ^= v[7] ^ v[15]; } +#define bitselect(a, b, c) ((a) ^ ((c) & ((b) ^ (a)))) -__device__ __forceinline__ -static void blake256_compress2nd(uint32_t *h, const uint32_t *block, const uint32_t T0) +static void __forceinline__ __device__ keccak_block(uint2 *s) { - uint32_t m[16]; - uint32_t v[16]; - - m[0] = block[0]; - m[1] = block[1]; - m[2] = block[2]; - m[3] = block[3]; - - #pragma unroll - for (int i = 4; i < 16; i++) { - m[i] = c_Padding[i]; - } - - #pragma unroll 8 - for (int i = 0; i < 8; i++) - v[i] = h[i]; - - v[8] = u256[0]; - v[9] = u256[1]; - v[10] = u256[2]; - v[11] = u256[3]; + uint2 bc[5], tmpxor[5], tmp1, tmp2; + // uint2 s[25]; - v[12] = u256[4] ^ T0; - v[13] = u256[5] ^ T0; - v[14] = u256[6]; - v[15] = u256[7]; - - #pragma unroll 14 - for (int r = 0; r < 14; r++) { - /* column step */ - GS2(0, 4, 0x8, 0xC, 0x0); - GS2(1, 5, 0x9, 0xD, 0x2); - GS2(2, 6, 0xA, 0xE, 0x4); - GS2(3, 7, 0xB, 0xF, 0x6); - /* diagonal step */ - GS2(0, 5, 0xA, 0xF, 0x8); - GS2(1, 6, 0xB, 0xC, 0xA); - GS2(2, 7, 0x8, 0xD, 0xC); - GS2(3, 4, 0x9, 0xE, 0xE); - } - - #pragma unroll 16 
- for (int i = 0; i < 16; i++) { - int j = i & 7; - h[j] ^= v[i]; +#pragma unroll 1 + for (int i = 0; i < 24; i++) + { +#pragma unroll + for (uint32_t x = 0; x < 5; x++) + tmpxor[x] = s[x] ^ s[x + 5] ^ s[x + 10] ^ s[x + 15] ^ s[x + 20]; + + bc[0] = tmpxor[0] ^ ROL2(tmpxor[2], 1); + bc[1] = tmpxor[1] ^ ROL2(tmpxor[3], 1); + bc[2] = tmpxor[2] ^ ROL2(tmpxor[4], 1); + bc[3] = tmpxor[3] ^ ROL2(tmpxor[0], 1); + bc[4] = tmpxor[4] ^ ROL2(tmpxor[1], 1); + + tmp1 = s[1] ^ bc[0]; + + s[0] ^= bc[4]; + s[1] = ROL2(s[6] ^ bc[0], 44); + s[6] = ROL2(s[9] ^ bc[3], 20); + s[9] = ROL2(s[22] ^ bc[1], 61); + s[22] = ROL2(s[14] ^ bc[3], 39); + s[14] = ROL2(s[20] ^ bc[4], 18); + s[20] = ROL2(s[2] ^ bc[1], 62); + s[2] = ROL2(s[12] ^ bc[1], 43); + s[12] = ROL2(s[13] ^ bc[2], 25); + s[13] = ROL8(s[19] ^ bc[3]); + s[19] = ROR8(s[23] ^ bc[2]); + s[23] = ROL2(s[15] ^ bc[4], 41); + s[15] = ROL2(s[4] ^ bc[3], 27); + s[4] = ROL2(s[24] ^ bc[3], 14); + s[24] = ROL2(s[21] ^ bc[0], 2); + s[21] = ROL2(s[8] ^ bc[2], 55); + s[8] = ROL2(s[16] ^ bc[0], 45); + s[16] = ROL2(s[5] ^ bc[4], 36); + s[5] = ROL2(s[3] ^ bc[2], 28); + s[3] = ROL2(s[18] ^ bc[2], 21); + s[18] = ROL2(s[17] ^ bc[1], 15); + s[17] = ROL2(s[11] ^ bc[0], 10); + s[11] = ROL2(s[7] ^ bc[1], 6); + s[7] = ROL2(s[10] ^ bc[4], 3); + s[10] = ROL2(tmp1, 1); + + tmp1 = s[0]; tmp2 = s[1]; s[0] = bitselect(s[0] ^ s[2], s[0], s[1]); s[1] = bitselect(s[1] ^ s[3], s[1], s[2]); s[2] = bitselect(s[2] ^ s[4], s[2], s[3]); s[3] = bitselect(s[3] ^ tmp1, s[3], s[4]); s[4] = bitselect(s[4] ^ tmp2, s[4], tmp1); + tmp1 = s[5]; tmp2 = s[6]; s[5] = bitselect(s[5] ^ s[7], s[5], s[6]); s[6] = bitselect(s[6] ^ s[8], s[6], s[7]); s[7] = bitselect(s[7] ^ s[9], s[7], s[8]); s[8] = bitselect(s[8] ^ tmp1, s[8], s[9]); s[9] = bitselect(s[9] ^ tmp2, s[9], tmp1); + tmp1 = s[10]; tmp2 = s[11]; s[10] = bitselect(s[10] ^ s[12], s[10], s[11]); s[11] = bitselect(s[11] ^ s[13], s[11], s[12]); s[12] = bitselect(s[12] ^ s[14], s[12], s[13]); s[13] = bitselect(s[13] ^ tmp1, s[13], s[14]); s[14] = bitselect(s[14] ^ tmp2, s[14], tmp1); + tmp1 = s[15]; tmp2 = s[16]; s[15] = bitselect(s[15] ^ s[17], s[15], s[16]); s[16] = bitselect(s[16] ^ s[18], s[16], s[17]); s[17] = bitselect(s[17] ^ s[19], s[17], s[18]); s[18] = bitselect(s[18] ^ tmp1, s[18], s[19]); s[19] = bitselect(s[19] ^ tmp2, s[19], tmp1); + tmp1 = s[20]; tmp2 = s[21]; s[20] = bitselect(s[20] ^ s[22], s[20], s[21]); s[21] = bitselect(s[21] ^ s[23], s[21], s[22]); s[22] = bitselect(s[22] ^ s[24], s[22], s[23]); s[23] = bitselect(s[23] ^ tmp1, s[23], s[24]); s[24] = bitselect(s[24] ^ tmp2, s[24], tmp1); + s[0] ^= keccak_round_constants35[i]; } } -__global__ __launch_bounds__(256,3) -void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t * Hash) +//__launch_bounds__(256) +__global__ +void blakeKeccak256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t * Hash) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { + const uint32_t nonce = startNonce + thread; uint32_t h[8]; - uint32_t input[4]; + // uint32_t input[4]; + const uint32_t T0 = 640; +#pragma unroll 8 + for (int i = 0; i<8; i++) { h[i] = cpu_h[i]; } + + uint32_t v[16]; + + const uint32_t c_Padding[12] = { + 0x80000000, 0, 0, 0, + 0, 0, 0, 0, + 0, 1, 0, 640 + }; + + const uint32_t u256[16] = + { + 0x243F6A88, 0x85A308D3, + 0x13198A2E, 0x03707344, + 0xA4093822, 0x299F31D0, + 0x082EFA98, 0xEC4E6C89, + 0x452821E6, 0x38D01377, + 0xBE5466CF, 0x34E90C6C, + 0xC0AC29B7, 0xC97C50DD, + 0x3F84D5B5, 0xB5470917 + }; + + 
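/*
 * Note on the fused kernel below: the first 64 bytes of the 80-byte block
 * header are compressed once on the host (blake256_compress1st -> cpu_h),
 * so each thread only assembles the second BLAKE-256 block m[16] from the
 * remaining 16 header bytes (c_data[0..2] plus its own nonce) and the fixed
 * padding in c_Padding, with the bit counter T0 = 640 (80 bytes * 8 bits).
 * The 14 rounds are fully unrolled with GSPREC; __byte_perm(v, 0, 0x1032)
 * and __byte_perm(v, 0, 0x0321) are byte-shuffle equivalents of 32-bit
 * rotations by 16 and 8. The resulting 256-bit state is byte-swapped and
 * fed straight into keccak_block(), which is why BLAKE-256 and Keccak-256
 * run as a single blakeKeccak256 kernel here.
 */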
uint32_t m[16] = + { + c_data[0], c_data[1], c_data[2], nonce, + c_Padding[0], c_Padding[1], c_Padding[2], c_Padding[3], + c_Padding[4], c_Padding[5], c_Padding[6], c_Padding[7], + c_Padding[8], c_Padding[9], c_Padding[10], c_Padding[11] + }; + +#pragma unroll 8 + for (int i = 0; i < 8; i++) + v[i] = h[i]; + + v[8] = u256[0]; + v[9] = u256[1]; + v[10] = u256[2]; + v[11] = u256[3]; + v[12] = u256[4] ^ T0; + v[13] = u256[5] ^ T0; + v[14] = u256[6]; + v[15] = u256[7]; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + GSPREC(0, 4, 0x8, 0xC, 0, 1); + GSPREC(1, 5, 0x9, 0xD, 2, 3); + GSPREC(2, 6, 0xA, 0xE, 4, 5); + GSPREC(3, 7, 0xB, 0xF, 6, 7); + GSPREC(0, 5, 0xA, 0xF, 8, 9); + GSPREC(1, 6, 0xB, 0xC, 10, 11); + GSPREC(2, 7, 0x8, 0xD, 12, 13); + GSPREC(3, 4, 0x9, 0xE, 14, 15); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + GSPREC(0, 4, 0x8, 0xC, 14, 10); + GSPREC(1, 5, 0x9, 0xD, 4, 8); + GSPREC(2, 6, 0xA, 0xE, 9, 15); + GSPREC(3, 7, 0xB, 0xF, 13, 6); + GSPREC(0, 5, 0xA, 0xF, 1, 12); + GSPREC(1, 6, 0xB, 0xC, 0, 2); + GSPREC(2, 7, 0x8, 0xD, 11, 7); + GSPREC(3, 4, 0x9, 0xE, 5, 3); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + GSPREC(0, 4, 0x8, 0xC, 11, 8); + GSPREC(1, 5, 0x9, 0xD, 12, 0); + GSPREC(2, 6, 0xA, 0xE, 5, 2); + GSPREC(3, 7, 0xB, 0xF, 15, 13); + GSPREC(0, 5, 0xA, 0xF, 10, 14); + GSPREC(1, 6, 0xB, 0xC, 3, 6); + GSPREC(2, 7, 0x8, 0xD, 7, 1); + GSPREC(3, 4, 0x9, 0xE, 9, 4); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + GSPREC(0, 4, 0x8, 0xC, 7, 9); + GSPREC(1, 5, 0x9, 0xD, 3, 1); + GSPREC(2, 6, 0xA, 0xE, 13, 12); + GSPREC(3, 7, 0xB, 0xF, 11, 14); + GSPREC(0, 5, 0xA, 0xF, 2, 6); + GSPREC(1, 6, 0xB, 0xC, 5, 10); + GSPREC(2, 7, 0x8, 0xD, 4, 0); + GSPREC(3, 4, 0x9, 0xE, 15, 8); + + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + GSPREC(0, 4, 0x8, 0xC, 9, 0); + GSPREC(1, 5, 0x9, 0xD, 5, 7); + GSPREC(2, 6, 0xA, 0xE, 2, 4); + GSPREC(3, 7, 0xB, 0xF, 10, 15); + GSPREC(0, 5, 0xA, 0xF, 14, 1); + GSPREC(1, 6, 0xB, 0xC, 11, 12); + GSPREC(2, 7, 0x8, 0xD, 6, 8); + GSPREC(3, 4, 0x9, 0xE, 3, 13); + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + GSPREC(0, 4, 0x8, 0xC, 2, 12); + GSPREC(1, 5, 0x9, 0xD, 6, 10); + GSPREC(2, 6, 0xA, 0xE, 0, 11); + GSPREC(3, 7, 0xB, 0xF, 8, 3); + GSPREC(0, 5, 0xA, 0xF, 4, 13); + GSPREC(1, 6, 0xB, 0xC, 7, 5); + GSPREC(2, 7, 0x8, 0xD, 15, 14); + GSPREC(3, 4, 0x9, 0xE, 1, 9); + + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + GSPREC(0, 4, 0x8, 0xC, 12, 5); + GSPREC(1, 5, 0x9, 0xD, 1, 15); + GSPREC(2, 6, 0xA, 0xE, 14, 13); + GSPREC(3, 7, 0xB, 0xF, 4, 10); + GSPREC(0, 5, 0xA, 0xF, 0, 7); + GSPREC(1, 6, 0xB, 0xC, 6, 3); + GSPREC(2, 7, 0x8, 0xD, 9, 2); + GSPREC(3, 4, 0x9, 0xE, 8, 11); + + // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + GSPREC(0, 4, 0x8, 0xC, 13, 11); + GSPREC(1, 5, 0x9, 0xD, 7, 14); + GSPREC(2, 6, 0xA, 0xE, 12, 1); + GSPREC(3, 7, 0xB, 0xF, 3, 9); + GSPREC(0, 5, 0xA, 0xF, 5, 0); + GSPREC(1, 6, 0xB, 0xC, 15, 4); + GSPREC(2, 7, 0x8, 0xD, 8, 6); + GSPREC(3, 4, 0x9, 0xE, 2, 10); + // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + GSPREC(0, 4, 0x8, 0xC, 6, 15); + GSPREC(1, 5, 0x9, 0xD, 14, 9); + GSPREC(2, 6, 0xA, 0xE, 11, 3); + GSPREC(3, 7, 0xB, 0xF, 0, 8); + GSPREC(0, 5, 0xA, 0xF, 12, 2); + GSPREC(1, 6, 0xB, 0xC, 13, 7); + GSPREC(2, 7, 0x8, 0xD, 1, 4); + GSPREC(3, 4, 0x9, 0xE, 10, 5); + // { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + GSPREC(0, 4, 0x8, 0xC, 10, 2); + GSPREC(1, 5, 0x9, 0xD, 8, 4); + GSPREC(2, 6, 0xA, 
0xE, 7, 6); + GSPREC(3, 7, 0xB, 0xF, 1, 5); + GSPREC(0, 5, 0xA, 0xF, 15, 11); + GSPREC(1, 6, 0xB, 0xC, 9, 14); + GSPREC(2, 7, 0x8, 0xD, 3, 12); + GSPREC(3, 4, 0x9, 0xE, 13, 0); + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + GSPREC(0, 4, 0x8, 0xC, 0, 1); + GSPREC(1, 5, 0x9, 0xD, 2, 3); + GSPREC(2, 6, 0xA, 0xE, 4, 5); + GSPREC(3, 7, 0xB, 0xF, 6, 7); + GSPREC(0, 5, 0xA, 0xF, 8, 9); + GSPREC(1, 6, 0xB, 0xC, 10, 11); + GSPREC(2, 7, 0x8, 0xD, 12, 13); + GSPREC(3, 4, 0x9, 0xE, 14, 15); + + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + GSPREC(0, 4, 0x8, 0xC, 14, 10); + GSPREC(1, 5, 0x9, 0xD, 4, 8); + GSPREC(2, 6, 0xA, 0xE, 9, 15); + GSPREC(3, 7, 0xB, 0xF, 13, 6); + GSPREC(0, 5, 0xA, 0xF, 1, 12); + GSPREC(1, 6, 0xB, 0xC, 0, 2); + GSPREC(2, 7, 0x8, 0xD, 11, 7); + GSPREC(3, 4, 0x9, 0xE, 5, 3); + + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + GSPREC(0, 4, 0x8, 0xC, 11, 8); + GSPREC(1, 5, 0x9, 0xD, 12, 0); + GSPREC(2, 6, 0xA, 0xE, 5, 2); + GSPREC(3, 7, 0xB, 0xF, 15, 13); + GSPREC(0, 5, 0xA, 0xF, 10, 14); + GSPREC(1, 6, 0xB, 0xC, 3, 6); + GSPREC(2, 7, 0x8, 0xD, 7, 1); + GSPREC(3, 4, 0x9, 0xE, 9, 4); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + GSPREC(0, 4, 0x8, 0xC, 7, 9); + GSPREC(1, 5, 0x9, 0xD, 3, 1); + GSPREC(2, 6, 0xA, 0xE, 13, 12); + GSPREC(3, 7, 0xB, 0xF, 11, 14); + GSPREC(0, 5, 0xA, 0xF, 2, 6); + GSPREC(1, 6, 0xB, 0xC, 5, 10); + GSPREC(2, 7, 0x8, 0xD, 4, 0); + GSPREC(3, 4, 0x9, 0xE, 15, 8); + + + + + h[0] = cuda_swab32(h[0] ^ v[0] ^ v[8]); + h[1] = cuda_swab32(h[1] ^ v[1] ^ v[9]); + h[2] = cuda_swab32(h[2] ^ v[2] ^ v[10]); + h[3] = cuda_swab32(h[3] ^ v[3] ^ v[11]); + h[4] = cuda_swab32(h[4] ^ v[4] ^ v[12]); + h[5] = cuda_swab32(h[5] ^ v[5] ^ v[13]); + h[6] = cuda_swab32(h[6] ^ v[6] ^ v[14]); + h[7] = cuda_swab32(h[7] ^ v[7] ^ v[15]); + + uint2 keccak_gpu_state[25] = { 0 }; + keccak_gpu_state[0].x = h[0]; + keccak_gpu_state[0].y = h[1]; + keccak_gpu_state[1].x = h[2]; + keccak_gpu_state[1].y = h[3]; + keccak_gpu_state[2].x = h[4]; + keccak_gpu_state[2].y = h[5]; + keccak_gpu_state[3].x = h[6]; + keccak_gpu_state[3].y = h[7]; + keccak_gpu_state[4] = UINT2(1, 0); + + keccak_gpu_state[16] = UINT2(0, 0x80000000); + keccak_block(keccak_gpu_state); + uint64_t *outputHash = (uint64_t *)Hash; +#pragma unroll 4 + for (int i = 0; i<4; i++) + outputHash[i*threads + thread] = devectorize(keccak_gpu_state[i]); + } - #pragma unroll - for (int i = 0; i < 8; i++) h[i] = cpu_h[i]; - #pragma unroll - for (int i = 0; i < 3; ++i) input[i] = c_data[i]; - input[3] = startNonce + thread; - blake256_compress2nd(h, input, 640); +} + - #pragma unroll - for (int i = 0; i<4; i++) { - Hash[i*threads + thread] = cuda_swab32ll(MAKE_ULONGLONG(h[2 * i], h[2*i+1])); - } +__global__ __launch_bounds__(256, 4) +void blake256_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint32_t * Hash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t nonce = startNonce + thread; + uint32_t h[8]; + // uint32_t input[4]; + const uint32_t T0 = 640; +#pragma unroll 8 + for (int i = 0; i<8; i++) { h[i] = cpu_h[i]; } + + uint32_t v[16]; + + const uint32_t c_Padding[12] = { + 0x80000000, 0, 0, 0, + 0, 0, 0, 0, + 0, 1, 0, 640 + }; + + const uint32_t u256[16] = + { + 0x243F6A88, 0x85A308D3, + 0x13198A2E, 0x03707344, + 0xA4093822, 0x299F31D0, + 0x082EFA98, 0xEC4E6C89, + 0x452821E6, 0x38D01377, + 0xBE5466CF, 0x34E90C6C, + 0xC0AC29B7, 0xC97C50DD, + 0x3F84D5B5, 0xB5470917 + }; + + uint32_t m[16] = + { + c_data[0], 
c_data[1], c_data[2], nonce, + c_Padding[0], c_Padding[1], c_Padding[2], c_Padding[3], + c_Padding[4], c_Padding[5], c_Padding[6], c_Padding[7], + c_Padding[8], c_Padding[9], c_Padding[10], c_Padding[11] + }; + +#pragma unroll 8 + for (int i = 0; i < 8; i++) + v[i] = h[i]; + + v[8] = u256[0]; + v[9] = u256[1]; + v[10] = u256[2]; + v[11] = u256[3]; + v[12] = u256[4] ^ T0; + v[13] = u256[5] ^ T0; + v[14] = u256[6]; + v[15] = u256[7]; + + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + GSPREC(0, 4, 0x8, 0xC, 0, 1); + GSPREC(1, 5, 0x9, 0xD, 2, 3); + GSPREC(2, 6, 0xA, 0xE, 4, 5); + GSPREC(3, 7, 0xB, 0xF, 6, 7); + GSPREC(0, 5, 0xA, 0xF, 8, 9); + GSPREC(1, 6, 0xB, 0xC, 10, 11); + GSPREC(2, 7, 0x8, 0xD, 12, 13); + GSPREC(3, 4, 0x9, 0xE, 14, 15); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + GSPREC(0, 4, 0x8, 0xC, 14, 10); + GSPREC(1, 5, 0x9, 0xD, 4, 8); + GSPREC(2, 6, 0xA, 0xE, 9, 15); + GSPREC(3, 7, 0xB, 0xF, 13, 6); + GSPREC(0, 5, 0xA, 0xF, 1, 12); + GSPREC(1, 6, 0xB, 0xC, 0, 2); + GSPREC(2, 7, 0x8, 0xD, 11, 7); + GSPREC(3, 4, 0x9, 0xE, 5, 3); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + GSPREC(0, 4, 0x8, 0xC, 11, 8); + GSPREC(1, 5, 0x9, 0xD, 12, 0); + GSPREC(2, 6, 0xA, 0xE, 5, 2); + GSPREC(3, 7, 0xB, 0xF, 15, 13); + GSPREC(0, 5, 0xA, 0xF, 10, 14); + GSPREC(1, 6, 0xB, 0xC, 3, 6); + GSPREC(2, 7, 0x8, 0xD, 7, 1); + GSPREC(3, 4, 0x9, 0xE, 9, 4); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + GSPREC(0, 4, 0x8, 0xC, 7, 9); + GSPREC(1, 5, 0x9, 0xD, 3, 1); + GSPREC(2, 6, 0xA, 0xE, 13, 12); + GSPREC(3, 7, 0xB, 0xF, 11, 14); + GSPREC(0, 5, 0xA, 0xF, 2, 6); + GSPREC(1, 6, 0xB, 0xC, 5, 10); + GSPREC(2, 7, 0x8, 0xD, 4, 0); + GSPREC(3, 4, 0x9, 0xE, 15, 8); + + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + GSPREC(0, 4, 0x8, 0xC, 9, 0); + GSPREC(1, 5, 0x9, 0xD, 5, 7); + GSPREC(2, 6, 0xA, 0xE, 2, 4); + GSPREC(3, 7, 0xB, 0xF, 10, 15); + GSPREC(0, 5, 0xA, 0xF, 14, 1); + GSPREC(1, 6, 0xB, 0xC, 11, 12); + GSPREC(2, 7, 0x8, 0xD, 6, 8); + GSPREC(3, 4, 0x9, 0xE, 3, 13); + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + GSPREC(0, 4, 0x8, 0xC, 2, 12); + GSPREC(1, 5, 0x9, 0xD, 6, 10); + GSPREC(2, 6, 0xA, 0xE, 0, 11); + GSPREC(3, 7, 0xB, 0xF, 8, 3); + GSPREC(0, 5, 0xA, 0xF, 4, 13); + GSPREC(1, 6, 0xB, 0xC, 7, 5); + GSPREC(2, 7, 0x8, 0xD, 15, 14); + GSPREC(3, 4, 0x9, 0xE, 1, 9); + + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + GSPREC(0, 4, 0x8, 0xC, 12, 5); + GSPREC(1, 5, 0x9, 0xD, 1, 15); + GSPREC(2, 6, 0xA, 0xE, 14, 13); + GSPREC(3, 7, 0xB, 0xF, 4, 10); + GSPREC(0, 5, 0xA, 0xF, 0, 7); + GSPREC(1, 6, 0xB, 0xC, 6, 3); + GSPREC(2, 7, 0x8, 0xD, 9, 2); + GSPREC(3, 4, 0x9, 0xE, 8, 11); + + // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + GSPREC(0, 4, 0x8, 0xC, 13, 11); + GSPREC(1, 5, 0x9, 0xD, 7, 14); + GSPREC(2, 6, 0xA, 0xE, 12, 1); + GSPREC(3, 7, 0xB, 0xF, 3, 9); + GSPREC(0, 5, 0xA, 0xF, 5, 0); + GSPREC(1, 6, 0xB, 0xC, 15, 4); + GSPREC(2, 7, 0x8, 0xD, 8, 6); + GSPREC(3, 4, 0x9, 0xE, 2, 10); + // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + GSPREC(0, 4, 0x8, 0xC, 6, 15); + GSPREC(1, 5, 0x9, 0xD, 14, 9); + GSPREC(2, 6, 0xA, 0xE, 11, 3); + GSPREC(3, 7, 0xB, 0xF, 0, 8); + GSPREC(0, 5, 0xA, 0xF, 12, 2); + GSPREC(1, 6, 0xB, 0xC, 13, 7); + GSPREC(2, 7, 0x8, 0xD, 1, 4); + GSPREC(3, 4, 0x9, 0xE, 10, 5); + // { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + GSPREC(0, 4, 0x8, 0xC, 10, 2); + GSPREC(1, 5, 0x9, 0xD, 8, 4); + GSPREC(2, 6, 0xA, 0xE, 7, 6); + GSPREC(3, 7, 0xB, 0xF, 
1, 5); + GSPREC(0, 5, 0xA, 0xF, 15, 11); + GSPREC(1, 6, 0xB, 0xC, 9, 14); + GSPREC(2, 7, 0x8, 0xD, 3, 12); + GSPREC(3, 4, 0x9, 0xE, 13, 0); + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + GSPREC(0, 4, 0x8, 0xC, 0, 1); + GSPREC(1, 5, 0x9, 0xD, 2, 3); + GSPREC(2, 6, 0xA, 0xE, 4, 5); + GSPREC(3, 7, 0xB, 0xF, 6, 7); + GSPREC(0, 5, 0xA, 0xF, 8, 9); + GSPREC(1, 6, 0xB, 0xC, 10, 11); + GSPREC(2, 7, 0x8, 0xD, 12, 13); + GSPREC(3, 4, 0x9, 0xE, 14, 15); + + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + GSPREC(0, 4, 0x8, 0xC, 14, 10); + GSPREC(1, 5, 0x9, 0xD, 4, 8); + GSPREC(2, 6, 0xA, 0xE, 9, 15); + GSPREC(3, 7, 0xB, 0xF, 13, 6); + GSPREC(0, 5, 0xA, 0xF, 1, 12); + GSPREC(1, 6, 0xB, 0xC, 0, 2); + GSPREC(2, 7, 0x8, 0xD, 11, 7); + GSPREC(3, 4, 0x9, 0xE, 5, 3); + + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + GSPREC(0, 4, 0x8, 0xC, 11, 8); + GSPREC(1, 5, 0x9, 0xD, 12, 0); + GSPREC(2, 6, 0xA, 0xE, 5, 2); + GSPREC(3, 7, 0xB, 0xF, 15, 13); + GSPREC(0, 5, 0xA, 0xF, 10, 14); + GSPREC(1, 6, 0xB, 0xC, 3, 6); + GSPREC(2, 7, 0x8, 0xD, 7, 1); + GSPREC(3, 4, 0x9, 0xE, 9, 4); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + GSPREC(0, 4, 0x8, 0xC, 7, 9); + GSPREC(1, 5, 0x9, 0xD, 3, 1); + GSPREC(2, 6, 0xA, 0xE, 13, 12); + GSPREC(3, 7, 0xB, 0xF, 11, 14); + GSPREC(0, 5, 0xA, 0xF, 2, 6); + GSPREC(1, 6, 0xB, 0xC, 5, 10); + GSPREC(2, 7, 0x8, 0xD, 4, 0); + GSPREC(3, 4, 0x9, 0xE, 15, 8); + + h[0] = cuda_swab32(h[0] ^ v[0] ^ v[8]); + h[1] = cuda_swab32(h[1] ^ v[1] ^ v[9]); + h[2] = cuda_swab32(h[2] ^ v[2] ^ v[10]); + h[3] = cuda_swab32(h[3] ^ v[3] ^ v[11]); + h[4] = cuda_swab32(h[4] ^ v[4] ^ v[12]); + h[5] = cuda_swab32(h[5] ^ v[5] ^ v[13]); + h[6] = cuda_swab32(h[6] ^ v[6] ^ v[14]); + h[7] = cuda_swab32(h[7] ^ v[7] ^ v[15]); + + Hash[((0 * threads) + thread) * 2] = (h[0]); + Hash[((0 * threads) + thread) * 2 + 1] = (h[1]); + Hash[((1 * threads) + thread) * 2] = (h[2]); + Hash[((1 * threads) + thread) * 2 + 1] = (h[3]); + Hash[((2 * threads) + thread) * 2] = (h[4]); + Hash[((2 * threads) + thread) * 2 + 1] = (h[5]); + Hash[((3 * threads) + thread) * 2] = (h[6]); + Hash[((3 * threads) + thread) * 2 + 1] = (h[7]); } } __host__ void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order) { - const uint32_t threadsperblock = 256; + const uint32_t threadsperblock = 64; dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - blake256_gpu_hash_80 <<>> (threads, startNonce, Hash); - MyStreamSynchronize(NULL, order, thr_id); + blake256_gpu_hash_80 << > > (threads, startNonce, (uint32_t *)Hash); } __host__ void blake256_cpu_setBlock_80(uint32_t *pdata) { - uint32_t h[8], data[20]; - + uint32_t h[8]; + uint32_t data[20]; memcpy(data, pdata, 80); - memcpy(h, c_IV256, sizeof(c_IV256)); + for (int i = 0; i<8; i++) { + h[i] = c_IV256[i]; + } blake256_compress1st(h, pdata, 512); cudaMemcpyToSymbol(cpu_h, h, sizeof(h), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(c_data, &data[16], sizeof(c_data), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_data, &data[16], 3 * 4, 0, cudaMemcpyHostToDevice); } __host__ -void blake256_cpu_init(int thr_id, uint32_t threads) +void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order) { - cudaMemcpyToSymbol(u256, c_u256, sizeof(c_u256), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(sigma, c_sigma, sizeof(c_sigma), 0, cudaMemcpyHostToDevice); + const uint32_t 
threadsperblock = 256; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + blakeKeccak256_gpu_hash_80 << > > (threads, startNonce, (uint32_t *)Hash); +} + +__host__ +void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order, cudaStream_t stream) +{ + const uint32_t threadsperblock = 256; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + blakeKeccak256_gpu_hash_80 << > > (threads, startNonce, (uint32_t *)Hash); } diff --git a/Algo256/cuda_bmw256.cu b/Algo256/cuda_bmw256.cu index 0fde12e..b301749 100644 --- a/Algo256/cuda_bmw256.cu +++ b/Algo256/cuda_bmw256.cu @@ -14,87 +14,85 @@ __constant__ uint64_t pTarget[4]; #define shl(x, n) ((x) << (n)) #define shr(x, n) ((x) >> (n)) -#define ss0(x) (shr((x), 1) ^ shl((x), 3) ^ SPH_ROTL32((x), 4) ^ SPH_ROTL32((x), 19)) -#define ss1(x) (shr((x), 1) ^ shl((x), 2) ^ SPH_ROTL32((x), 8) ^ SPH_ROTL32((x), 23)) -#define ss2(x) (shr((x), 2) ^ shl((x), 1) ^ SPH_ROTL32((x), 12) ^ SPH_ROTL32((x), 25)) -#define ss3(x) (shr((x), 2) ^ shl((x), 2) ^ SPH_ROTL32((x), 15) ^ SPH_ROTL32((x), 29)) -#define ss4(x) (shr((x), 1) ^ (x)) -#define ss5(x) (shr((x), 2) ^ (x)) - +#define ss0(x) (shr((x), 1) ^ shl((x), 3) ^ SPH_ROTL32((x), 4) ^ SPH_ROTL32((x), 19)) +#define ss1(x) (shr((x), 1) ^ shl((x), 2) ^ __byte_perm(x,0,0x2103) ^ SPH_ROTL32((x), 23)) +#define ss2(x) (shr((x), 2) ^ shl((x), 1) ^ SPH_ROTL32((x), 12) ^ SPH_ROTL32((x), 25)) +#define ss3(x) (shr((x), 2) ^ shl((x), 2) ^ SPH_ROTL32((x), 15) ^ SPH_ROTL32((x), 29)) +#define ss4(x) (shr((x), 1) ^ (x)) +#define ss5(x) (shr((x), 2) ^ (x)) #define rs1(x) SPH_ROTL32((x), 3) #define rs2(x) SPH_ROTL32((x), 7) #define rs3(x) SPH_ROTL32((x), 13) -#define rs4(x) SPH_ROTL32((x), 16) +#define rs4(x) __byte_perm(x,0,0x1032) #define rs5(x) SPH_ROTL32((x), 19) #define rs6(x) SPH_ROTL32((x), 23) #define rs7(x) SPH_ROTL32((x), 27) /* Message expansion function 1 */ -__forceinline__ __device__ -uint32_t expand32_1(int i, uint32_t *M32, const uint32_t *H, uint32_t *Q) +__forceinline__ __device__ uint32_t expand32_1(int i, uint32_t *M32, const uint32_t *H, uint32_t *Q) { return (ss1(Q[i - 16]) + ss2(Q[i - 15]) + ss3(Q[i - 14]) + ss0(Q[i - 13]) + ss1(Q[i - 12]) + ss2(Q[i - 11]) + ss3(Q[i - 10]) + ss0(Q[i - 9]) + ss1(Q[i - 8]) + ss2(Q[i - 7]) + ss3(Q[i - 6]) + ss0(Q[i - 5]) + ss1(Q[i - 4]) + ss2(Q[i - 3]) + ss3(Q[i - 2]) + ss0(Q[i - 1]) - + ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) - + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16])); + + ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16])); } /* Message expansion function 2 */ -__forceinline__ __device__ -uint32_t expand32_2(int i, uint32_t *M32, const uint32_t *H, uint32_t *Q) +__forceinline__ __device__ uint32_t expand32_2(const int i, uint32_t *M32, const uint32_t *H, uint32_t *Q) { - return (Q[i - 16] + rs1(Q[i - 15]) + Q[i - 14] + rs2(Q[i - 13]) - + Q[i - 12] + rs3(Q[i - 11]) + Q[i - 10] + rs4(Q[i - 9]) - + Q[i - 8] + rs5(Q[i - 7]) + Q[i - 6] + rs6(Q[i - 5]) - + Q[i - 4] + rs7(Q[i - 3]) + ss4(Q[i - 2]) + ss5(Q[i - 1]) - + ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) - + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - - 
SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16])); + return ( + rs2(Q[i - 13]) + rs3(Q[i - 11]) + rs4(Q[i - 9]) + rs1(Q[i - 15]) + + +rs5(Q[i - 7]) + rs6(Q[i - 5]) + rs7(Q[i - 3]) + ss4(Q[i - 2]) + ss5(Q[i - 1])); } -__forceinline__ __device__ -void Compression256(uint32_t * M32) +__forceinline__ __device__ void Compression256(uint32_t M32[16]) { - uint32_t Q[32], XL32, XH32; - const uint32_t H[16] = { - 0x40414243, 0x44454647, 0x48494A4B, 0x4C4D4E4F, - 0x50515253, 0x54555657, 0x58595A5B, 0x5C5D5E5F, - 0x60616263, 0x64656667, 0x68696A6B, 0x6C6D6E6F, - 0x70717273, 0x74757677, 0x78797A7B, 0x7C7D7E7F + (0x40414243), (0x44454647), + (0x48494A4B), (0x4C4D4E4F), + (0x50515253), (0x54555657), + (0x58595A5B), (0x5C5D5E5F), + (0x60616263), (0x64656667), + (0x68696A6B), (0x6C6D6E6F), + (0x70717273), (0x74757677), + (0x78797A7B), (0x7C7D7E7F) }; - Q[0] = (M32[5] ^ H[5]) - (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[13] ^ H[13]) + (M32[14] ^ H[14]); - Q[1] = (M32[6] ^ H[6]) - (M32[8] ^ H[8]) + (M32[11] ^ H[11]) + (M32[14] ^ H[14]) - (M32[15] ^ H[15]); - Q[2] = (M32[0] ^ H[0]) + (M32[7] ^ H[7]) + (M32[9] ^ H[9]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]); - Q[3] = (M32[0] ^ H[0]) - (M32[1] ^ H[1]) + (M32[8] ^ H[8]) - (M32[10] ^ H[10]) + (M32[13] ^ H[13]); - Q[4] = (M32[1] ^ H[1]) + (M32[2] ^ H[2]) + (M32[9] ^ H[9]) - (M32[11] ^ H[11]) - (M32[14] ^ H[14]); - Q[5] = (M32[3] ^ H[3]) - (M32[2] ^ H[2]) + (M32[10] ^ H[10]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]); - Q[6] = (M32[4] ^ H[4]) - (M32[0] ^ H[0]) - (M32[3] ^ H[3]) - (M32[11] ^ H[11]) + (M32[13] ^ H[13]); - Q[7] = (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[5] ^ H[5]) - (M32[12] ^ H[12]) - (M32[14] ^ H[14]); - Q[8] = (M32[2] ^ H[2]) - (M32[5] ^ H[5]) - (M32[6] ^ H[6]) + (M32[13] ^ H[13]) - (M32[15] ^ H[15]); - Q[9] = (M32[0] ^ H[0]) - (M32[3] ^ H[3]) + (M32[6] ^ H[6]) - (M32[7] ^ H[7]) + (M32[14] ^ H[14]); - Q[10] = (M32[8] ^ H[8]) - (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[7] ^ H[7]) + (M32[15] ^ H[15]); - Q[11] = (M32[8] ^ H[8]) - (M32[0] ^ H[0]) - (M32[2] ^ H[2]) - (M32[5] ^ H[5]) + (M32[9] ^ H[9]); - Q[12] = (M32[1] ^ H[1]) + (M32[3] ^ H[3]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[10] ^ H[10]); - Q[13] = (M32[2] ^ H[2]) + (M32[4] ^ H[4]) + (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[11] ^ H[11]); - Q[14] = (M32[3] ^ H[3]) - (M32[5] ^ H[5]) + (M32[8] ^ H[8]) - (M32[11] ^ H[11]) - (M32[12] ^ H[12]); - Q[15] = (M32[12] ^ H[12]) - (M32[4] ^ H[4]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[13] ^ H[13]); - - /* Diffuse the differences in every word in a bijective manner with ssi, and then add the values of the previous double pipe. 
*/ - Q[0] = ss0(Q[0]) + H[1]; - Q[1] = ss1(Q[1]) + H[2]; - Q[2] = ss2(Q[2]) + H[3]; - Q[3] = ss3(Q[3]) + H[4]; - Q[4] = ss4(Q[4]) + H[5]; - Q[5] = ss0(Q[5]) + H[6]; - Q[6] = ss1(Q[6]) + H[7]; - Q[7] = ss2(Q[7]) + H[8]; - Q[8] = ss3(Q[8]) + H[9]; - Q[9] = ss4(Q[9]) + H[10]; + M32[8] = 0x80; + M32[14] = 0x100; + + // int i; + uint32_t XL32, XH32, Q[32]; + + Q[0] = (M32[5] ^ H[5]) - (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[13] ^ H[13]) + (M32[14] ^ H[14]); + Q[1] = (M32[6] ^ H[6]) - (M32[8] ^ H[8]) + (M32[11] ^ H[11]) + (M32[14] ^ H[14]) - (M32[15] ^ H[15]); + Q[2] = (M32[0] ^ H[0]) + (M32[7] ^ H[7]) + (M32[9] ^ H[9]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]); + Q[3] = (M32[0] ^ H[0]) - (M32[1] ^ H[1]) + (M32[8] ^ H[8]) - (M32[10] ^ H[10]) + (M32[13] ^ H[13]); + Q[4] = (M32[1] ^ H[1]) + (M32[2] ^ H[2]) + (M32[9] ^ H[9]) - (M32[11] ^ H[11]) - (M32[14] ^ H[14]); + Q[5] = (M32[3] ^ H[3]) - (M32[2] ^ H[2]) + (M32[10] ^ H[10]) - (M32[12] ^ H[12]) + (M32[15] ^ H[15]); + Q[6] = (M32[4] ^ H[4]) - (M32[0] ^ H[0]) - (M32[3] ^ H[3]) - (M32[11] ^ H[11]) + (M32[13] ^ H[13]); + Q[7] = (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[5] ^ H[5]) - (M32[12] ^ H[12]) - (M32[14] ^ H[14]); + Q[8] = (M32[2] ^ H[2]) - (M32[5] ^ H[5]) - (M32[6] ^ H[6]) + (M32[13] ^ H[13]) - (M32[15] ^ H[15]); + Q[9] = (M32[0] ^ H[0]) - (M32[3] ^ H[3]) + (M32[6] ^ H[6]) - (M32[7] ^ H[7]) + (M32[14] ^ H[14]); + Q[10] = (M32[8] ^ H[8]) - (M32[1] ^ H[1]) - (M32[4] ^ H[4]) - (M32[7] ^ H[7]) + (M32[15] ^ H[15]); + Q[11] = (M32[8] ^ H[8]) - (M32[0] ^ H[0]) - (M32[2] ^ H[2]) - (M32[5] ^ H[5]) + (M32[9] ^ H[9]); + Q[12] = (M32[1] ^ H[1]) + (M32[3] ^ H[3]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[10] ^ H[10]); + Q[13] = (M32[2] ^ H[2]) + (M32[4] ^ H[4]) + (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[11] ^ H[11]); + Q[14] = (M32[3] ^ H[3]) - (M32[5] ^ H[5]) + (M32[8] ^ H[8]) - (M32[11] ^ H[11]) - (M32[12] ^ H[12]); + Q[15] = (M32[12] ^ H[12]) - (M32[4] ^ H[4]) - (M32[6] ^ H[6]) - (M32[9] ^ H[9]) + (M32[13] ^ H[13]); + + /* Diffuse the differences in every word in a bijective manner with ssi, and then add the values of the previous double pipe.*/ + Q[0] = ss0(Q[0]) + H[1]; + Q[1] = ss1(Q[1]) + H[2]; + Q[2] = ss2(Q[2]) + H[3]; + Q[3] = ss3(Q[3]) + H[4]; + Q[4] = ss4(Q[4]) + H[5]; + Q[5] = ss0(Q[5]) + H[6]; + Q[6] = ss1(Q[6]) + H[7]; + Q[7] = ss2(Q[7]) + H[8]; + Q[8] = ss3(Q[8]) + H[9]; + Q[9] = ss4(Q[9]) + H[10]; Q[10] = ss0(Q[10]) + H[11]; Q[11] = ss1(Q[11]) + H[12]; Q[12] = ss2(Q[12]) + H[13]; @@ -109,13 +107,91 @@ void Compression256(uint32_t * M32) /* The following relation for these parameters should is satisfied: */ /* EXPAND_1_ROUNDS + EXPAND_2_ROUNDS = 16 */ - #pragma unroll - for (int i=16; i<18; i++) - Q[i] = expand32_1(i, M32, H, Q); - - #pragma nounroll - for (int i=18; i<32; i++) - Q[i] = expand32_2(i, M32, H, Q); + // #pragma unroll + // for (i = 0; i<2; i++) + // Q[i + 16] = expand32_1(i + 16, M32, H, Q); + + Q[16] = ss1(Q[16 - 16]) + ss2(Q[16 - 15]) + ss3(Q[16 - 14]) + ss0(Q[16 - 13]) + + ss1(Q[16 - 12]) + ss2(Q[16 - 11]) + ss3(Q[16 - 10]) + ss0(Q[16 - 9]) + + ss1(Q[16 - 8]) + ss2(Q[16 - 7]) + ss3(Q[16 - 6]) + ss0(Q[16 - 5]) + + ss1(Q[16 - 4]) + ss2(Q[16 - 3]) + ss3(Q[16 - 2]) + ss0(Q[16 - 1]) + + ((16 * (0x05555555ul) + SPH_ROTL32(M32[0], ((16 - 16) % 16) + 1) + SPH_ROTL32(M32[3], ((16 - 13) % 16) + 1)) ^ H[(16 - 16 + 7) % 16]); + + Q[17] = ss1(Q[17 - 16]) + ss2(Q[17 - 15]) + ss3(Q[17 - 14]) + ss0(Q[17 - 13]) + + ss1(Q[17 - 12]) + ss2(Q[17 - 11]) + ss3(Q[17 - 10]) + ss0(Q[17 - 9]) + + ss1(Q[17 - 8]) + ss2(Q[17 - 7]) + ss3(Q[17 - 6]) + 
ss0(Q[17 - 5]) + + ss1(Q[17 - 4]) + ss2(Q[17 - 3]) + ss3(Q[17 - 2]) + ss0(Q[17 - 1]) + + ((17 * (0x05555555ul) + SPH_ROTL32(M32[(17 - 16) % 16], ((17 - 16) % 16) + 1) + SPH_ROTL32(M32[(17 - 13) % 16], ((17 - 13) % 16) + 1)) ^ H[(17 - 16 + 7) % 16]); + + + uint32_t precalc = Q[18 - 16] + Q[18 - 14] + Q[18 - 12] + Q[18 - 10] + Q[18 - 8] + Q[18 - 6]; //+ Q[18 - 4] + uint32_t precalc2 = Q[19 - 16] + Q[19 - 14] + Q[19 - 12] + Q[19 - 10] + Q[19 - 8] + Q[19 - 6];//+ Q[19 - 4] + + // #pragma unroll + // for (i = 2 + 16; i < 16 + 16; i+=2) + // { + precalc = precalc + Q[18 - 4]; + precalc2 = precalc2 + Q[18 + 1 - 4]; + uint32_t p1 = ((18 * (0x05555555ul) + SPH_ROTL32(M32[2], ((18 - 16) % 16) + 1) + SPH_ROTL32(M32[5], ((18 - 13) % 16) + 1)) ^ H[(18 - 16 + 7) % 16]); + uint32_t p2 = (((18 + 1)*(0x05555555ul) + SPH_ROTL32(M32[3], (((18 + 1) - 16) % 16) + 1) + SPH_ROTL32(M32[6], (((18 + 1) - 13) % 16) + 1)) ^ H[((18 + 1) - 16 + 7) % 16]); + Q[18] = precalc + expand32_2(18, M32, H, Q) + p1; + Q[18 + 1] = precalc2 + expand32_2(18 + 1, M32, H, Q) + p2; + precalc = precalc - Q[18 - 16]; + precalc2 = precalc2 - Q[18 + 1 - 16]; + + precalc = precalc + Q[20 - 4]; + precalc2 = precalc2 + Q[20 + 1 - 4]; + p1 = ((20 * (0x05555555ul) + SPH_ROTL32(M32[4], ((20 - 16) % 16) + 1) + SPH_ROTL32(M32[7], ((20 - 13) % 16) + 1) - (0x100 << 15)) ^ H[(20 - 16 + 7) % 16]); + p2 = (((20 + 1)*(0x05555555ul) + SPH_ROTL32(M32[5], (((20 + 1) - 16) % 16) + 1) + (0x80 << 9)) ^ H[((20 + 1) - 16 + 7) % 16]); + Q[20] = precalc + expand32_2(20, M32, H, Q) + p1; + Q[20 + 1] = precalc2 + expand32_2(20 + 1, M32, H, Q) + p2; + precalc = precalc - Q[20 - 16]; + precalc2 = precalc2 - Q[20 + 1 - 16]; + + precalc = precalc + Q[22 - 4]; + precalc2 = precalc2 + Q[22 + 1 - 4]; + p1 = ((22 * (0x05555555ul) + SPH_ROTL32(M32[6], ((22 - 16) % 16) + 1) - SPH_ROTL32(M32[0], ((22 - 6) % 16) + 1)) ^ H[(22 - 16 + 7) % 16]); + p2 = (((22 + 1)*(0x05555555ul) + SPH_ROTL32(M32[7], (((22 + 1) - 16) % 16) + 1) - SPH_ROTL32(M32[1], (((22 + 1) - 6) % 16) + 1)) ^ H[((22 + 1) - 16 + 7) % 16]); + Q[22] = precalc + expand32_2(22, M32, H, Q) + p1; + Q[22 + 1] = precalc2 + expand32_2(22 + 1, M32, H, Q) + p2; + precalc = precalc - Q[22 - 16]; + precalc2 = precalc2 - Q[22 + 1 - 16]; + + precalc = precalc + Q[24 - 4]; + precalc2 = precalc2 + Q[24 + 1 - 4]; + p1 = ((24 * (0x05555555ul) + (0x80 << 9) - SPH_ROTL32(M32[2], ((24 - 6) % 16) + 1)) ^ H[(24 - 16 + 7) % 16]); + p2 = (((24 + 1)*(0x05555555ul) - SPH_ROTL32(M32[3], (((24 + 1) - 6) % 16) + 1)) ^ H[((24 + 1) - 16 + 7) % 16]); + Q[24] = precalc + expand32_2(24, M32, H, Q) + p1; + Q[24 + 1] = precalc2 + expand32_2(24 + 1, M32, H, Q) + p2; + precalc = precalc - Q[24 - 16]; + precalc2 = precalc2 - Q[24 + 1 - 16]; + + precalc = precalc + Q[26 - 4]; + precalc2 = precalc2 + Q[26 + 1 - 4]; + p1 = ((26 * (0x05555555ul) - SPH_ROTL32(M32[4], ((26 - 6) % 16) + 1)) ^ H[(26 - 16 + 7) % 16]); + p2 = (((26 + 1)*(0x05555555ul) + (0x100 << 15) - SPH_ROTL32(M32[5], (((26 + 1) - 6) % 16) + 1)) ^ H[((26 + 1) - 16 + 7) % 16]); + Q[26] = precalc + expand32_2(26, M32, H, Q) + p1; + Q[26 + 1] = precalc2 + expand32_2(26 + 1, M32, H, Q) + p2; + precalc = precalc - Q[26 - 16]; + precalc2 = precalc2 - Q[26 + 1 - 16]; + + precalc = precalc + Q[28 - 4]; + precalc2 = precalc2 + Q[28 + 1 - 4]; + p1 = ((28 * (0x05555555ul) - SPH_ROTL32(M32[6], ((28 - 6) % 16) + 1)) ^ H[(28 - 16 + 7) % 16]); + p2 = (((28 + 1)*(0x05555555ul) + SPH_ROTL32(M32[0], (((28 + 1) - 13) % 16) + 1) - SPH_ROTL32(M32[7], (((28 + 1) - 6) % 16) + 1)) ^ H[((28 + 1) - 16 + 7) % 16]); + 
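/*
 * The Q[18]..Q[31] expansion above and below is unrolled in pairs:
 * precalc / precalc2 carry the running sums of the un-rotated Q terms for
 * the even and odd lanes, adding Q[i - 4] before each step and dropping
 * Q[i - 16] afterwards, while the trimmed expand32_2() now contributes only
 * the rotated terms. p1 / p2 hold the message-schedule part, with the
 * constant padding words (M32[8] = 0x80, M32[14] = 0x100) folded in as
 * (0x80 << 9) and (0x100 << 15) and the zero message words dropped.
 */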
Q[28] = precalc + expand32_2(28, M32, H, Q) + p1; + Q[28 + 1] = precalc2 + expand32_2(28 + 1, M32, H, Q) + p2; + precalc = precalc - Q[28 - 16]; + precalc2 = precalc2 - Q[28 + 1 - 16]; + + precalc = precalc + Q[30 - 4]; + precalc2 = precalc2 + Q[30 + 1 - 4]; + p1 = ((30 * (0x05555555ul) + (0x100 << 15) + SPH_ROTL32(M32[1], ((30 - 13) % 16) + 1) - (0x80 << 9)) ^ H[(30 - 16 + 7) % 16]); + p2 = (((30 + 1)*(0x05555555ul) + SPH_ROTL32(M32[2], (((30 + 1) - 13) % 16) + 1)) ^ H[((30 + 1) - 16 + 7) % 16]); + Q[30] = precalc + expand32_2(30, M32, H, Q) + p1; + Q[30 + 1] = precalc2 + expand32_2(30 + 1, M32, H, Q) + p2; + precalc = precalc - Q[30 - 16]; + precalc2 = precalc2 - Q[30 + 1 - 16]; /* Blue Midnight Wish has two temporary cummulative variables that accumulate via XORing */ /* 16 new variables that are prooduced in the Message Expansion part. */ @@ -145,17 +221,18 @@ void Compression256(uint32_t * M32) M32[15] = SPH_ROTL32(M32[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]); } -__forceinline__ __device__ -void Compression256_2(uint32_t * M32) +__forceinline__ __device__ void Compression256_2(uint32_t M32[16]) { - uint32_t XL32, XH32, Q[32]; - const uint32_t H[16] = { - 0xaaaaaaa0, 0xaaaaaaa1, 0xaaaaaaa2, 0xaaaaaaa3, - 0xaaaaaaa4, 0xaaaaaaa5, 0xaaaaaaa6, 0xaaaaaaa7, - 0xaaaaaaa8, 0xaaaaaaa9, 0xaaaaaaaa, 0xaaaaaaab, - 0xaaaaaaac, 0xaaaaaaad, 0xaaaaaaae, 0xaaaaaaaf + (0xaaaaaaa0), (0xaaaaaaa1), (0xaaaaaaa2), + (0xaaaaaaa3), (0xaaaaaaa4), (0xaaaaaaa5), + (0xaaaaaaa6), (0xaaaaaaa7), (0xaaaaaaa8), + (0xaaaaaaa9), (0xaaaaaaaa), (0xaaaaaaab), + (0xaaaaaaac), (0xaaaaaaad), (0xaaaaaaae), + (0xaaaaaaaf) }; + int i; + uint32_t XL32, XH32, Q[32]; Q[0] = (M32[5] ^ H[5]) - (M32[7] ^ H[7]) + (M32[10] ^ H[10]) + (M32[13] ^ H[13]) + (M32[14] ^ H[14]); Q[1] = (M32[6] ^ H[6]) - (M32[8] ^ H[8]) + (M32[11] ^ H[11]) + (M32[14] ^ H[14]) - (M32[15] ^ H[15]); @@ -199,45 +276,69 @@ void Compression256_2(uint32_t * M32) /* The following relation for these parameters should is satisfied: */ /* EXPAND_1_ROUNDS + EXPAND_2_ROUNDS = 16 */ - #pragma unroll - for (int i = 16; i<18; i++) - Q[i] = expand32_1(i, M32, H, Q); +#pragma unroll + for (i = 0; i<2; i++) + Q[i + 16] = expand32_1(i + 16, M32, H, Q); + + /* #pragma unroll + for (i = 2; i<16; i++) + Q[i + 16] = expand32_2(i + 16, M32, H, Q); + */ + uint32_t precalc = Q[18 - 16] + Q[18 - 14] + Q[18 - 12] + Q[18 - 10] + Q[18 - 8] + Q[18 - 6]; //+ Q[18 - 4] + uint32_t precalc2 = Q[19 - 16] + Q[19 - 14] + Q[19 - 12] + Q[19 - 10] + Q[19 - 8] + Q[19 - 6];//+ Q[19 - 4] + +#pragma unroll + for (i = 2 + 16; i < 16 + 16; i += 2) + { + precalc = precalc + Q[i - 4]; + precalc2 = precalc2 + Q[i + 1 - 4]; + uint32_t p1 = ((i*(0x05555555ul) + SPH_ROTL32(M32[(i - 16) % 16], ((i - 16) % 16) + 1) + SPH_ROTL32(M32[(i - 13) % 16], ((i - 13) % 16) + 1) - SPH_ROTL32(M32[(i - 6) % 16], ((i - 6) % 16) + 1)) ^ H[(i - 16 + 7) % 16]); + uint32_t p2 = (((i + 1)*(0x05555555ul) + SPH_ROTL32(M32[((i + 1) - 16) % 16], (((i + 1) - 16) % 16) + 1) + SPH_ROTL32(M32[((i + 1) - 13) % 16], (((i + 1) - 13) % 16) + 1) - SPH_ROTL32(M32[((i + 1) - 6) % 16], (((i + 1) - 6) % 16) + 1)) ^ H[((i + 1) - 16 + 7) % 16]); + Q[i] = precalc + expand32_2(i, M32, H, Q) + p1; + Q[i + 1] = precalc2 + expand32_2(i + 1, M32, H, Q) + p2; + precalc = precalc - Q[i - 16]; + precalc2 = precalc2 - Q[i + 1 - 16]; + } + - #pragma nounroll - for (int i = 18; i<32; i++) - Q[i] = expand32_2(i, M32, H, Q); /* Blue Midnight Wish has two temporary cummulative variables that accumulate via XORing */ /* 16 new variables that 
are prooduced in the Message Expansion part. */ XL32 = Q[16] ^ Q[17] ^ Q[18] ^ Q[19] ^ Q[20] ^ Q[21] ^ Q[22] ^ Q[23]; - XH32 = XL32 ^ Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31]; + XH32 = XL32^Q[24] ^ Q[25] ^ Q[26] ^ Q[27] ^ Q[28] ^ Q[29] ^ Q[30] ^ Q[31]; + + + M32[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ M32[2]) + (XL32 ^ Q[26] ^ Q[2]); + M32[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ M32[3]) + (XL32 ^ Q[27] ^ Q[3]); + M32[14] = SPH_ROTL32(M32[2], 15) + (XH32 ^ Q[30] ^ M32[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]); + M32[15] = SPH_ROTL32(M32[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]); + - M32[2] = (shr(XH32, 5) ^ shl(Q[18], 5) ^ M32[2]) + (XL32 ^ Q[26] ^ Q[2]); - M32[3] = (shr(XH32, 1) ^ shl(Q[19], 5) ^ M32[3]) + (XL32 ^ Q[27] ^ Q[3]); - M32[14] = SPH_ROTL32(M32[2], 15) + (XH32 ^ Q[30] ^ M32[14]) + (shr(XL32, 7) ^ Q[21] ^ Q[14]); - M32[15] = SPH_ROTL32(M32[3], 16) + (XH32 ^ Q[31] ^ M32[15]) + (shr(XL32, 2) ^ Q[22] ^ Q[15]); } #define TPB 512 __global__ __launch_bounds__(TPB, 2) -void bmw256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *const __restrict__ nonceVector) +void bmw256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash, uint32_t *const __restrict__ nonceVector, uint32_t Target) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t message[16] = { 0 }; - - LOHI(message[0], message[1], __ldg(&g_hash[thread])); - LOHI(message[2], message[3], __ldg(&g_hash[thread + 1 * threads])); - LOHI(message[4], message[5], __ldg(&g_hash[thread + 2 * threads])); - LOHI(message[6], message[7], __ldg(&g_hash[thread + 3 * threads])); - - message[8]=0x80; - message[14]=0x100; - Compression256(message); - Compression256_2(message); - - if (((uint64_t*)message)[7] <= pTarget[3]) + uint2 message[8] = { 0 }; + + message[0] = __ldg(&g_hash[thread + 0 * threads]); + message[1] = __ldg(&g_hash[thread + 1 * threads]); + message[2] = __ldg(&g_hash[thread + 2 * threads]); + message[3] = __ldg(&g_hash[thread + 3 * threads]); + //LOHI(message[2], message[3], __ldg(&g_hash[thread + 1 * threads])); + //LOHI(message[4], message[5], __ldg(&g_hash[thread + 2 * threads])); + //LOHI(message[6], message[7], __ldg(&g_hash[thread + 3 * threads])); + + message[4].x = 0x80; + message[7].x = 0x100; + Compression256((uint32_t*)message); + Compression256_2((uint32_t*)message); + + if (message[7].y <= Target) { uint32_t tmp = atomicExch(&nonceVector[0], startNounce + thread); if (tmp != 0) @@ -247,7 +348,7 @@ void bmw256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *g_hash } __host__ -void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces) +void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces, uint32_t Target) { const uint32_t threadsperblock = TPB; dim3 grid((threads + threadsperblock - 1) / threadsperblock); @@ -255,13 +356,12 @@ void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint cudaMemset(d_GNonce[thr_id], 0, 2 * sizeof(uint32_t)); - bmw256_gpu_hash_32 << > >(threads, startNounce, g_hash, d_GNonce[thr_id]); + bmw256_gpu_hash_32 << > >(threads, startNounce, (uint2*)g_hash, d_GNonce[thr_id], Target); cudaMemcpy(d_gnounce[thr_id], d_GNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost); resultnonces[0] = *(d_gnounce[thr_id]); resultnonces[1] = *(d_gnounce[thr_id] + 1); } - __host__ void bmw256_cpu_init(int thr_id, uint32_t 
threads) { @@ -276,8 +376,10 @@ void bmw256_cpu_free(int thr_id) cudaFreeHost(d_gnounce[thr_id]); } +/* __host__ void bmw256_setTarget(const void *pTargetIn) { cudaMemcpyToSymbol(pTarget, pTargetIn, 32, 0, cudaMemcpyHostToDevice); } +*/ \ No newline at end of file diff --git a/Algo256/cuda_cubehash256.cu b/Algo256/cuda_cubehash256.cu index 76b9c52..ed889e5 100644 --- a/Algo256/cuda_cubehash256.cu +++ b/Algo256/cuda_cubehash256.cu @@ -3,179 +3,247 @@ #define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */ #define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */ +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#define __CUDA_ARCH__ 520 +#endif + #if __CUDA_ARCH__ < 350 #define LROT(x,bits) ((x << bits) | (x >> (32 - bits))) #else #define LROT(x, bits) __funnelshift_l(x, x, bits) #endif -#if __CUDA_ARCH__ < 500 -#define TPB 576 -#else -#define TPB 1024 -#endif +#define TPB35 576 +#define TPB50 1024 #define ROTATEUPWARDS7(a) LROT(a,7) #define ROTATEUPWARDS11(a) LROT(a,11) -//#define SWAP(a,b) { uint32_t u = a; a = b; b = u; } -#define SWAP(a,b) { a ^= b; b ^= a; a ^= b; } - __device__ __forceinline__ void rrounds(uint32_t x[2][2][2][2][2]) { int r; - int j; - int k; - int l; - int m; - - #pragma unroll 2 - for (r = 0; r < CUBEHASH_ROUNDS; ++r) { - - /* "add x_0jklm into x_1jklmn modulo 2^32" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[1][j][k][l][m] += x[0][j][k][l][m]; + uint32_t x0[2][2][2][2]; + uint32_t x1[2][2][2][2]; + + for (r = 0; r < CUBEHASH_ROUNDS; r += 2) { /* "rotate x_0jklm upwards by 7 bits" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]); - - /* "swap x_00klm with x_01klm" */ -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - SWAP(x[0][0][k][l][m], x[0][1][k][l][m]) - - /* "xor x_1jklm into x_0jklm" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[0][j][k][l][m] ^= x[1][j][k][l][m]; - - /* "swap x_1jk0m with x_1jk1m" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - SWAP(x[1][j][k][0][m], x[1][j][k][1][m]) - - /* "add x_0jklm into x_1jklm modulo 2^32" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[1][j][k][l][m] += x[0][j][k][l][m]; + x0[0][0][0][0] = ROTATEUPWARDS7(x[0][0][0][0][0]); + x0[0][0][0][1] = ROTATEUPWARDS7(x[0][0][0][0][1]); + x0[0][0][1][0] = ROTATEUPWARDS7(x[0][0][0][1][0]); + x0[0][0][1][1] = ROTATEUPWARDS7(x[0][0][0][1][1]); + x0[0][1][0][0] = ROTATEUPWARDS7(x[0][0][1][0][0]); + x0[0][1][0][1] = ROTATEUPWARDS7(x[0][0][1][0][1]); + x0[0][1][1][0] = ROTATEUPWARDS7(x[0][0][1][1][0]); + x0[0][1][1][1] = ROTATEUPWARDS7(x[0][0][1][1][1]); + x0[1][0][0][0] = ROTATEUPWARDS7(x[0][1][0][0][0]); + x0[1][0][0][1] = ROTATEUPWARDS7(x[0][1][0][0][1]); + x0[1][0][1][0] = ROTATEUPWARDS7(x[0][1][0][1][0]); + x0[1][0][1][1] = ROTATEUPWARDS7(x[0][1][0][1][1]); + x0[1][1][0][0] = 
ROTATEUPWARDS7(x[0][1][1][0][0]); + x0[1][1][0][1] = ROTATEUPWARDS7(x[0][1][1][0][1]); + x0[1][1][1][0] = ROTATEUPWARDS7(x[0][1][1][1][0]); + x0[1][1][1][1] = ROTATEUPWARDS7(x[0][1][1][1][1]); + + /* "add x_0jklm into x_1jklm modulo 2^32" */ + x1[0][0][0][0] = x[1][0][0][0][0] + x[0][0][0][0][0]; + x1[0][0][0][1] = x[1][0][0][0][1] + x[0][0][0][0][1]; + x1[0][0][1][0] = x[1][0][0][1][0] + x[0][0][0][1][0]; + x1[0][0][1][1] = x[1][0][0][1][1] + x[0][0][0][1][1]; + x1[0][1][0][0] = x[1][0][1][0][0] + x[0][0][1][0][0]; + x1[0][1][0][1] = x[1][0][1][0][1] + x[0][0][1][0][1]; + x1[0][1][1][0] = x[1][0][1][1][0] + x[0][0][1][1][0]; + x1[0][1][1][1] = x[1][0][1][1][1] + x[0][0][1][1][1]; + x1[1][0][0][0] = x[1][1][0][0][0] + x[0][1][0][0][0]; + x1[1][0][0][1] = x[1][1][0][0][1] + x[0][1][0][0][1]; + x1[1][0][1][0] = x[1][1][0][1][0] + x[0][1][0][1][0]; + x1[1][0][1][1] = x[1][1][0][1][1] + x[0][1][0][1][1]; + x1[1][1][0][0] = x[1][1][1][0][0] + x[0][1][1][0][0]; + x1[1][1][0][1] = x[1][1][1][0][1] + x[0][1][1][0][1]; + x1[1][1][1][0] = x[1][1][1][1][0] + x[0][1][1][1][0]; + x1[1][1][1][1] = x[1][1][1][1][1] + x[0][1][1][1][1]; + + /* "xor x_1~jklm into x_0jklm" */ + x[0][0][0][0][0] = x0[0][0][0][0] ^ x1[1][0][0][0]; + x[0][0][0][0][1] = x0[0][0][0][1] ^ x1[1][0][0][1]; + x[0][0][0][1][0] = x0[0][0][1][0] ^ x1[1][0][1][0]; + x[0][0][0][1][1] = x0[0][0][1][1] ^ x1[1][0][1][1]; + x[0][0][1][0][0] = x0[0][1][0][0] ^ x1[1][1][0][0]; + x[0][0][1][0][1] = x0[0][1][0][1] ^ x1[1][1][0][1]; + x[0][0][1][1][0] = x0[0][1][1][0] ^ x1[1][1][1][0]; + x[0][0][1][1][1] = x0[0][1][1][1] ^ x1[1][1][1][1]; + x[0][1][0][0][0] = x0[1][0][0][0] ^ x1[0][0][0][0]; + x[0][1][0][0][1] = x0[1][0][0][1] ^ x1[0][0][0][1]; + x[0][1][0][1][0] = x0[1][0][1][0] ^ x1[0][0][1][0]; + x[0][1][0][1][1] = x0[1][0][1][1] ^ x1[0][0][1][1]; + x[0][1][1][0][0] = x0[1][1][0][0] ^ x1[0][1][0][0]; + x[0][1][1][0][1] = x0[1][1][0][1] ^ x1[0][1][0][1]; + x[0][1][1][1][0] = x0[1][1][1][0] ^ x1[0][1][1][0]; + x[0][1][1][1][1] = x0[1][1][1][1] ^ x1[0][1][1][1]; /* "rotate x_0jklm upwards by 11 bits" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]); - - /* "swap x_0j0lm with x_0j1lm" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - SWAP(x[0][j][0][l][m], x[0][j][1][l][m]) - - /* "xor x_1jklm into x_0jklm" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) -#pragma unroll 2 - for (m = 0; m < 2; ++m) - x[0][j][k][l][m] ^= x[1][j][k][l][m]; - - /* "swap x_1jkl0 with x_1jkl1" */ -#pragma unroll 2 - for (j = 0; j < 2; ++j) -#pragma unroll 2 - for (k = 0; k < 2; ++k) -#pragma unroll 2 - for (l = 0; l < 2; ++l) - SWAP(x[1][j][k][l][0], x[1][j][k][l][1]) + x0[0][0][0][0] = ROTATEUPWARDS11(x[0][0][0][0][0]); + x0[0][0][0][1] = ROTATEUPWARDS11(x[0][0][0][0][1]); + x0[0][0][1][0] = ROTATEUPWARDS11(x[0][0][0][1][0]); + x0[0][0][1][1] = ROTATEUPWARDS11(x[0][0][0][1][1]); + x0[0][1][0][0] = ROTATEUPWARDS11(x[0][0][1][0][0]); + x0[0][1][0][1] = ROTATEUPWARDS11(x[0][0][1][0][1]); + x0[0][1][1][0] = ROTATEUPWARDS11(x[0][0][1][1][0]); + x0[0][1][1][1] = ROTATEUPWARDS11(x[0][0][1][1][1]); + x0[1][0][0][0] = ROTATEUPWARDS11(x[0][1][0][0][0]); + x0[1][0][0][1] = ROTATEUPWARDS11(x[0][1][0][0][1]); + x0[1][0][1][0] = 
ROTATEUPWARDS11(x[0][1][0][1][0]); + x0[1][0][1][1] = ROTATEUPWARDS11(x[0][1][0][1][1]); + x0[1][1][0][0] = ROTATEUPWARDS11(x[0][1][1][0][0]); + x0[1][1][0][1] = ROTATEUPWARDS11(x[0][1][1][0][1]); + x0[1][1][1][0] = ROTATEUPWARDS11(x[0][1][1][1][0]); + x0[1][1][1][1] = ROTATEUPWARDS11(x[0][1][1][1][1]); + + /* "add x_0jklm into x_1~jk~lm modulo 2^32" */ + x[1][1][0][1][0] = x1[1][0][1][0] + x[0][0][0][0][0]; + x[1][1][0][1][1] = x1[1][0][1][1] + x[0][0][0][0][1]; + x[1][1][0][0][0] = x1[1][0][0][0] + x[0][0][0][1][0]; + x[1][1][0][0][1] = x1[1][0][0][1] + x[0][0][0][1][1]; + x[1][1][1][1][0] = x1[1][1][1][0] + x[0][0][1][0][0]; + x[1][1][1][1][1] = x1[1][1][1][1] + x[0][0][1][0][1]; + x[1][1][1][0][0] = x1[1][1][0][0] + x[0][0][1][1][0]; + x[1][1][1][0][1] = x1[1][1][0][1] + x[0][0][1][1][1]; + x[1][0][0][1][0] = x1[0][0][1][0] + x[0][1][0][0][0]; + x[1][0][0][1][1] = x1[0][0][1][1] + x[0][1][0][0][1]; + x[1][0][0][0][0] = x1[0][0][0][0] + x[0][1][0][1][0]; + x[1][0][0][0][1] = x1[0][0][0][1] + x[0][1][0][1][1]; + x[1][0][1][1][0] = x1[0][1][1][0] + x[0][1][1][0][0]; + x[1][0][1][1][1] = x1[0][1][1][1] + x[0][1][1][0][1]; + x[1][0][1][0][0] = x1[0][1][0][0] + x[0][1][1][1][0]; + x[1][0][1][0][1] = x1[0][1][0][1] + x[0][1][1][1][1]; + + /* "xor x_1~j~k~lm into x_0jklm" */ + x[0][0][0][0][0] = x0[0][0][0][0] ^ x[1][1][1][1][0]; + x[0][0][0][0][1] = x0[0][0][0][1] ^ x[1][1][1][1][1]; + x[0][0][0][1][0] = x0[0][0][1][0] ^ x[1][1][1][0][0]; + x[0][0][0][1][1] = x0[0][0][1][1] ^ x[1][1][1][0][1]; + x[0][0][1][0][0] = x0[0][1][0][0] ^ x[1][1][0][1][0]; + x[0][0][1][0][1] = x0[0][1][0][1] ^ x[1][1][0][1][1]; + x[0][0][1][1][0] = x0[0][1][1][0] ^ x[1][1][0][0][0]; + x[0][0][1][1][1] = x0[0][1][1][1] ^ x[1][1][0][0][1]; + x[0][1][0][0][0] = x0[1][0][0][0] ^ x[1][0][1][1][0]; + x[0][1][0][0][1] = x0[1][0][0][1] ^ x[1][0][1][1][1]; + x[0][1][0][1][0] = x0[1][0][1][0] ^ x[1][0][1][0][0]; + x[0][1][0][1][1] = x0[1][0][1][1] ^ x[1][0][1][0][1]; + x[0][1][1][0][0] = x0[1][1][0][0] ^ x[1][0][0][1][0]; + x[0][1][1][0][1] = x0[1][1][0][1] ^ x[1][0][0][1][1]; + x[0][1][1][1][0] = x0[1][1][1][0] ^ x[1][0][0][0][0]; + x[0][1][1][1][1] = x0[1][1][1][1] ^ x[1][0][0][0][1]; - } -} - -__device__ __forceinline__ void block_tox(const uint32_t *in, uint32_t x[2][2][2][2][2]) -{ - x[0][0][0][0][0] ^= in[0]; - x[0][0][0][0][1] ^= in[1]; - x[0][0][0][1][0] ^= in[2]; - x[0][0][0][1][1] ^= in[3]; - x[0][0][1][0][0] ^= in[4]; - x[0][0][1][0][1] ^= in[5]; - x[0][0][1][1][0] ^= in[6]; - x[0][0][1][1][1] ^= in[7]; -} - -__device__ __forceinline__ void hash_fromx(uint32_t *out, uint32_t x[2][2][2][2][2]) -{ - out[0] = x[0][0][0][0][0]; - out[1] = x[0][0][0][0][1]; - out[2] = x[0][0][0][1][0]; - out[3] = x[0][0][0][1][1]; - out[4] = x[0][0][1][0][0]; - out[5] = x[0][0][1][0][1]; - out[6] = x[0][0][1][1][0]; - out[7] = x[0][0][1][1][1]; - -} - -__device__ __forceinline__ -void Update32(uint32_t x[2][2][2][2][2], const uint32_t *data) -{ - /* "xor the block into the first b bytes of the state" */ - /* "and then transform the state invertibly through r identical rounds" */ - block_tox(data, x); - rrounds(x); -} + /* "rotate x_0jklm upwards by 7 bits" */ + x0[0][0][0][0] = ROTATEUPWARDS7(x[0][0][0][0][0]); + x0[0][0][0][1] = ROTATEUPWARDS7(x[0][0][0][0][1]); + x0[0][0][1][0] = ROTATEUPWARDS7(x[0][0][0][1][0]); + x0[0][0][1][1] = ROTATEUPWARDS7(x[0][0][0][1][1]); + x0[0][1][0][0] = ROTATEUPWARDS7(x[0][0][1][0][0]); + x0[0][1][0][1] = ROTATEUPWARDS7(x[0][0][1][0][1]); + x0[0][1][1][0] = ROTATEUPWARDS7(x[0][0][1][1][0]); + 
x0[0][1][1][1] = ROTATEUPWARDS7(x[0][0][1][1][1]); + x0[1][0][0][0] = ROTATEUPWARDS7(x[0][1][0][0][0]); + x0[1][0][0][1] = ROTATEUPWARDS7(x[0][1][0][0][1]); + x0[1][0][1][0] = ROTATEUPWARDS7(x[0][1][0][1][0]); + x0[1][0][1][1] = ROTATEUPWARDS7(x[0][1][0][1][1]); + x0[1][1][0][0] = ROTATEUPWARDS7(x[0][1][1][0][0]); + x0[1][1][0][1] = ROTATEUPWARDS7(x[0][1][1][0][1]); + x0[1][1][1][0] = ROTATEUPWARDS7(x[0][1][1][1][0]); + x0[1][1][1][1] = ROTATEUPWARDS7(x[0][1][1][1][1]); + + /* "add x_0jklm into x_1~j~k~l~m modulo 2^32" */ + x1[1][1][1][1] = x[1][1][1][1][1] + x[0][0][0][0][0]; + x1[1][1][1][0] = x[1][1][1][1][0] + x[0][0][0][0][1]; + x1[1][1][0][1] = x[1][1][1][0][1] + x[0][0][0][1][0]; + x1[1][1][0][0] = x[1][1][1][0][0] + x[0][0][0][1][1]; + x1[1][0][1][1] = x[1][1][0][1][1] + x[0][0][1][0][0]; + x1[1][0][1][0] = x[1][1][0][1][0] + x[0][0][1][0][1]; + x1[1][0][0][1] = x[1][1][0][0][1] + x[0][0][1][1][0]; + x1[1][0][0][0] = x[1][1][0][0][0] + x[0][0][1][1][1]; + x1[0][1][1][1] = x[1][0][1][1][1] + x[0][1][0][0][0]; + x1[0][1][1][0] = x[1][0][1][1][0] + x[0][1][0][0][1]; + x1[0][1][0][1] = x[1][0][1][0][1] + x[0][1][0][1][0]; + x1[0][1][0][0] = x[1][0][1][0][0] + x[0][1][0][1][1]; + x1[0][0][1][1] = x[1][0][0][1][1] + x[0][1][1][0][0]; + x1[0][0][1][0] = x[1][0][0][1][0] + x[0][1][1][0][1]; + x1[0][0][0][1] = x[1][0][0][0][1] + x[0][1][1][1][0]; + x1[0][0][0][0] = x[1][0][0][0][0] + x[0][1][1][1][1]; + + /* "xor x_1j~k~l~m into x_0jklm" */ + x[0][0][0][0][0] = x0[0][0][0][0] ^ x1[0][1][1][1]; + x[0][0][0][0][1] = x0[0][0][0][1] ^ x1[0][1][1][0]; + x[0][0][0][1][0] = x0[0][0][1][0] ^ x1[0][1][0][1]; + x[0][0][0][1][1] = x0[0][0][1][1] ^ x1[0][1][0][0]; + x[0][0][1][0][0] = x0[0][1][0][0] ^ x1[0][0][1][1]; + x[0][0][1][0][1] = x0[0][1][0][1] ^ x1[0][0][1][0]; + x[0][0][1][1][0] = x0[0][1][1][0] ^ x1[0][0][0][1]; + x[0][0][1][1][1] = x0[0][1][1][1] ^ x1[0][0][0][0]; + x[0][1][0][0][0] = x0[1][0][0][0] ^ x1[1][1][1][1]; + x[0][1][0][0][1] = x0[1][0][0][1] ^ x1[1][1][1][0]; + x[0][1][0][1][0] = x0[1][0][1][0] ^ x1[1][1][0][1]; + x[0][1][0][1][1] = x0[1][0][1][1] ^ x1[1][1][0][0]; + x[0][1][1][0][0] = x0[1][1][0][0] ^ x1[1][0][1][1]; + x[0][1][1][0][1] = x0[1][1][0][1] ^ x1[1][0][1][0]; + x[0][1][1][1][0] = x0[1][1][1][0] ^ x1[1][0][0][1]; + x[0][1][1][1][1] = x0[1][1][1][1] ^ x1[1][0][0][0]; -__device__ __forceinline__ -void Update32_const(uint32_t x[2][2][2][2][2]) -{ - x[0][0][0][0][0] ^= 0x80; - rrounds(x); + /* "rotate x_0jklm upwards by 11 bits" */ + x0[0][0][0][0] = ROTATEUPWARDS11(x[0][0][0][0][0]); + x0[0][0][0][1] = ROTATEUPWARDS11(x[0][0][0][0][1]); + x0[0][0][1][0] = ROTATEUPWARDS11(x[0][0][0][1][0]); + x0[0][0][1][1] = ROTATEUPWARDS11(x[0][0][0][1][1]); + x0[0][1][0][0] = ROTATEUPWARDS11(x[0][0][1][0][0]); + x0[0][1][0][1] = ROTATEUPWARDS11(x[0][0][1][0][1]); + x0[0][1][1][0] = ROTATEUPWARDS11(x[0][0][1][1][0]); + x0[0][1][1][1] = ROTATEUPWARDS11(x[0][0][1][1][1]); + x0[1][0][0][0] = ROTATEUPWARDS11(x[0][1][0][0][0]); + x0[1][0][0][1] = ROTATEUPWARDS11(x[0][1][0][0][1]); + x0[1][0][1][0] = ROTATEUPWARDS11(x[0][1][0][1][0]); + x0[1][0][1][1] = ROTATEUPWARDS11(x[0][1][0][1][1]); + x0[1][1][0][0] = ROTATEUPWARDS11(x[0][1][1][0][0]); + x0[1][1][0][1] = ROTATEUPWARDS11(x[0][1][1][0][1]); + x0[1][1][1][0] = ROTATEUPWARDS11(x[0][1][1][1][0]); + x0[1][1][1][1] = ROTATEUPWARDS11(x[0][1][1][1][1]); + + /* "add x_0jklm into x_1j~kl~m modulo 2^32" */ + x[1][0][1][0][1] = x1[0][1][0][1] + x[0][0][0][0][0]; + x[1][0][1][0][0] = x1[0][1][0][0] + x[0][0][0][0][1]; + x[1][0][1][1][1] = 
x1[0][1][1][1] + x[0][0][0][1][0]; + x[1][0][1][1][0] = x1[0][1][1][0] + x[0][0][0][1][1]; + x[1][0][0][0][1] = x1[0][0][0][1] + x[0][0][1][0][0]; + x[1][0][0][0][0] = x1[0][0][0][0] + x[0][0][1][0][1]; + x[1][0][0][1][1] = x1[0][0][1][1] + x[0][0][1][1][0]; + x[1][0][0][1][0] = x1[0][0][1][0] + x[0][0][1][1][1]; + x[1][1][1][0][1] = x1[1][1][0][1] + x[0][1][0][0][0]; + x[1][1][1][0][0] = x1[1][1][0][0] + x[0][1][0][0][1]; + x[1][1][1][1][1] = x1[1][1][1][1] + x[0][1][0][1][0]; + x[1][1][1][1][0] = x1[1][1][1][0] + x[0][1][0][1][1]; + x[1][1][0][0][1] = x1[1][0][0][1] + x[0][1][1][0][0]; + x[1][1][0][0][0] = x1[1][0][0][0] + x[0][1][1][0][1]; + x[1][1][0][1][1] = x1[1][0][1][1] + x[0][1][1][1][0]; + x[1][1][0][1][0] = x1[1][0][1][0] + x[0][1][1][1][1]; + + /* "xor x_1jkl~m into x_0jklm" */ + x[0][0][0][0][0] = x0[0][0][0][0] ^ x[1][0][0][0][1]; + x[0][0][0][0][1] = x0[0][0][0][1] ^ x[1][0][0][0][0]; + x[0][0][0][1][0] = x0[0][0][1][0] ^ x[1][0][0][1][1]; + x[0][0][0][1][1] = x0[0][0][1][1] ^ x[1][0][0][1][0]; + x[0][0][1][0][0] = x0[0][1][0][0] ^ x[1][0][1][0][1]; + x[0][0][1][0][1] = x0[0][1][0][1] ^ x[1][0][1][0][0]; + x[0][0][1][1][0] = x0[0][1][1][0] ^ x[1][0][1][1][1]; + x[0][0][1][1][1] = x0[0][1][1][1] ^ x[1][0][1][1][0]; + x[0][1][0][0][0] = x0[1][0][0][0] ^ x[1][1][0][0][1]; + x[0][1][0][0][1] = x0[1][0][0][1] ^ x[1][1][0][0][0]; + x[0][1][0][1][0] = x0[1][0][1][0] ^ x[1][1][0][1][1]; + x[0][1][0][1][1] = x0[1][0][1][1] ^ x[1][1][0][1][0]; + x[0][1][1][0][0] = x0[1][1][0][0] ^ x[1][1][1][0][1]; + x[0][1][1][0][1] = x0[1][1][0][1] ^ x[1][1][1][0][0]; + x[0][1][1][1][0] = x0[1][1][1][0] ^ x[1][1][1][1][1]; + x[0][1][1][1][1] = x0[1][1][1][1] ^ x[1][1][1][1][0]; + } } __device__ __forceinline__ @@ -185,27 +253,44 @@ void Final(uint32_t x[2][2][2][2][2], uint32_t *hashval) x[1][1][1][1][1] ^= 1U; /* "the state is then transformed invertibly through 10r identical rounds" */ - #pragma unroll 2 for (int i = 0; i < 10; ++i) rrounds(x); /* "output the first h/8 bytes of the state" */ - hash_fromx(hashval, x); + hashval[0] = x[0][0][0][0][0]; + hashval[1] = x[0][0][0][0][1]; + hashval[2] = x[0][0][0][1][0]; + hashval[3] = x[0][0][0][1][1]; + hashval[4] = x[0][0][1][0][0]; + hashval[5] = x[0][0][1][0][1]; + hashval[6] = x[0][0][1][1][0]; + hashval[7] = x[0][0][1][1][1]; } #if __CUDA_ARCH__ >= 500 - -__global__ __launch_bounds__(TPB, 1) +__global__ __launch_bounds__(TPB50, 1) +#else +__global__ __launch_bounds__(TPB35, 1) +#endif void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash) { uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { +#if __CUDA_ARCH__ >= 500 uint2 Hash[4]; Hash[0] = __ldg(&g_hash[thread]); Hash[1] = __ldg(&g_hash[thread + 1 * threads]); Hash[2] = __ldg(&g_hash[thread + 2 * threads]); Hash[3] = __ldg(&g_hash[thread + 3 * threads]); +#else + uint32_t Hash[8]; + + LOHI(Hash[0], Hash[1], __ldg(&((uint64_t*)g_hash)[thread])); + LOHI(Hash[2], Hash[3], __ldg(&((uint64_t*)g_hash)[thread + 1 * threads])); + LOHI(Hash[4], Hash[5], __ldg(&((uint64_t*)g_hash)[thread + 2 * threads])); + LOHI(Hash[6], Hash[7], __ldg(&((uint64_t*)g_hash)[thread + 3 * threads])); +#endif uint32_t x[2][2][2][2][2] = { @@ -219,6 +304,7 @@ void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_ha 0x15815AEB, 0x4AB6AAD6, 0x9CDAF8AF, 0xD6032C0A }; +#if __CUDA_ARCH__ >= 500 x[0][0][0][0][0] ^= Hash[0].x; x[0][0][0][0][1] ^= Hash[0].y; x[0][0][0][1][0] ^= Hash[1].x; @@ -227,48 +313,7 @@ void cubehash256_gpu_hash_32(uint32_t 
threads, uint32_t startNounce, uint2 *g_ha x[0][0][1][0][1] ^= Hash[2].y; x[0][0][1][1][0] ^= Hash[3].x; x[0][0][1][1][1] ^= Hash[3].y; - - rrounds(x); - x[0][0][0][0][0] ^= 0x80U; - rrounds(x); - - Final(x, (uint32_t*) Hash); - - g_hash[thread] = Hash[0]; - g_hash[1 * threads + thread] = Hash[1]; - g_hash[2 * threads + thread] = Hash[2]; - g_hash[3 * threads + thread] = Hash[3]; - } -} - #else - -__global__ __launch_bounds__(TPB, 1) -void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *d_hash) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t Hash[8]; - uint64_t* g_hash = (uint64_t*) d_hash; - - LOHI(Hash[0], Hash[1], __ldg(&g_hash[thread])); - LOHI(Hash[2], Hash[3], __ldg(&g_hash[thread + 1 * threads])); - LOHI(Hash[4], Hash[5], __ldg(&g_hash[thread + 2 * threads])); - LOHI(Hash[6], Hash[7], __ldg(&g_hash[thread + 3 * threads])); - - uint32_t x[2][2][2][2][2] = - { - 0xEA2BD4B4, 0xCCD6F29F, 0x63117E71, 0x35481EAE, - 0x22512D5B, 0xE5D94E63, 0x7E624131, 0xF4CC12BE, - 0xC2D0B696, 0x42AF2070, 0xD0720C35, 0x3361DA8C, - 0x28CCECA4, 0x8EF8AD83, 0x4680AC00, 0x40E5FBAB, - 0xD89041C3, 0x6107FBD5, 0x6C859D41, 0xF0B26679, - 0x09392549, 0x5FA25603, 0x65C892FD, 0x93CB6285, - 0x2AF2B5AE, 0x9E4B4E60, 0x774ABFDD, 0x85254725, - 0x15815AEB, 0x4AB6AAD6, 0x9CDAF8AF, 0xD6032C0A - }; - x[0][0][0][0][0] ^= Hash[0]; x[0][0][0][0][1] ^= Hash[1]; x[0][0][0][1][0] ^= Hash[2]; @@ -277,29 +322,48 @@ void cubehash256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *d_ha x[0][0][1][0][1] ^= Hash[5]; x[0][0][1][1][0] ^= Hash[6]; x[0][0][1][1][1] ^= Hash[7]; - +#endif rrounds(x); x[0][0][0][0][0] ^= 0x80U; rrounds(x); +#if __CUDA_ARCH__ >= 500 + Final(x, (uint32_t*)Hash); + + g_hash[thread] = Hash[0]; + g_hash[1 * threads + thread] = Hash[1]; + g_hash[2 * threads + thread] = Hash[2]; + g_hash[3 * threads + thread] = Hash[3]; +#else Final(x, Hash); - g_hash[thread] = ((uint64_t*)Hash)[0]; - g_hash[1 * threads + thread] = ((uint64_t*)Hash)[1]; - g_hash[2 * threads + thread] = ((uint64_t*)Hash)[2]; - g_hash[3 * threads + thread] = ((uint64_t*)Hash)[3]; + ((uint64_t*)g_hash)[thread] = ((uint64_t*)Hash)[0]; + ((uint64_t*)g_hash)[1 * threads + thread] = ((uint64_t*)Hash)[1]; + ((uint64_t*)g_hash)[2 * threads + thread] = ((uint64_t*)Hash)[2]; + ((uint64_t*)g_hash)[3 * threads + thread] = ((uint64_t*)Hash)[3]; +#endif } } -#endif - __host__ void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order) { - uint32_t tpb = TPB; + uint32_t tpb = TPB35; + if (cuda_arch[thr_id] >= 500) tpb = TPB50; + + dim3 grid((threads + tpb - 1) / tpb); + dim3 block(tpb); + + cubehash256_gpu_hash_32 << > > (threads, startNounce, (uint2*)d_hash); +} +__host__ +void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order, cudaStream_t stream) +{ + uint32_t tpb = TPB35; + if (cuda_arch[thr_id] >= 500) tpb = TPB50; - dim3 grid((threads + tpb-1)/tpb); + dim3 grid((threads + tpb - 1) / tpb); dim3 block(tpb); - cubehash256_gpu_hash_32 <<>> (threads, startNounce, (uint2*) d_hash); + cubehash256_gpu_hash_32 << > > (threads, startNounce, (uint2*)d_hash); } diff --git a/Algo256/cuda_skein256.cu b/Algo256/cuda_skein256.cu index cbeb660..44b3dad 100644 --- a/Algo256/cuda_skein256.cu +++ b/Algo256/cuda_skein256.cu @@ -13,40 +13,296 @@ void Round512v35(uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p } __forceinline__ __device__ -void Round_8_512v35(const uint2 *const 
__restrict__ ks, const uint2 *const __restrict__ ts, - uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7, int R) +void Round_8_512v35_1(const uint2 ks[9], const uint2 ts[3], +uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7) { Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); - Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); - p0 += ks[(R+0) % 9]; - p1 += ks[(R+1) % 9]; - p2 += ks[(R+2) % 9]; - p3 += ks[(R+3) % 9]; - p4 += ks[(R+4) % 9]; - p5 += ks[(R+5) % 9] + ts[(R+0) % 3]; - p6 += ks[(R+6) % 9] + ts[(R+1) % 3]; - p7 += ks[(R+7) % 9] + make_uint2(R, 0); + p0 += ks[1]; + p1 += ks[2]; + p2 += ks[3]; + p3 += ks[4]; + p4 += ks[5]; + p5 += ks[6] + ts[1]; + p6 += ks[7] + ts[2]; + p7 += ks[8] + make_uint2(1, 0); Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); - Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); + + p0 += ks[2]; + p1 += ks[3]; + p2 += ks[4]; + p3 += ks[5]; + p4 += ks[6]; + p5 += ks[7] + ts[2]; + p6 += ks[8] + ts[0]; + p7 += ks[0] + make_uint2(2, 0); +} +__forceinline__ __device__ +void Round_8_512v35_3(const uint2 ks[9], const uint2 ts[3], +uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7) +{ + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); - p0 += ks[(R+1) % 9]; - p1 += ks[(R+2) % 9]; - p2 += ks[(R+3) % 9]; - p3 += ks[(R+4) % 9]; - p4 += ks[(R+5) % 9]; - p5 += ks[(R+6) % 9] + ts[(R+1) % 3]; - p6 += ks[(R+7) % 9] + ts[(R+2) % 3]; - p7 += ks[(R+8) % 9] + make_uint2(R+1, 0); + p0 += ks[3]; + p1 += ks[4]; + p2 += ks[5]; + p3 += ks[6]; + p4 += ks[7]; + p5 += ks[8] + ts[0]; + p6 += ks[0] + ts[1]; + p7 += ks[1] + make_uint2(3, 0); + + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); + + p0 += ks[4]; + p1 += ks[5]; + p2 += ks[6]; + p3 += ks[7]; + p4 += ks[8]; + p5 += ks[0] + ts[1]; + p6 += ks[1] + ts[2]; + p7 += ks[2] + make_uint2(4, 0); +} +__forceinline__ __device__ +void Round_8_512v35_5(const uint2 ks[9], const uint2 ts[3], +uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7) +{ + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); + + p0 += ks[5]; + p1 += ks[6]; + p2 += ks[7]; + p3 += ks[8]; + p4 += ks[0]; + p5 += ks[1] + ts[2]; + p6 += ks[2] + ts[0]; + p7 += ks[3] + make_uint2(5, 0); + + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); + + p0 += 
ks[6]; + p1 += ks[7]; + p2 += ks[8]; + p3 += ks[0]; + p4 += ks[1]; + p5 += ks[2] + ts[0]; + p6 += ks[3] + ts[1]; + p7 += ks[4] + make_uint2(6, 0); +} +__forceinline__ __device__ +void Round_8_512v35_7(const uint2 ks[9], const uint2 ts[3], +uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7) +{ + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); + + p0 += ks[7]; + p1 += ks[8]; + p2 += ks[0]; + p3 += ks[1]; + p4 += ks[2]; + p5 += ks[3] + ts[1]; + p6 += ks[4] + ts[2]; + p7 += ks[5] + make_uint2(7, 0); + + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); + + p0 += ks[8]; + p1 += ks[0]; + p2 += ks[1]; + p3 += ks[2]; + p4 += ks[3]; + p5 += ks[4] + ts[2]; + p6 += ks[5] + ts[0]; + p7 += ks[6] + make_uint2(8, 0); } +__forceinline__ __device__ +void Round_8_512v35_9(const uint2 ks[9], const uint2 ts[3], +uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7) +{ + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); + + p0 += ks[0]; + p1 += ks[1]; + p2 += ks[2]; + p3 += ks[3]; + p4 += ks[4]; + p5 += ks[5] + ts[0]; + p6 += ks[6] + ts[1]; + p7 += ks[7] + make_uint2(9, 0); + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); + + p0 += ks[1]; + p1 += ks[2]; + p2 += ks[3]; + p3 += ks[4]; + p4 += ks[5]; + p5 += ks[6] + ts[1]; + p6 += ks[7] + ts[2]; + p7 += ks[8] + make_uint2(10, 0); +} __forceinline__ __device__ -void Round_8_512v35_final(const uint2 *const __restrict__ ks, const uint2 *const __restrict__ ts, +void Round_8_512v35_11(const uint2 ks[9], const uint2 ts[3], +uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7) +{ + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); + + p0 += ks[2]; + p1 += ks[3]; + p2 += ks[4]; + p3 += ks[5]; + p4 += ks[6]; + p5 += ks[7] + ts[2]; + p6 += ks[8] + ts[0]; + p7 += ks[0] + make_uint2(11, 0); + + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); + + p0 += ks[3]; + p1 += ks[4]; + p2 += ks[5]; + p3 += ks[6]; + p4 += ks[7]; + p5 += ks[8] + ts[0]; + p6 += ks[0] + ts[1]; + p7 += ks[1] + make_uint2(12, 0); +} +__forceinline__ __device__ +void Round_8_512v35_13(const uint2 ks[9], const uint2 ts[3], +uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7) +{ + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 
14, 42); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); + + p0 += ks[4]; + p1 += ks[5]; + p2 += ks[6]; + p3 += ks[7]; + p4 += ks[8]; + p5 += ks[0] + ts[1]; + p6 += ks[1] + ts[2]; + p7 += ks[2] + make_uint2(13, 0); + + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); + + p0 += ks[5]; + p1 += ks[6]; + p2 += ks[7]; + p3 += ks[8]; + p4 += ks[0]; + p5 += ks[1] + ts[2]; + p6 += ks[2] + ts[0]; + p7 += ks[3] + make_uint2(14, 0); +} +__forceinline__ __device__ +void Round_8_512v35_15(const uint2 ks[9], const uint2 ts[3], +uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7) +{ + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); + + p0 += ks[6]; + p1 += ks[7]; + p2 += ks[8]; + p3 += ks[0]; + p4 += ks[1]; + p5 += ks[2] + ts[0]; + p6 += ks[3] + ts[1]; + p7 += ks[4] + make_uint2(15, 0); + + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); + + p0 += ks[7]; + p1 += ks[8]; + p2 += ks[0]; + p3 += ks[1]; + p4 += ks[2]; + p5 += ks[3] + ts[1]; + p6 += ks[4] + ts[2]; + p7 += ks[5] + make_uint2(16, 0); +} +__forceinline__ __device__ +void Round_8_512v35_17(const uint2 ks[9], const uint2 ts[3], +uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7) +{ + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 33, 27, 14, 42); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 17, 49, 36, 39); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 44, 9, 54, 56); + + p0 += ks[8]; + p1 += ks[0]; + p2 += ks[1]; + p3 += ks[2]; + p4 += ks[3]; + p5 += ks[4] + ts[2]; + p6 += ks[5] + ts[0]; + p7 += ks[6] + make_uint2(17, 0); + + Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 39, 30, 34, 24); + Round512v35(p2, p1, p4, p7, p6, p5, p0, p3, 13, 50, 10, 17); + Round512v35(p4, p1, p6, p3, p0, p5, p2, p7, 25, 29, 39, 43); + Round512v35(p6, p1, p0, p7, p2, p5, p4, p3, 8, 35, 56, 22); + + p0 += ks[0]; + p1 += ks[1]; + p2 += ks[2]; + p3 += ks[3]; + p4 += ks[4]; + p5 += ks[5] + ts[0]; + p6 += ks[6] + ts[1]; + p7 += ks[7] + make_uint2(18, 0); +} + +__forceinline__ __device__ +void Round_8_512v35_final(const uint2 ks[9], const uint2 ts[3], uint2 &p0, uint2 &p1, uint2 &p2, uint2 &p3, uint2 &p4, uint2 &p5, uint2 &p6, uint2 &p7) { Round512v35(p0, p1, p2, p3, p4, p5, p6, p7, 46, 36, 19, 37); @@ -74,96 +330,88 @@ void Round_8_512v35_final(const uint2 *const __restrict__ ks, const uint2 *const p3 += ks[3]; } -__global__ __launch_bounds__(256,3) -void skein256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint64_t *outputHash) +__global__ __launch_bounds__(256, 4) +void skein256_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *outputHash) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint2 skein_ks_parity = { 0xA9FC1A22, 0x1BD11BDA }; + + const uint2 h2[9] = { + { 0x2FDB3E13, 0xCCD044A1 }, + { 0x1A79A9EB, 0xE8359030 }, + { 
0x4F816E6F, 0x55AEA061 }, + { 0xAE9B94DB, 0x2A2767A4 }, + { 0x74DD7683, 0xEC06025E }, + { 0xC4746251, 0xE7A436CD }, + { 0x393AD185, 0xC36FBAF9 }, + { 0x33EDFC13, 0x3EEDBA18 }, + { 0xC73A4E2A, 0xB69D3CFC } + }; + const uint2 t12[2][3] = { + { { 0x20, 0 }, + { 0, 0xf0000000 }, + { 0x20, 0xf0000000 } }, + { { 0x08, 0 }, + { 0, 0xff000000 }, + { 0x08, 0xff000000 } } + }; if (thread < threads) { - const uint2 skein_ks_parity = { 0xA9FC1A22, 0x1BD11BDA }; - const uint2 t12[6] = { - { 0x20, 0 }, - { 0, 0xf0000000 }, - { 0x20, 0xf0000000 }, - { 0x08, 0 }, - { 0, 0xff000000 }, - { 0x08, 0xff000000 } - }; - uint2 h[9] = { - { 0x2FDB3E13, 0xCCD044A1 }, - { 0x1A79A9EB, 0xE8359030 }, - { 0x4F816E6F, 0x55AEA061 }, - { 0xAE9B94DB, 0x2A2767A4 }, - { 0x74DD7683, 0xEC06025E }, - { 0xC4746251, 0xE7A436CD }, - { 0x393AD185, 0xC36FBAF9 }, - { 0x33EDFC13, 0x3EEDBA18 }, - { 0xC73A4E2A, 0xB69D3CFC } - }; uint2 dt0,dt1,dt2,dt3; uint2 p0, p1, p2, p3, p4, p5, p6, p7; - LOHI(dt0.x,dt0.y,outputHash[thread]); - LOHI(dt1.x,dt1.y,outputHash[threads+thread]); - LOHI(dt2.x,dt2.y,outputHash[2*threads+thread]); - LOHI(dt3.x,dt3.y,outputHash[3*threads+thread]); + dt0 = __ldg(&outputHash[0 * threads + thread]); + dt1 = __ldg(&outputHash[1 * threads + thread]); + dt2 = __ldg(&outputHash[2 * threads + thread]); + dt3 = __ldg(&outputHash[3 * threads + thread]); - p0 = h[0] + dt0; - p1 = h[1] + dt1; - p2 = h[2] + dt2; - p3 = h[3] + dt3; - p4 = h[4]; - p5 = h[5] + t12[0]; - p6 = h[6] + t12[1]; - p7 = h[7]; + p0 = h2[0] + dt0; + p1 = h2[1] + dt1; + p2 = h2[2] + dt2; + p3 = h2[3] + dt3; + p4 = h2[4]; + p5 = h2[5] + t12[0][0]; + p6 = h2[6] + t12[0][1]; + p7 = h2[7]; // forced unroll required - Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 1); - Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 3); - Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 5); - Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 7); - Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 9); - Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 11); - Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 13); - Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 15); - Round_8_512v35(h, t12, p0, p1, p2, p3, p4, p5, p6, p7, 17); + Round_8_512v35_1(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_3(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_5(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_7(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_9(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_11(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_13(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_15(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_17(h2, t12[0], p0, p1, p2, p3, p4, p5, p6, p7); p0 ^= dt0; p1 ^= dt1; p2 ^= dt2; p3 ^= dt3; - h[0] = p0; - h[1] = p1; - h[2] = p2; - h[3] = p3; - h[4] = p4; - h[5] = p5; - h[6] = p6; - h[7] = p7; - h[8] = skein_ks_parity ^ h[0] ^ h[1] ^ h[2] ^ h[3] ^ h[4] ^ h[5] ^ h[6] ^ h[7]; + const uint2 h[9] = { p0, p1, p2, p3, p4, p5, p6, p7, skein_ks_parity ^ h[0] ^ h[1] ^ h[2] ^ h[3] ^ h[4] ^ h[5] ^ h[6] ^ h[7] }; - const uint2 *t = t12+3; - p5 += t12[3]; //p5 already equal h[5] - p6 += t12[4]; + p5 += t12[1][0]; //p5 already equal h[5] + p6 += t12[1][1]; // forced unroll - Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 1); - Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 3); - Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 5); - Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, 
p6, p7, 7); - Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 9); - Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 11); - Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 13); - Round_8_512v35(h, t, p0, p1, p2, p3, p4, p5, p6, p7, 15); - Round_8_512v35_final(h, t, p0, p1, p2, p3, p4, p5, p6, p7); - - outputHash[thread] = devectorize(p0); - outputHash[threads+thread] = devectorize(p1); - outputHash[2*threads+thread] = devectorize(p2); - outputHash[3*threads+thread] = devectorize(p3); + Round_8_512v35_1(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_3(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_5(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_7(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_9(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_11(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_13(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_15(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7); + Round_8_512v35_final(h, t12[1], p0, p1, p2, p3, p4, p5, p6, p7); + + outputHash[0 * threads + thread] = p0; + outputHash[1 * threads + thread] = p1; + outputHash[2 * threads + thread] = p2; + outputHash[3 * threads + thread] = p3; } } @@ -304,10 +552,27 @@ void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, ui // only 1kH/s perf change between kernels on a 960... if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300) - skein256_gpu_hash_32<<>>(threads, startNounce, d_outputHash); + skein256_gpu_hash_32 << > >(threads, startNounce, (uint2*)d_outputHash); else - skein256_gpu_hash_32_v30<<>>(threads, startNounce, d_outputHash); + skein256_gpu_hash_32_v30 << > >(threads, startNounce, d_outputHash); - MyStreamSynchronize(NULL, order, thr_id); + //MyStreamSynchronize(NULL, order, thr_id); } +__host__ +void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order, cudaStream_t stream) +{ + const uint32_t threadsperblock = 256; + int dev_id = device_map[thr_id]; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + // only 1kH/s perf change between kernels on a 960... 
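	// A minimal illustration of the dispatch this stream-aware wrapper is expected
	// to perform, assuming the standard CUDA triple-chevron launch syntax; passing
	// the stream argument and zero bytes of dynamic shared memory is an assumption:
	//
	//   if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300)
	//       skein256_gpu_hash_32 <<< grid, block, 0, stream >>> (threads, startNounce, (uint2*)d_outputHash);
	//   else
	//       skein256_gpu_hash_32_v30 <<< grid, block, 0, stream >>> (threads, startNounce, d_outputHash);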
+ if (device_sm[dev_id] > 300 && cuda_arch[dev_id] > 300) + skein256_gpu_hash_32 << > >(threads, startNounce, (uint2*)d_outputHash); + else + skein256_gpu_hash_32_v30 << > >(threads, startNounce, d_outputHash); + + //MyStreamSynchronize(NULL, order, thr_id); +} diff --git a/ccminer.cpp b/ccminer.cpp index ec35a77..9123c64 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -83,6 +83,7 @@ bool opt_debug_threads = false; bool opt_protocol = false; bool opt_benchmark = false; bool opt_showdiff = false; +bool opt_eco_mode = false; // todo: limit use of these flags, // prefer the pools[] attributes @@ -91,6 +92,7 @@ bool have_longpoll = false; bool want_stratum = true; bool have_stratum = false; bool allow_gbt = true; +bool allow_getwork = true; bool allow_mininginfo = true; bool check_dups = false; bool check_stratum_jobs = false; @@ -165,6 +167,8 @@ char *short_url = NULL; struct stratum_ctx stratum = { 0 }; pthread_mutex_t stratum_sock_lock; pthread_mutex_t stratum_work_lock; +static unsigned char pk_script[25] = { 0 }; +static size_t pk_script_size = 0; char *opt_cert; char *opt_proxy; @@ -185,6 +189,7 @@ pthread_mutex_t stats_lock; double thr_hashrates[MAX_GPUS] = { 0 }; uint64_t global_hashrate = 0; double stratum_diff = 0.0; +static char *lp_id; double net_diff = 0; uint64_t net_hashrate = 0; uint64_t net_blocks = 0; @@ -226,8 +231,8 @@ Options:\n\ jackpot Jackpot\n\ keccak Keccak-256 (Maxcoin)\n\ luffa Joincoin\n\ - lyra2 LyraBar\n\ - lyra2v2 VertCoin\n\ + lyra2 Lyra2RE(Crypto)\n\ + lyra2v2 Lyra2REv2(VertCoin)\n\ mjollnir Mjollnircoin\n\ myr-gr Myriad-Groestl\n\ neoscrypt FeatherCoin, Phoenix, UFO...\n\ @@ -256,6 +261,8 @@ Options:\n\ (matching 2nd gt640 in the PC)\n\ -i --intensity=N[,N] GPU intensity 8.0-25.0 (default: auto) \n\ Decimals are allowed for fine tuning \n\ + --eco Use Eco mode\n\ + Auto tuning for low energy (Lyra2REv2 only)\n\ --cuda-schedule Set device threads scheduling mode (default: auto)\n\ -f, --diff-factor Divide difficulty by this factor (default 1.0) \n\ -m, --diff-multiplier Multiply difficulty by this value (default 1.0) \n\ @@ -278,6 +285,8 @@ Options:\n\ long polling is unavailable, in seconds (default: 10)\n\ -n, --ndevs list cuda devices\n\ -N, --statsavg number of samples used to compute hashrate (default: 30)\n\ + --coinbase-addr=ADDR payout address for solo mining\n\ + --no-getwork disable getwork support\n\ --no-gbt disable getblocktemplate support (height check in solo)\n\ --no-longpoll disable X-Long-Polling support\n\ --no-stratum disable X-Stratum support\n\ @@ -329,6 +338,7 @@ struct option options[] = { { "background", 0, NULL, 'B' }, { "benchmark", 0, NULL, 1005 }, { "cert", 1, NULL, 1001 }, + { "coinbase-addr", 1, NULL, 1016 }, { "config", 1, NULL, 'c' }, { "cputest", 0, NULL, 1006 }, { "cpu-affinity", 1, NULL, 1020 }, @@ -341,6 +351,7 @@ struct option options[] = { { "no-color", 0, NULL, 1002 }, { "no-extranonce", 0, NULL, 1012 }, { "no-gbt", 0, NULL, 1011 }, + { "no-getwork", 0, NULL, 1010 }, { "no-longpoll", 0, NULL, 1003 }, { "no-stratum", 0, NULL, 1007 }, { "no-autotune", 0, NULL, 1004 }, // scrypt @@ -394,6 +405,7 @@ struct option options[] = { { "diff-multiplier", 1, NULL, 'm' }, { "diff-factor", 1, NULL, 'f' }, { "diff", 1, NULL, 'f' }, // compat + { "eco", 0, NULL, 1080 }, { 0, 0, 0, 0 } }; @@ -892,7 +904,65 @@ static bool submit_upstream_work(CURL *curl, struct work *work) if (check_dups) hashlog_remember_submit(work, nonce); - } else { + } + else if (work->txs2) + { + + char data_str[2 * sizeof(work->data) + 1]; + char *req; + + for (int i 
= 0; i < ARRAY_SIZE(work->data); i++) + be32enc(work->data + i, work->data[i]); + cbin2hex(data_str, (char *)work->data, 80); + if (work->workid) { + char *params; + val = json_object(); + json_object_set_new(val, "workid", json_string(work->workid)); + params = json_dumps(val, 0); + json_decref(val); + req = (char*)malloc(128 + 2 * 80 + strlen(work->txs2) + strlen(params)); + sprintf(req, + "{\"method\": \"submitblock\", \"params\": [\"%s%s\", %s], \"id\":4}\r\n", + data_str, work->txs2, params); + free(params); + } + else { + req = (char*)malloc(128 + 2 * 80 + strlen(work->txs2)); + sprintf(req, + "{\"method\": \"submitblock\", \"params\": [\"%s%s\"], \"id\":4}\r\n", + data_str, work->txs2); + } + + val = json_rpc_call_pool(curl, pool, req, false, false, NULL); + free(req); + if (unlikely(!val)) { + applog(LOG_ERR, "submit_upstream_work json_rpc_call failed"); + return false; + } + + res = json_object_get(val, "result"); + if (json_is_object(res)) { + char *res_str; + bool sumres = false; + void *iter = json_object_iter(res); + while (iter) { + if (json_is_null(json_object_iter_value(iter))) { + sumres = true; + break; + } + iter = json_object_iter_next(res, iter); + } + res_str = json_dumps(res, 0); + share_result(sumres, work->pooln, work->sharediff, res_str); + free(res_str); + } + else + share_result(json_is_null(res), work->pooln, work->sharediff, json_string_value(res)); + + json_decref(val); + + } + else { int data_size = 128; int adata_sz = data_size / sizeof(uint32_t); @@ -924,6 +994,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work) /* issue JSON-RPC request */ val = json_rpc_call_pool(curl, pool, s, false, false, NULL); + free(str); if (unlikely(!val)) { applog(LOG_ERR, "submit_upstream_work json_rpc_call failed"); return false; @@ -940,12 +1011,15 @@ static bool submit_upstream_work(CURL *curl, struct work *work) json_decref(val); - free(str); } return true; } +#ifndef ORG +#define BLOCK_VERSION_CURRENT 7 +#endif + /* simplified method to only get some extra infos in solo mode */ static bool gbt_work_decode(const json_t *val, struct work *work) { @@ -985,8 +1059,311 @@ static bool gbt_work_decode(const json_t *val, struct work *work) return true; } +#ifndef ORG +int varint_encode(unsigned char *p, uint64_t n) +{ + int i; + if (n < 0xfd) { + p[0] = (uchar)n; + return 1; + } + if (n <= 0xffff) { + p[0] = 0xfd; + p[1] = n & 0xff; + p[2] = (uchar)(n >> 8); + return 3; + } + if (n <= 0xffffffff) { + p[0] = 0xfe; + for (i = 1; i < 5; i++) { + p[i] = n & 0xff; + n >>= 8; + } + return 5; + } + p[0] = 0xff; + for (i = 1; i < 9; i++) { + p[i] = n & 0xff; + n >>= 8; + } + return 9; +} + +static bool gbt_work_decode_full(const json_t *val, struct work *work) +{ + int i, n; + uint32_t version, curtime, bits; + uint32_t prevhash[8]; + uint32_t target[8]; + int cbtx_size; + uchar *cbtx = NULL; + int tx_count, tx_size; + uchar txc_vi[9]; + uchar(*merkle_tree)[32] = NULL; + bool coinbase_append = false; + bool submit_coinbase = false; + bool version_force = false; + bool version_reduce = false; + json_t *tmp, *txa; + bool rc = false; + + tmp = json_object_get(val, "mutable"); + if (tmp && json_is_array(tmp)) { + n = (int)json_array_size(tmp); + for (i = 0; i < n; i++) { + const char *s = json_string_value(json_array_get(tmp, i)); + if (!s) + continue; + if (!strcmp(s, "coinbase/append")) + coinbase_append = true; + else if (!strcmp(s, "submit/coinbase")) + submit_coinbase = true; + else if (!strcmp(s, "version/force")) + version_force = true; + else if (!strcmp(s, 
"version/reduce")) + version_reduce = true; + } + } + + tmp = json_object_get(val, "height"); + if (!tmp || !json_is_integer(tmp)) { + applog(LOG_ERR, "JSON invalid height"); + goto out; + } + work->height = (int)json_integer_value(tmp); + applog(LOG_BLUE, "Current block is %d", work->height); + + tmp = json_object_get(val, "version"); + if (!tmp || !json_is_integer(tmp)) { + applog(LOG_ERR, "JSON invalid version"); + goto out; + } + version = (uint32_t)json_integer_value(tmp); + if ((version & 0xffU) > BLOCK_VERSION_CURRENT) { + if (version_reduce) { + version = (version & ~0xffU) | BLOCK_VERSION_CURRENT; + } + else if (allow_gbt && allow_getwork && !version_force) { + applog(LOG_DEBUG, "Switching to getwork, gbt version %d", version); + allow_gbt = false; + goto out; + } + else if (!version_force) { + applog(LOG_ERR, "Unrecognized block version: %u", version); + goto out; + } + } + + if (unlikely(!jobj_binary(val, "previousblockhash", prevhash, sizeof(prevhash)))) { + applog(LOG_ERR, "JSON invalid previousblockhash"); + goto out; + } + + tmp = json_object_get(val, "curtime"); + if (!tmp || !json_is_integer(tmp)) { + applog(LOG_ERR, "JSON invalid curtime"); + goto out; + } + curtime = (uint32_t)json_integer_value(tmp); + + if (unlikely(!jobj_binary(val, "bits", &bits, sizeof(bits)))) { + applog(LOG_ERR, "JSON invalid bits"); + goto out; + } + + /* find count and size of transactions */ + txa = json_object_get(val, "transactions"); + if (!txa || !json_is_array(txa)) { + applog(LOG_ERR, "JSON invalid transactions"); + goto out; + } + tx_count = (int)json_array_size(txa); + tx_size = 0; + for (i = 0; i < tx_count; i++) { + const json_t *tx = json_array_get(txa, i); + const char *tx_hex = json_string_value(json_object_get(tx, "data")); + if (!tx_hex) { + applog(LOG_ERR, "JSON invalid transactions"); + goto out; + } + tx_size += (int)(strlen(tx_hex) / 2); + } + + /* build coinbase transaction */ + tmp = json_object_get(val, "coinbasetxn"); + if (tmp) { + const char *cbtx_hex = json_string_value(json_object_get(tmp, "data")); + cbtx_size = cbtx_hex ? (int)strlen(cbtx_hex) / 2 : 0; + cbtx = (uchar*)malloc(cbtx_size + 100); + if (cbtx_size < 60 || !hex2bin(cbtx, cbtx_hex, cbtx_size)) { + applog(LOG_ERR, "JSON invalid coinbasetxn"); + goto out; + } + } + else { + int64_t cbvalue; + if (!pk_script_size) { + if (allow_getwork) { + applog(LOG_INFO, "No payout address provided, switching to getwork"); + allow_gbt = false; + } + else + applog(LOG_ERR, "No payout address provided"); + goto out; + } + tmp = json_object_get(val, "coinbasevalue"); + if (!tmp || !json_is_number(tmp)) { + applog(LOG_ERR, "JSON invalid coinbasevalue"); + goto out; + } + cbvalue = (int64_t)(json_is_integer(tmp) ? 
json_integer_value(tmp) : json_number_value(tmp)); + cbtx = (uchar*)malloc(256); + le32enc((uint32_t *)cbtx, 1); /* version */ + cbtx[4] = 1; /* in-counter */ + memset(cbtx + 5, 0x00, 32); /* prev txout hash */ + le32enc((uint32_t *)(cbtx + 37), 0xffffffff); /* prev txout index */ + cbtx_size = 43; + /* BIP 34: height in coinbase */ + for (n = work->height; n; n >>= 8) + cbtx[cbtx_size++] = n & 0xff; + cbtx[42] = cbtx_size - 43; + cbtx[41] = cbtx_size - 42; /* scriptsig length */ + le32enc((uint32_t *)(cbtx + cbtx_size), 0xffffffff); /* sequence */ + cbtx_size += 4; + cbtx[cbtx_size++] = 1; /* out-counter */ + le32enc((uint32_t *)(cbtx + cbtx_size), (uint32_t)cbvalue); /* value */ + le32enc((uint32_t *)(cbtx + cbtx_size + 4), cbvalue >> 32); + cbtx_size += 8; + cbtx[cbtx_size++] = (uint8_t)pk_script_size; /* txout-script length */ + memcpy(cbtx + cbtx_size, pk_script, pk_script_size); + cbtx_size += (int)pk_script_size; + le32enc((uint32_t *)(cbtx + cbtx_size), 0); /* lock time */ + cbtx_size += 4; + coinbase_append = true; + } + if (coinbase_append) { + unsigned char xsig[100]; + int xsig_len = 0; + tmp = json_object_get(val, "coinbaseaux"); + if (tmp && json_is_object(tmp)) { + void *iter = json_object_iter(tmp); + while (iter) { + unsigned char buf[100]; + const char *s = json_string_value(json_object_iter_value(iter)); + n = s ? (int)(strlen(s) / 2) : 0; + if (!s || n > 100 || !hex2bin(buf, s, n)) { + applog(LOG_ERR, "JSON invalid coinbaseaux"); + break; + } + if (cbtx[41] + xsig_len + n <= 100) { + memcpy(xsig + xsig_len, buf, n); + xsig_len += n; + } + iter = json_object_iter_next(tmp, iter); + } + } + if (xsig_len) { + unsigned char *ssig_end = cbtx + 42 + cbtx[41]; + int push_len = cbtx[41] + xsig_len < 76 ? 1 : + cbtx[41] + 2 + xsig_len > 100 ? 0 : 2; + n = xsig_len + push_len; + memmove(ssig_end + n, ssig_end, cbtx_size - 42 - cbtx[41]); + cbtx[41] += n; + if (push_len == 2) + *(ssig_end++) = 0x4c; /* OP_PUSHDATA1 */ + if (push_len) + *(ssig_end++) = xsig_len; + memcpy(ssig_end, xsig, xsig_len); + cbtx_size += n; + } + } + + n = varint_encode(txc_vi, 1 + tx_count); + + work->txs2 = (char*)malloc(2 * (n + cbtx_size + tx_size) + 1); + cbin2hex(work->txs2, (char *)txc_vi, n); + cbin2hex(work->txs2 + 2 * n, (char *)cbtx, cbtx_size); + + /* generate merkle root */ + merkle_tree = (uchar(*)[32]) calloc(((1 + tx_count + 1) & ~1), 32); + sha256d(merkle_tree[0], cbtx, cbtx_size); + + for (i = 0; i < tx_count; i++) { + tmp = json_array_get(txa, i); + const char *tx_hex = json_string_value(json_object_get(tmp, "data")); + const int tx_size = tx_hex ? 
(int)(strlen(tx_hex) / 2) : 0; + unsigned char *tx = (uchar*)malloc(tx_size); + if (!tx_hex || !hex2bin(tx, tx_hex, tx_size)) { + applog(LOG_ERR, "JSON invalid transactions"); + free(tx); + goto out; + } + sha256d(merkle_tree[1 + i], tx, tx_size); + if (!submit_coinbase) + strcat(work->txs2, tx_hex); + } + n = 1 + tx_count; + while (n > 1) { + if (n % 2) { + memcpy(merkle_tree[n], merkle_tree[n - 1], 32); + ++n; + } + n /= 2; + for (i = 0; i < n; i++) + sha256d(merkle_tree[i], merkle_tree[2 * i], 64); + } + + /* assemble block header */ + work->data[0] = swab32(version); + for (i = 0; i < 8; i++) + work->data[8 - i] = le32dec(prevhash + i); + for (i = 0; i < 8; i++) + work->data[9 + i] = be32dec((uint32_t *)merkle_tree[0] + i); + work->data[17] = swab32(curtime); + work->data[18] = le32dec(&bits); + memset(work->data + 19, 0x00, 52); + work->data[20] = 0x80000000; + work->data[31] = 0x00000280; + + if (unlikely(!jobj_binary(val, "target", target, sizeof(target)))) { + applog(LOG_ERR, "JSON invalid target"); + goto out; + } + for (i = 0; i < ARRAY_SIZE(work->target); i++) + work->target[7 - i] = be32dec(target + i); + tmp = json_object_get(val, "workid"); + if (tmp) { + if (!json_is_string(tmp)) { + applog(LOG_ERR, "JSON invalid workid"); + goto out; + } + work->workid = strdup(json_string_value(tmp)); + } + + rc = true; +out: + /* Long polling */ + tmp = json_object_get(val, "longpollid"); + if (want_longpoll && json_is_string(tmp)) { + free(lp_id); + lp_id = strdup(json_string_value(tmp)); + if (!have_longpoll) { + char *lp_uri; + tmp = json_object_get(val, "longpolluri"); + lp_uri = json_is_string(tmp) ? strdup(json_string_value(tmp)) : rpc_url; + have_longpoll = true; + tq_push(thr_info[longpoll_thr_id].q, lp_uri); + } + } + + free(merkle_tree); + free(cbtx); + return rc; +} +#endif + #define GBT_CAPABILITIES "[\"coinbasetxn\", \"coinbasevalue\", \"longpoll\", \"workid\"]" -static const char *gbt_req = +static const char *gbt_req_ = "{\"method\": \"getblocktemplate\", \"params\": [{" // "\"capabilities\": " GBT_CAPABILITIES "" "}], \"id\":9}\r\n"; @@ -998,7 +1375,7 @@ static bool get_blocktemplate(CURL *curl, struct work *work) return false; int curl_err = 0; - json_t *val = json_rpc_call_pool(curl, pool, gbt_req, false, false, &curl_err); + json_t *val = json_rpc_call_pool(curl, pool, gbt_req_, false, false, &curl_err); if (!val && curl_err == -1) { // when getblocktemplate is not supported, disable it @@ -1068,8 +1445,19 @@ static bool get_mininginfo(CURL *curl, struct work *work) return true; } +#ifdef ORG static const char *rpc_req = "{\"method\": \"getwork\", \"params\": [], \"id\":0}\r\n"; +#else +static const char *getwork_req = +"{\"method\": \"getwork\", \"params\": [], \"id\":0}\r\n"; +static const char *gbt_req = +"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": " +GBT_CAPABILITIES "}], \"id\":0}\r\n"; +#endif +static const char *gbt_lp_req = +"{\"method\": \"getblocktemplate\", \"params\": [{\"capabilities\": " +GBT_CAPABILITIES ", \"longpollid\": \"%s\"}], \"id\":0}\r\n"; static bool get_upstream_work(CURL *curl, struct work *work) { @@ -1082,9 +1470,18 @@ static bool get_upstream_work(CURL *curl, struct work *work) applog(LOG_DEBUG, "%s: want_longpoll=%d have_longpoll=%d", __func__, want_longpoll, have_longpoll); +#ifndef ORG + int err; +start: +#endif gettimeofday(&tv_start, NULL); /* want_longpoll/have_longpoll required here to init/unlock the lp thread */ +#ifdef ORG val = json_rpc_call_pool(curl, pool, rpc_req, want_longpoll, have_longpoll, NULL); 
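	/*
	 * Sketch of the retry flow implemented by the non-ORG branch below (simplified,
	 * for illustration only; identifiers are the ones defined earlier in this file):
	 *
	 *   val = json_rpc_call_pool(curl, pool, allow_gbt ? gbt_req : getwork_req,
	 *                            want_longpoll, have_longpoll, &err);
	 *   if (allow_gbt && allow_getwork && !val && err == CURLE_OK) {
	 *       allow_gbt = false;   // pool answered but rejected getblocktemplate
	 *       goto start;          // retry the same pool over plain getwork
	 *   }
	 *
	 * gbt_work_decode_full() can also clear allow_gbt (unrecognized block version,
	 * or no --coinbase-addr supplied), which triggers the same getwork fallback.
	 */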
+#else + val = json_rpc_call_pool(curl, pool, allow_gbt ? gbt_req : getwork_req, want_longpoll, have_longpoll, &err); + +#endif gettimeofday(&tv_end, NULL); if (have_stratum || unlikely(work->pooln != cur_pooln)) { @@ -1093,10 +1490,39 @@ static bool get_upstream_work(CURL *curl, struct work *work) return false; } +#ifndef ORG + if (!allow_gbt && !allow_getwork) { + applog(LOG_ERR, "No usable protocol"); + if (val) + json_decref(val); + return false; + } + + if (allow_gbt && allow_getwork && !val && err == CURLE_OK) { + applog(LOG_NOTICE, "getblocktemplate failed, falling back to getwork"); + allow_gbt = false; + goto start; + } + +#endif + if (!val) return false; - rc = work_decode(json_object_get(val, "result"), work); +#ifndef ORG + if (allow_gbt) { + rc = gbt_work_decode_full(json_object_get(val, "result"), work); + if (!allow_gbt) { + json_decref(val); + goto start; + } + } + else { +#endif + rc = work_decode(json_object_get(val, "result"), work); +#ifndef ORG + } +#endif if (opt_protocol && rc) { timeval_subtract(&diff, &tv_end, &tv_start); @@ -1393,7 +1819,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) else sha256d(merkle_root, merkle_root, 64); } - + /* Increment extranonce2 */ for (i = 0; i < (int)sctx->xnonce2_size && !++sctx->job.xnonce2[i]; i++); @@ -1720,8 +2146,8 @@ static void *miner_thread(void *userdata) #endif memcpy(&work, &g_work, sizeof(struct work)); nonceptr[0] = (UINT32_MAX / opt_n_threads) * thr_id; // 0 if single thr - } else - nonceptr[0]++; //?? + } + //else nonceptr[0]++; //?? if (opt_algo == ALGO_DECRED) { // suprnova job_id check without data/target/height change... @@ -2136,10 +2562,15 @@ static void *miner_thread(void *userdata) } } - if (rc > 0) + + +/* if (rc > 0) work.scanned_to = work.nonces[0]; if (rc > 1) work.scanned_to = max(work.nonces[0], work.nonces[1]); +*/ + if (rc > 0) + work.scanned_to = start_nonce + hashes_done; else { work.scanned_to = max_nonce; if (opt_debug && opt_benchmark) { @@ -2209,6 +2640,7 @@ static void *miner_thread(void *userdata) break; } } + nonceptr[0] = start_nonce + hashes_done; } out: @@ -2278,6 +2710,7 @@ longpoll_retry: while (!abort_flag) { json_t *val = NULL, *soval; + char *req = NULL; int err = 0; if (opt_debug_threads) @@ -2288,7 +2721,12 @@ longpoll_retry: if (switchn != pool_switch_count) goto need_reinit; - val = json_rpc_longpoll(curl, lp_url, pool, rpc_req, &err); + if (allow_gbt) { + req = (char*)malloc(strlen(gbt_lp_req) + strlen(lp_id) + 1); + sprintf(req, gbt_lp_req, lp_id); + } + val = json_rpc_longpoll(curl, lp_url, pool, req ? 
req : getwork_req, &err); + if (allow_gbt) free(req); if (have_stratum || switchn != pool_switch_count) { if (val) json_decref(val); @@ -2486,7 +2924,7 @@ wait_stratum_url: } pthread_mutex_unlock(&g_work_lock); } - + // check we are on the right pool if (switchn != pool_switch_count) goto pool_switched; @@ -2552,6 +2990,109 @@ static void show_usage_and_exit(int status) } proper_exit(status); } +static const char b58digits[] = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"; + +static bool b58dec(unsigned char *bin, size_t binsz, const char *b58) +{ + size_t i, j; + uint64_t t; + uint32_t c; + uint32_t *outi; + size_t outisz = (binsz + 3) / 4; + int rem = binsz % 4; + uint32_t remmask = 0xffffffff << (8 * rem); + size_t b58sz = strlen(b58); + bool rc = false; + + outi = (uint32_t *)calloc(outisz, sizeof(*outi)); + + for (i = 0; i < b58sz; ++i) { + for (c = 0; b58digits[c] != b58[i]; c++) + if (!b58digits[c]) + goto out; + for (j = outisz; j--;) { + t = (uint64_t)outi[j] * 58 + c; + c = t >> 32; + outi[j] = t & 0xffffffff; + } + if (c || outi[0] & remmask) + goto out; + } + + j = 0; + switch (rem) { + case 3: + *(bin++) = (outi[0] >> 16) & 0xff; + case 2: + *(bin++) = (outi[0] >> 8) & 0xff; + case 1: + *(bin++) = outi[0] & 0xff; + ++j; + default: + break; + } + for (; j < outisz; ++j) { + be32enc((uint32_t *)bin, outi[j]); + bin += sizeof(uint32_t); + } + + rc = true; +out: + free(outi); + return rc; +} + +static int b58check(unsigned char *bin, size_t binsz, const char *b58) +{ + unsigned char buf[32]; + int i; + + sha256d(buf, bin, (int)(binsz - 4)); + if (memcmp(&bin[binsz - 4], buf, 4)) + return -1; + + /* Check number of zeros is correct AFTER verifying checksum + * (to avoid possibility of accessing the string beyond the end) */ + for (i = 0; bin[i] == '\0' && b58[i] == '1'; ++i); + if (bin[i] == '\0' || b58[i] == '1') + return -3; + + return bin[0]; +} + +size_t address_to_script(unsigned char *out, size_t outsz, const char *addr) +{ + unsigned char addrbin[25]; + int addrver; + size_t rv; + + if (!b58dec(addrbin, sizeof(addrbin), addr)) + return 0; + addrver = b58check(addrbin, sizeof(addrbin), addr); + if (addrver < 0) + return 0; + switch (addrver) { + case 5: /* Bitcoin script hash */ + case 196: /* Testnet script hash */ + if (outsz < (rv = 23)) + return rv; + out[0] = 0xa9; /* OP_HASH160 */ + out[1] = 0x14; /* push 20 bytes */ + memcpy(&out[2], &addrbin[1], 20); + out[22] = 0x87; /* OP_EQUAL */ + return rv; + default: + if (outsz < (rv = 25)) + return rv; + out[0] = 0x76; /* OP_DUP */ + out[1] = 0xa9; /* OP_HASH160 */ + out[2] = 0x14; /* push 20 bytes */ + memcpy(&out[3], &addrbin[1], 20); + out[23] = 0x88; /* OP_EQUALVERIFY */ + out[24] = 0xac; /* OP_CHECKSIG */ + return rv; + } +} void parse_arg(int key, char *arg) { @@ -2611,6 +3152,9 @@ void parse_arg(int key, char *arg) case 1030: /* --api-remote */ opt_api_remote = 1; break; + case 1080: + opt_eco_mode = true; + break; case 'B': opt_background = true; break; @@ -2946,9 +3490,19 @@ void parse_arg(int key, char *arg) case 1009: opt_shares_limit = atoi(arg); break; + case 1010: + allow_getwork = false; + break; case 1011: allow_gbt = false; break; + case 1016: /* --coinbase-addr */ + pk_script_size = address_to_script(pk_script, sizeof(pk_script), arg); + if (!pk_script_size) { + fprintf(stderr, "invalid address -- '%s'\n", arg); + show_usage_and_exit(1); + } + break; case 1012: opt_extranonce = false; break; @@ -3186,7 +3740,7 @@ static void parse_cmdline(int argc, char *argv[]) show_usage_and_exit(1); } - if 
(opt_algo == ALGO_DECRED && opt_vote == 9999) { + if (opt_vote == 9999) { opt_vote = 0; // default, don't vote } } diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 10d32f2..7b79951 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -41,10 +41,7 @@ false - - - - + @@ -83,10 +80,10 @@ false true - 80 + 255 true true - compute_50,sm_50 + compute_50,sm_50;compute_52,sm_52;compute_35,sm_35;compute_30,sm_30;compute_20,sm_20 $(NVTOOLSEXT_PATH)\include;..\..\..\Common\C99 @@ -115,15 +112,16 @@ false true - 80 + 255 true true - compute_50,sm_50 + compute_50,sm_50;compute_52,sm_52;compute_35,sm_35;compute_30,sm_30;compute_20,sm_20 $(NVTOOLSEXT_PATH)\include;..\..\..\Common\C99 64 false + O3 @@ -158,16 +156,16 @@ false - 80 + 255 true true - compute_50,sm_50;compute_52,sm_52;compute_30,sm_30;compute_20,sm_21 + compute_50,sm_50;compute_52,sm_52;compute_35,sm_35;compute_30,sm_30;compute_20,sm_20 --ptxas-options="-O2" %(AdditionalOptions) - O2 + O3 false - O3 + O2 @@ -201,10 +199,10 @@ false - 80 + 255 true true - compute_52,sm_52;compute_50,sm_50;compute_35,sm_35;compute_30,sm_30;compute_20,sm_21 + compute_50,sm_50;compute_52,sm_52;compute_35,sm_35;compute_30,sm_30;compute_20,sm_20 $(NVTOOLSEXT_PATH)\include;..\..\..\Common\C99 O3 64 @@ -250,6 +248,7 @@ + @@ -347,7 +346,6 @@ - @@ -527,10 +525,7 @@ - - - - + @@ -540,4 +535,4 @@ - + \ No newline at end of file diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index ed942f3..c13b63f 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -437,9 +437,6 @@ Header Files - - Source Files\CUDA\lyra2 - Source Files\CUDA\lyra2 @@ -455,6 +452,9 @@ Source Files\CUDA\x11 + + Source Files\CUDA\lyra2 + @@ -728,4 +728,4 @@ Ressources - + \ No newline at end of file diff --git a/configure.ac b/configure.ac index aec2fe6..4381840 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([ccminer], [1.7.6], [], [ccminer], [http://github.com/tpruvot/ccminer]) +AC_INIT([ccminer], [1.7.6-r10], [], [ccminer], [http://github.com/tpruvot/ccminer]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cuda_helper.h b/cuda_helper.h index 1358892..2b0bd73 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -96,7 +96,6 @@ __device__ __forceinline__ uint64_t REPLACE_LODWORD(const uint64_t &x, const uin return (x & 0xFFFFFFFF00000000ULL) | ((uint64_t)y); } -// Endian Drehung für 32 Bit Typen #ifdef __CUDA_ARCH__ __device__ __forceinline__ uint32_t cuda_swab32(uint32_t x) { @@ -471,6 +470,15 @@ static __host__ __device__ __forceinline__ uint64_t devectorize(uint2 v) { #endif } +static __device__ __forceinline__ uint2 eorswap32(uint2 u, uint2 v) +{ + uint2 result; + result.y = u.x ^ v.x; + result.x = u.y ^ v.y; + return result; +} + + /** * uint2 direct ops by c++ operator definitions */ @@ -561,11 +569,9 @@ uint2 ROR2(const uint2 a, const int offset) return result; } -__device__ __forceinline__ -uint2 ROL2(const uint2 a, const int offset) -{ +#if __CUDA_ARCH__ >= 350 +__inline__ __device__ uint2 ROL2(const uint2 a, const int offset) { uint2 result; -#if __CUDA_ARCH__ > 300 if (offset >= 32) { asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.x), "r"(a.y), "r"(offset)); asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); @@ -574,14 +580,20 @@ uint2 ROL2(const uint2 a, const int offset) asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.x) : "r"(a.y), "r"(a.x), "r"(offset)); asm("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); } + return result; +} #else - if 
(!offset) - result = a; +__inline__ __device__ uint2 ROL2(const uint2 v, const int n) +{ + uint2 result; + if (!n) + result = v; else - result = ROR2(a, 64 - offset); -#endif + result = ROR2(v, 64 - n); + return result; } +#endif __device__ __forceinline__ uint2 SWAPUINT2(uint2 value) diff --git a/lyra2/cuda_lyra2.cu b/lyra2/cuda_lyra2.cu index 88d4bce..67e81fc 100644 --- a/lyra2/cuda_lyra2.cu +++ b/lyra2/cuda_lyra2.cu @@ -1,41 +1,211 @@ /** - * Lyra2 (v1) cuda implementation based on djm34 work - SM 5/5.2 - * tpruvot@github 2015 - */ +* Lyra2 (v1) cuda implementation based on djm34 work - SM 5/5.2 +* tpruvot@github 2015 +*/ #include #include -#define TPB50 16 -#define TPB52 8 +#define TPB52 32 #include "cuda_lyra2_sm2.cuh" +#include "cuda_lyra2_sm5.cuh" #ifdef __INTELLISENSE__ /* just for vstudio code colors */ -#define __CUDA_ARCH__ 500 +#define __CUDA_ARCH__ 520 #endif -#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 500 +#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ > 500 -#include "cuda_vector_uint2x4.h" +#include "cuda_lyra2_vectors.h" -#define memshift 3 +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +__device__ uint32_t __shfl(uint32_t a, uint32_t b, uint32_t c); +#endif +#define Nrow 8 #define Ncol 8 -#define NcolMask 0x7 +#define memshift 3 + +#define BUF_COUNT 0 + +__device__ uint2 *DMatrix; + +__device__ __forceinline__ void LD4S(uint2 res[3], const int row, const int col, const int thread, const int threads) +{ +#if BUF_COUNT != 8 + extern __shared__ uint2 shared_mem[]; + const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift; +#endif +#if BUF_COUNT != 0 + const int d0 = (memshift *(Ncol * row + col) * threads + thread)*blockDim.x + threadIdx.x; +#endif + +#if BUF_COUNT == 8 +#pragma unroll + for (int j = 0; j < 3; j++) + res[j] = *(DMatrix + d0 + j * threads * blockDim.x); +#elif BUF_COUNT == 0 +#pragma unroll + for (int j = 0; j < 3; j++) + res[j] = shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x]; +#else + if (row < BUF_COUNT) + { +#pragma unroll + for (int j = 0; j < 3; j++) + res[j] = *(DMatrix + d0 + j * threads * blockDim.x); + } + else + { +#pragma unroll + for (int j = 0; j < 3; j++) + res[j] = shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x]; + } +#endif +} -__device__ uint2x4* DMatrix; +__device__ __forceinline__ void ST4S(const int row, const int col, const uint2 data[3], const int thread, const int threads) +{ +#if BUF_COUNT != 8 + extern __shared__ uint2 shared_mem[]; + const int s0 = (Ncol * (row - BUF_COUNT) + col) * memshift; +#endif +#if BUF_COUNT != 0 + const int d0 = (memshift *(Ncol * row + col) * threads + thread)*blockDim.x + threadIdx.x; +#endif + +#if BUF_COUNT == 8 +#pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + d0 + j * threads * blockDim.x) = data[j]; +#elif BUF_COUNT == 0 +#pragma unroll + for (int j = 0; j < 3; j++) + shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data[j]; +#else + if (row < BUF_COUNT) + { +#pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + d0 + j * threads * blockDim.x) = data[j]; + } + else + { +#pragma unroll + for (int j = 0; j < 3; j++) + shared_mem[((s0 + j) * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data[j]; + } +#endif +} + +#if __CUDA_ARCH__ >= 300 +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + return __shfl(a, b, c); +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + return make_uint2(__shfl(a.x, b, c), 
__shfl(a.y, b, c)); +} + +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + a1 = WarpShuffle(a1, b1, c); + a2 = WarpShuffle(a2, b2, c); + a3 = WarpShuffle(a3, b3, c); +} + +#else +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + uint32_t *_ptr = (uint32_t*)shared_mem; + + __threadfence_block(); + uint32_t buf = _ptr[thread]; + + _ptr[thread] = a; + __threadfence_block(); + uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + _ptr[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a; + __threadfence_block(); + uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a1; + __threadfence_block(); + a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a2; + __threadfence_block(); + a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a3; + __threadfence_block(); + a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + __threadfence_block(); +} + +#endif static __device__ __forceinline__ void Gfunc(uint2 &a, uint2 &b, uint2 &c, uint2 &d) { - a += b; d ^= a; d = SWAPUINT2(d); - c += d; b ^= c; b = ROR2(b, 24); - a += b; d ^= a; d = ROR2(d, 16); + a += b; d = eorswap32(a, d); + c += d; b ^= c; b = ROR24(b); + a += b; d ^= a; d = ROR16(d); c += d; b ^= c; b = ROR2(b, 63); } +__device__ __forceinline__ void round_lyra(uint2 s[4]) +{ + Gfunc(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3, 4); + Gfunc(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1, 4); +} + static __device__ __forceinline__ void round_lyra(uint2x4* s) { @@ -50,21 +220,24 @@ void round_lyra(uint2x4* s) } static __device__ __forceinline__ -void reduceDuplex(uint2x4 state[4], uint32_t thread) +void reduceDuplex(uint2 state[4], uint32_t thread, const uint32_t threads) { - uint2x4 state1[3]; - - const uint32_t ps1 = (256 * thread); - const uint32_t ps2 = (memshift * 7 + memshift * 8 + 256 * thread); + uint2 state1[3]; - #pragma unroll 4 - for (int i = 0; i < 8; i++) +#if __CUDA_ARCH__ > 500 +#pragma unroll +#endif + for (int i = 0; i < Nrow; i++) { - const uint32_t s1 = ps1 + i*memshift; - const uint32_t s2 = ps2 - i*memshift; + ST4S(0, Ncol - i - 1, state, thread, threads); - for (int j = 0; j < 3; j++) - state1[j] = __ldg4(&(DMatrix+s1)[j]); + round_lyra(state); + } + +#pragma unroll 4 + for (int i = 0; i < Nrow; i++) + { + LD4S(state1, 0, i, thread, threads); for (int j = 0; j < 3; j++) state[j] ^= 
state1[j]; @@ -72,208 +245,342 @@ void reduceDuplex(uint2x4 state[4], uint32_t thread) for (int j = 0; j < 3; j++) state1[j] ^= state[j]; - for (int j = 0; j < 3; j++) - (DMatrix + s2)[j] = state1[j]; + ST4S(1, Ncol - i - 1, state1, thread, threads); } } static __device__ __forceinline__ -void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2x4 state[4], uint32_t thread) +void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], uint32_t thread, const uint32_t threads) { - uint2x4 state1[3], state2[3]; - - const uint32_t ps1 = ( memshift*8 * rowIn + 256 * thread); - const uint32_t ps2 = ( memshift*8 * rowInOut + 256 * thread); - const uint32_t ps3 = (memshift*7 + memshift*8 * rowOut + 256 * thread); + uint2 state1[3], state2[3]; - #pragma unroll 1 - for (int i = 0; i < 8; i++) +#pragma unroll 1 + for (int i = 0; i < Nrow; i++) { - const uint32_t s1 = ps1 + i*memshift; - const uint32_t s2 = ps2 + i*memshift; + LD4S(state1, rowIn, i, thread, threads); + LD4S(state2, rowInOut, i, thread, threads); for (int j = 0; j < 3; j++) - state1[j]= __ldg4(&(DMatrix + s1)[j]); + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + +#pragma unroll for (int j = 0; j < 3; j++) - state2[j]= __ldg4(&(DMatrix + s2)[j]); - for (int j = 0; j < 3; j++) { - uint2x4 tmp = state1[j] + state2[j]; - state[j] ^= tmp; + state1[j] ^= state[j]; + + ST4S(rowOut, Ncol - i - 1, state1, thread, threads); + + //一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + + ST4S(rowInOut, i, state2, thread, threads); + } +} + +static __device__ __forceinline__ +void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads) +{ + for (int i = 0; i < Nrow; i++) + { + uint2 state1[3], state2[3]; + + LD4S(state1, rowIn, i, thread, threads); + LD4S(state2, rowInOut, i, thread, threads); + +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; round_lyra(state); - for (int j = 0; j < 3; j++) { - const uint32_t s3 = ps3 - i*memshift; - state1[j] ^= state[j]; - (DMatrix + s3)[j] = state1[j]; + //一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; } - ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + ST4S(rowInOut, i, state2, thread, threads); - for (int j = 0; j < 11; j++) - ((uint2*)state2)[j+1] ^= ((uint2*)state)[j]; + LD4S(state1, rowOut, i, thread, threads); +#pragma unroll for (int j = 0; j < 3; j++) - (DMatrix + s2)[j] = state2[j]; + state1[j] ^= state[j]; + + ST4S(rowOut, i, state1, thread, threads); } } static __device__ __forceinline__ -void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uint2x4* state, const uint32_t thread) +void reduceDuplexRowt_8(const int rowInOut, 
uint2* state, const uint32_t thread, const uint32_t threads) { - const uint32_t ps1 = (memshift * 8 * rowIn + 256 * thread); - const uint32_t ps2 = (memshift * 8 * rowInOut + 256 * thread); - const uint32_t ps3 = (memshift * 8 * rowOut + 256 * thread); - #pragma unroll 1 - for (int i = 0; i < 8; i++) + uint2 state1[3], state2[3], last[3]; + + LD4S(state1, 2, 0, thread, threads); + LD4S(last, rowInOut, 0, thread, threads); + +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + last[j]; + + round_lyra(state); + + //一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + last[0] ^= Data2; + last[1] ^= Data0; + last[2] ^= Data1; + } + else { - uint2x4 state1[3], state2[3]; + last[0] ^= Data0; + last[1] ^= Data1; + last[2] ^= Data2; + } - const uint32_t s1 = ps1 + i*memshift; - const uint32_t s2 = ps2 + i*memshift; + if (rowInOut == 5) + { +#pragma unroll + for (int j = 0; j < 3; j++) + last[j] ^= state[j]; + } - for (int j = 0; j < 3; j++) { - state1[j] = __ldg4(&(DMatrix + s1)[j]); - state2[j] = __ldg4(&(DMatrix + s2)[j]); - } + for (int i = 1; i < Nrow; i++) + { + LD4S(state1, 2, i, thread, threads); + LD4S(state2, rowInOut, i, thread, threads); - #pragma unroll - for (int j = 0; j < 3; j++) { - state1[j] += state2[j]; - state[j] ^= state1[j]; - } +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; round_lyra(state); - - ((uint2*)state2)[0] ^= ((uint2*)state)[11]; - - for (int j = 0; j < 11; j++) - ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; - - if (rowInOut == rowOut) { - for (int j = 0; j < 3; j++) { - state2[j] ^= state[j]; - (DMatrix + s2)[j]=state2[j]; - } - } else { - const uint32_t s3 = ps3 + i*memshift; - for (int j = 0; j < 3; j++) { - (DMatrix + s2)[j] = state2[j]; - (DMatrix + s3)[j] ^= state[j]; - } - } } + + +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= last[j]; } -#if __CUDA_ARCH__ == 500 -__global__ __launch_bounds__(TPB50, 1) -#else -__global__ __launch_bounds__(TPB52, 2) -#endif -void lyra2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +__constant__ uint2x4 blake2b_IV[2] = { + 0xf3bcc908lu, 0x6a09e667lu, + 0x84caa73blu, 0xbb67ae85lu, + 0xfe94f82blu, 0x3c6ef372lu, + 0x5f1d36f1lu, 0xa54ff53alu, + 0xade682d1lu, 0x510e527flu, + 0x2b3e6c1flu, 0x9b05688clu, + 0xfb41bd6blu, 0x1f83d9ablu, + 0x137e2179lu, 0x5be0cd19lu +}; + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - const uint2x4 blake2b_IV[2] = { - {{ 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a }}, - {{ 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 }} - }; - if (thread < threads) { uint2x4 state[4]; - ((uint2*)state)[0] = __ldg(&g_hash[thread]); - ((uint2*)state)[1] = __ldg(&g_hash[thread + threads]); - ((uint2*)state)[2] = __ldg(&g_hash[thread + threads*2]); - ((uint2*)state)[3] = __ldg(&g_hash[thread + threads*3]); - - state[1] = state[0]; + state[0].x = state[1].x = __ldg(&g_hash[thread + threads * 0]); + state[0].y = state[1].y = __ldg(&g_hash[thread + threads * 1]); + state[0].z = state[1].z = __ldg(&g_hash[thread + threads * 2]); + state[0].w = 
state[1].w = __ldg(&g_hash[thread + threads * 3]); state[2] = blake2b_IV[0]; state[3] = blake2b_IV[1]; for (int i = 0; i<24; i++) round_lyra(state); //because 12 is not enough - const uint32_t ps1 = (memshift * 7 + 256 * thread); - for (int i = 0; i < 8; i++) - { - const uint32_t s1 = ps1 - memshift * i; - for (int j = 0; j < 3; j++) - (DMatrix + s1)[j] = (state)[j]; - round_lyra(state); - } + ((uint2x4*)DMatrix)[threads * 0 + thread] = state[0]; + ((uint2x4*)DMatrix)[threads * 1 + thread] = state[1]; + ((uint2x4*)DMatrix)[threads * 2 + thread] = state[2]; + ((uint2x4*)DMatrix)[threads * 3 + thread] = state[3]; + } +} - reduceDuplex(state, thread); - - reduceDuplexRowSetup(1, 0, 2, state, thread); - reduceDuplexRowSetup(2, 1, 3, state, thread); - reduceDuplexRowSetup(3, 0, 4, state, thread); - reduceDuplexRowSetup(4, 3, 5, state, thread); - reduceDuplexRowSetup(5, 2, 6, state, thread); - reduceDuplexRowSetup(6, 1, 7, state, thread); - - uint32_t rowa = state[0].x.x & 7; - reduceDuplexRowt(7, rowa, 0, state, thread); - rowa = state[0].x.x & 7; - reduceDuplexRowt(0, rowa, 3, state, thread); - rowa = state[0].x.x & 7; - reduceDuplexRowt(3, rowa, 6, state, thread); - rowa = state[0].x.x & 7; - reduceDuplexRowt(6, rowa, 1, state, thread); - rowa = state[0].x.x & 7; - reduceDuplexRowt(1, rowa, 4, state, thread); - rowa = state[0].x.x & 7; - reduceDuplexRowt(4, rowa, 7, state, thread); - rowa = state[0].x.x & 7; - reduceDuplexRowt(7, rowa, 2, state, thread); - rowa = state[0].x.x & 7; - reduceDuplexRowt(2, rowa, 5, state, thread); - - const int32_t shift = (memshift * 8 * rowa + 256 * thread); - - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= __ldg4(&(DMatrix + shift)[j]); +#if __CUDA_ARCH__ < 300 +__global__ __launch_bounds__(TPB20, 1) +#elif __CUDA_ARCH__ < 500 +__global__ __launch_bounds__(TPB30, 1) +#elif __CUDA_ARCH__ == 500 +__global__ __launch_bounds__(TPB50, 1) +#else +__global__ __launch_bounds__(TPB52, 1) +#endif +void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) +{ + const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y; + + if (thread < threads) + { + uint2 state[4]; + state[0] = __ldg(&DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x]); + state[1] = __ldg(&DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x]); + state[2] = __ldg(&DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x]); + state[3] = __ldg(&DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x]); + + reduceDuplex(state, thread, threads); + + reduceDuplexRowSetup(1, 0, 2, state, thread, threads); + reduceDuplexRowSetup(2, 1, 3, state, thread, threads); + reduceDuplexRowSetup(3, 0, 4, state, thread, threads); + reduceDuplexRowSetup(4, 3, 5, state, thread, threads); + reduceDuplexRowSetup(5, 2, 6, state, thread, threads); + reduceDuplexRowSetup(6, 1, 7, state, thread, threads); + + uint32_t rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(7, rowa, 0, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(0, rowa, 3, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(3, rowa, 6, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(6, rowa, 1, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(1, rowa, 4, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(4, rowa, 7, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt(7, 
rowa, 2, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowt_8(rowa, state, thread, threads); + + DMatrix[(0 * threads + thread) * blockDim.x + threadIdx.x] = state[0]; + DMatrix[(1 * threads + thread) * blockDim.x + threadIdx.x] = state[1]; + DMatrix[(2 * threads + thread) * blockDim.x + threadIdx.x] = state[2]; + DMatrix[(3 * threads + thread) * blockDim.x + threadIdx.x] = state[3]; + } +} + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +{ + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + + uint28 state[4]; + + if (thread < threads) + { + state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]); + state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]); + state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]); + state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]); for (int i = 0; i < 12; i++) round_lyra(state); - g_hash[thread] = ((uint2*)state)[0]; - g_hash[thread + threads] = ((uint2*)state)[1]; - g_hash[thread + threads*2] = ((uint2*)state)[2]; - g_hash[thread + threads*3] = ((uint2*)state)[3]; - } + g_hash[thread + threads * 0] = state[0].x; + g_hash[thread + threads * 1] = state[0].y; + g_hash[thread + threads * 2] = state[0].z; + g_hash[thread + threads * 3] = state[0].w; + + } //thread } #else +#if __CUDA_ARCH__ < 500 + /* for unsupported SM arch */ __device__ void* DMatrix; -__global__ void lyra2_gpu_hash_32(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} +#endif +__global__ void lyra2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {} +__global__ void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} #endif __host__ -void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t* d_matrix) +void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix) { - cuda_get_arch(thr_id); + int dev_id = device_map[thr_id % MAX_GPUS]; + // just assign the device pointer allocated in main loop cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice); } __host__ -void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order) +void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, bool gtx750ti) { int dev_id = device_map[thr_id % MAX_GPUS]; + uint32_t tpb = TPB52; - if (device_sm[dev_id] == 500) tpb = TPB50; - if (device_sm[dev_id] == 350) tpb = TPB30; // to enhance (or not) - if (device_sm[dev_id] <= 300) tpb = TPB30; - dim3 grid((threads + tpb - 1) / tpb); - dim3 block(tpb); + if (cuda_arch[dev_id] >= 520) tpb = TPB52; + else if (cuda_arch[dev_id] >= 500) tpb = TPB50; + else if (cuda_arch[dev_id] >= 200) tpb = TPB20; - if (device_sm[dev_id] >= 500) - lyra2_gpu_hash_32 <<< grid, block >>> (threads, startNounce, (uint2*)d_hash); - else - lyra2_gpu_hash_32_sm2 <<< grid, block >>> (threads, startNounce, d_hash); + dim3 grid1((threads * 4 + tpb - 1) / tpb); + dim3 block1(4, tpb >> 2); + dim3 grid2((threads + 64 - 1) / 64); + dim3 block2(64); + + dim3 grid3((threads + tpb - 1) / tpb); + dim3 block3(tpb); + + size_t shared_mem = 0; + + //if (cuda_arch[dev_id] < 500) cudaFuncSetCacheConfig(lyra2_gpu_hash_32_2, cudaFuncCachePreferShared); + + if (cuda_arch[dev_id] >= 520) + { + lyra2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash); + + lyra2_gpu_hash_32_2 <<< grid1, block1, 24 * 8 * sizeof(uint2) * tpb >>> (threads,
startNounce, d_hash); + + lyra2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash); + } + else if (cuda_arch[dev_id] >= 500) + { + if (gtx750ti) + // reserve 8192 bytes to adjust for 8 warps + shared_mem = 8192; + else + // reserve 6144 bytes to adjust for 10 warps + shared_mem = 6144; + + + lyra2_gpu_hash_32_1_sm5 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash); + + lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, startNounce, (uint2*)d_hash); + + lyra2_gpu_hash_32_3_sm5 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash); + } + else + lyra2_gpu_hash_32_sm2 <<< grid3, block3 >>> (threads, startNounce, d_hash); } diff --git a/lyra2/cuda_lyra2_sm2.cuh b/lyra2/cuda_lyra2_sm2.cuh index 7998d17..94e8756 100644 --- a/lyra2/cuda_lyra2_sm2.cuh +++ b/lyra2/cuda_lyra2_sm2.cuh @@ -3,15 +3,16 @@ #ifdef __INTELLISENSE__ /* just for vstudio code colors */ #undef __CUDA_ARCH__ -#define __CUDA_ARCH__ 300 +#define __CUDA_ARCH__ 500 #endif #include "cuda_helper.h" #define TPB30 160 +#define TPB20 160 #if (__CUDA_ARCH__ >= 200 && __CUDA_ARCH__ <= 350) || !defined(__CUDA_ARCH__) -__constant__ static uint2 blake2b_IV[8] = { +__constant__ static uint2 blake2b_IV_sm2[8] = { { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, @@ -149,7 +150,7 @@ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_h #pragma unroll for (int i = 0; i<8; i++) { - state[i + 8] = blake2b_IV[i]; + state[i + 8] = blake2b_IV_sm2[i]; } // blake2blyra x2 diff --git a/lyra2/cuda_lyra2_sm5.cuh b/lyra2/cuda_lyra2_sm5.cuh new file mode 100644 index 0000000..1db4e63 --- /dev/null +++ b/lyra2/cuda_lyra2_sm5.cuh @@ -0,0 +1,701 @@ +#include <memory.h> + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#undef __CUDA_ARCH__ +#define __CUDA_ARCH__ 500 +#endif + +#include "cuda_helper.h" + +#define TPB50 32 + +#if __CUDA_ARCH__ == 500 +#include "cuda_lyra2_vectors.h" + +#define Nrow 8 +#define Ncol 8 +#define memshift 3 + +__device__ uint2 *DMatrix; + +__device__ __forceinline__ uint2 LD4S(const int index) +{ + extern __shared__ uint2 shared_mem[]; + + return shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x]; +} + +__device__ __forceinline__ void ST4S(const int index, const uint2 data) +{ + extern __shared__ uint2 shared_mem[]; + + shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data; +} + +#if __CUDA_ARCH__ >= 300 +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + return __shfl(a, b, c); +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c)); +} + +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + a1 = WarpShuffle(a1, b1, c); + a2 = WarpShuffle(a2, b2, c); + a3 = WarpShuffle(a3, b3, c); +} + +#else +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + uint32_t *_ptr = (uint32_t*)shared_mem; + + __threadfence_block(); + uint32_t buf = _ptr[thread]; + + _ptr[thread] = a; + __threadfence_block(); + uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + _ptr[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t
thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a; + __threadfence_block(); + uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a1; + __threadfence_block(); + a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a2; + __threadfence_block(); + a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a3; + __threadfence_block(); + a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + __threadfence_block(); +} + +#endif + +static __device__ __forceinline__ +void Gfunc(uint2 &a, uint2 &b, uint2 &c, uint2 &d) +{ + a += b; d ^= a; d = SWAPUINT2(d); + c += d; b ^= c; b = ROR2(b, 24); + a += b; d ^= a; d = ROR2(d, 16); + c += d; b ^= c; b = ROR2(b, 63); +} + +__device__ __forceinline__ void round_lyra(uint2 s[4]) +{ + Gfunc(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3, 4); + Gfunc(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1, 4); +} + +static __device__ __forceinline__ +void round_lyra(uint2x4* s) +{ + Gfunc(s[0].x, s[1].x, s[2].x, s[3].x); + Gfunc(s[0].y, s[1].y, s[2].y, s[3].y); + Gfunc(s[0].z, s[1].z, s[2].z, s[3].z); + Gfunc(s[0].w, s[1].w, s[2].w, s[3].w); + Gfunc(s[0].x, s[1].y, s[2].z, s[3].w); + Gfunc(s[0].y, s[1].z, s[2].w, s[3].x); + Gfunc(s[0].z, s[1].w, s[2].x, s[3].y); + Gfunc(s[0].w, s[1].x, s[2].y, s[3].z); +} + +static __device__ __forceinline__ +void reduceDuplexV5(uint2 state[4], const uint32_t thread, const uint32_t threads) +{ + uint2 state1[3], state2[3]; + + const uint32_t ps0 = (memshift * Ncol * 0 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps1 = (memshift * Ncol * 1 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps2 = (memshift * Ncol * 2 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps3 = (memshift * Ncol * 3 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps4 = (memshift * Ncol * 4 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps5 = (memshift * Ncol * 5 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps6 = (memshift * Ncol * 6 * threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps7 = (memshift * Ncol * 7 * threads + thread)*blockDim.x + threadIdx.x; + + for (int i = 0; i < 8; i++) + { + const uint32_t s0 = memshift * Ncol * 0 + (Ncol - 1 - i) * memshift; +#pragma unroll + for (int j = 0; j < 3; j++) + ST4S(s0 + j, state[j]); + round_lyra(state); + } + + for (int i = 0; i < 8; i++) + { + const uint32_t s0 = memshift * Ncol * 0 + i * memshift; + const uint32_t s1 = ps1 + (7 - i)*memshift* threads*blockDim.x; +#pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = LD4S(s0 + j); +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j]; + + round_lyra(state); + +#pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s1 + j*threads*blockDim.x) = state1[j] ^ state[j]; 
+ } + + // 1, 0, 2 + for (int i = 0; i < 8; i++) + { + const uint32_t s0 = memshift * Ncol * 0 + i * memshift; + const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x; + const uint32_t s2 = ps2 + (7 - i)*memshift* threads*blockDim.x; +#pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s1 + j*threads*blockDim.x); +#pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = LD4S(s0 + j); +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + +#pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s2 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + //一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + +#pragma unroll + for (int j = 0; j < 3; j++) + ST4S(s0 + j, state2[j]); + } + + // 2, 1, 3 + for (int i = 0; i < 8; i++) + { + const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x; + const uint32_t s2 = ps2 + i * memshift* threads*blockDim.x; + const uint32_t s3 = ps3 + (7 - i)*memshift* threads*blockDim.x; +#pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s2 + j*threads*blockDim.x); +#pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = *(DMatrix + s1 + j*threads*blockDim.x); +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + +#pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s3 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + //一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + +#pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s1 + j*threads*blockDim.x) = state2[j]; + } + + // 3, 0, 4 + for (int i = 0; i < 8; i++) + { + const uint32_t ls0 = memshift * Ncol * 0 + i * memshift; + const uint32_t s0 = ps0 + i * memshift* threads*blockDim.x; + const uint32_t s3 = ps3 + i * memshift* threads*blockDim.x; + const uint32_t s4 = ps4 + (7 - i)*memshift* threads*blockDim.x; +#pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s3 + j*threads*blockDim.x); +#pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = LD4S(ls0 + j); +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + +#pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s4 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + //一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + +#pragma unroll + for (int j = 0; j < 
3; j++) + *(DMatrix + s0 + j*threads*blockDim.x) = state2[j]; + } + + // 4, 3, 5 + for (int i = 0; i < 8; i++) + { + const uint32_t s3 = ps3 + i * memshift* threads*blockDim.x; + const uint32_t s4 = ps4 + i * memshift* threads*blockDim.x; + const uint32_t s5 = ps5 + (7 - i)*memshift* threads*blockDim.x; +#pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s4 + j*threads*blockDim.x); +#pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = *(DMatrix + s3 + j*threads*blockDim.x); +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + +#pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s5 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + //一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + +#pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s3 + j*threads*blockDim.x) = state2[j]; + } + + // 5, 2, 6 + for (int i = 0; i < 8; i++) + { + const uint32_t s2 = ps2 + i * memshift* threads*blockDim.x; + const uint32_t s5 = ps5 + i * memshift* threads*blockDim.x; + const uint32_t s6 = ps6 + (7 - i)*memshift* threads*blockDim.x; +#pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s5 + j*threads*blockDim.x); +#pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = *(DMatrix + s2 + j*threads*blockDim.x); +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + +#pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s6 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + //一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + +#pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s2 + j*threads*blockDim.x) = state2[j]; + } + + // 6, 1, 7 + for (int i = 0; i < 8; i++) + { + const uint32_t s1 = ps1 + i * memshift* threads*blockDim.x; + const uint32_t s6 = ps6 + i * memshift* threads*blockDim.x; + const uint32_t s7 = ps7 + (7 - i)*memshift* threads*blockDim.x; +#pragma unroll + for (int j = 0; j < 3; j++) + state1[j] = *(DMatrix + s6 + j*threads*blockDim.x); +#pragma unroll + for (int j = 0; j < 3; j++) + state2[j] = *(DMatrix + s1 + j*threads*blockDim.x); +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= state1[j] + state2[j]; + + round_lyra(state); + +#pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s7 + j*threads*blockDim.x) = state1[j] ^ state[j]; + + //一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + 
state2[1] ^= Data1; + state2[2] ^= Data2; + } + +#pragma unroll + for (int j = 0; j < 3; j++) + *(DMatrix + s1 + j*threads*blockDim.x) = state2[j]; + } +} + +static __device__ __forceinline__ +void reduceDuplexRowV50(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4], const uint32_t thread, const uint32_t threads) +{ + const uint32_t ps1 = (memshift * Ncol * rowIn*threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps2 = (memshift * Ncol * rowInOut *threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps3 = (memshift * Ncol * rowOut*threads + thread)*blockDim.x + threadIdx.x; + +#pragma unroll 1 + for (int i = 0; i < 8; i++) + { + uint2 state1[3], state2[3]; + + const uint32_t s1 = ps1 + i*memshift*threads *blockDim.x; + const uint32_t s2 = ps2 + i*memshift*threads *blockDim.x; + const uint32_t s3 = ps3 + i*memshift*threads *blockDim.x; + +#pragma unroll + for (int j = 0; j < 3; j++) { + state1[j] = *(DMatrix + s1 + j*threads*blockDim.x); + state2[j] = *(DMatrix + s2 + j*threads*blockDim.x); + } + +#pragma unroll + for (int j = 0; j < 3; j++) { + state1[j] += state2[j]; + state[j] ^= state1[j]; + } + + round_lyra(state); + + //一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; + } + else + { + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; + } + +#pragma unroll + for (int j = 0; j < 3; j++) + { + *(DMatrix + s2 + j*threads*blockDim.x) = state2[j]; + *(DMatrix + s3 + j*threads*blockDim.x) ^= state[j]; + } + } +} + +static __device__ __forceinline__ +void reduceDuplexRowV50_8(const int rowInOut, uint2 state[4], const uint32_t thread, const uint32_t threads) +{ + const uint32_t ps1 = (memshift * Ncol * 2*threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps2 = (memshift * Ncol * rowInOut *threads + thread)*blockDim.x + threadIdx.x; + const uint32_t ps3 = (memshift * Ncol * 5*threads + thread)*blockDim.x + threadIdx.x; + + uint2 state1[3], last[3]; + +#pragma unroll + for (int j = 0; j < 3; j++) { + state1[j] = *(DMatrix + ps1 + j*threads*blockDim.x); + last[j] = *(DMatrix + ps2 + j*threads*blockDim.x); + } + +#pragma unroll + for (int j = 0; j < 3; j++) { + state1[j] += last[j]; + state[j] ^= state1[j]; + } + + round_lyra(state); + + //一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + last[0] ^= Data2; + last[1] ^= Data0; + last[2] ^= Data1; + } + else + { + last[0] ^= Data0; + last[1] ^= Data1; + last[2] ^= Data2; + } + + if (rowInOut == 5) + { +#pragma unroll + for (int j = 0; j < 3; j++) + last[j] ^= state[j]; + } + + for (int i = 1; i < 8; i++) + { + const uint32_t s1 = ps1 + i*memshift*threads *blockDim.x; + const uint32_t s2 = ps2 + i*memshift*threads *blockDim.x; + +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= *(DMatrix + s1 + j*threads*blockDim.x) + *(DMatrix + s2 + j*threads*blockDim.x); + + round_lyra(state); + } + + +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= last[j]; + +} + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_32_1_sm5(uint32_t 
threads, uint32_t startNounce, uint2 *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + const uint2x4 blake2b_IV[2] = { + { { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } }, + { { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } } + }; + + if (thread < threads) + { + uint2x4 state[4]; + + ((uint2*)state)[0] = __ldg(&g_hash[thread]); + ((uint2*)state)[1] = __ldg(&g_hash[thread + threads]); + ((uint2*)state)[2] = __ldg(&g_hash[thread + threads * 2]); + ((uint2*)state)[3] = __ldg(&g_hash[thread + threads * 3]); + + state[1] = state[0]; + state[2] = blake2b_IV[0]; + state[3] = blake2b_IV[1]; + + for (int i = 0; i < 24; i++) + round_lyra(state); //because 12 is not enough + + ((uint2x4*)DMatrix)[0 * threads + thread] = state[0]; + ((uint2x4*)DMatrix)[1 * threads + thread] = state[1]; + ((uint2x4*)DMatrix)[2 * threads + thread] = state[2]; + ((uint2x4*)DMatrix)[3 * threads + thread] = state[3]; + } +} + +__global__ __launch_bounds__(TPB50, 1) +void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +{ + const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y); + + if (thread < threads) + { + uint2 state[4]; + + state[0] = __ldg(&DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x]); + state[1] = __ldg(&DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x]); + state[2] = __ldg(&DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x]); + state[3] = __ldg(&DMatrix[(3 * threads + thread)*blockDim.x + threadIdx.x]); + + reduceDuplexV5(state, thread, threads); + + uint32_t rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(7, rowa, 0, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(0, rowa, 3, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(3, rowa, 6, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(6, rowa, 1, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(1, rowa, 4, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(4, rowa, 7, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50(7, rowa, 2, state, thread, threads); + rowa = WarpShuffle(state[0].x, 0, 4) & 7; + reduceDuplexRowV50_8(rowa, state, thread, threads); + + DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x] = state[0]; + DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x] = state[1]; + DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x] = state[2]; + DMatrix[(3 * threads + thread)*blockDim.x + threadIdx.x] = state[3]; + } +} + +__global__ __launch_bounds__(64, 1) +void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + if (thread < threads) + { + uint2x4 state[4]; + + state[0] = __ldg4(&((uint2x4*)DMatrix)[0 * threads + thread]); + state[1] = __ldg4(&((uint2x4*)DMatrix)[1 * threads + thread]); + state[2] = __ldg4(&((uint2x4*)DMatrix)[2 * threads + thread]); + state[3] = __ldg4(&((uint2x4*)DMatrix)[3 * threads + thread]); + + for (int i = 0; i < 12; i++) + round_lyra(state); + + g_hash[thread] = ((uint2*)state)[0]; + g_hash[thread + threads] = ((uint2*)state)[1]; + g_hash[thread + threads * 2] = ((uint2*)state)[2]; + g_hash[thread + threads * 3] = ((uint2*)state)[3]; + } +} + 
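[Editorial note, not part of the original patch] The 6144/8192-byte dynamic shared memory sizes that the host launcher in cuda_lyra2.cu passes to these *_sm5 kernels follow from the LD4S/ST4S layout defined at the top of this file: only matrix row 0 is cached in shared memory, i.e. memshift * Ncol = 24 uint2 words per thread, for a 32-thread (TPB50) block. A minimal sketch of that arithmetic, using a hypothetical helper name for illustration only:

static size_t lyra2_sm5_shared_bytes(uint32_t tpb)   /* hypothetical helper, editorial sketch */
{
	const size_t memshift_ = 3, ncol_ = 8;                 /* matches the #defines above */
	const size_t entries_per_thread = memshift_ * ncol_;   /* 24 uint2 words: one matrix row per lane */
	return entries_per_thread * sizeof(uint2) * tpb;       /* 24 * 8 B * 32 threads = 6144 bytes */
}

6144 bytes is the minimum the kernels need; rounding up to 8192 bytes on the GTX 750 Ti pads each block so that at most 8 warps stay resident per SM instead of 10, which is what the comments next to those constants in the launcher refer to.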
+#else +/* if __CUDA_ARCH__ != 500 .. host */ +__global__ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} +__global__ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} +#endif diff --git a/lyra2/cuda_lyra2v2.cu b/lyra2/cuda_lyra2v2.cu index c6c4d1a..265d433 100644 --- a/lyra2/cuda_lyra2v2.cu +++ b/lyra2/cuda_lyra2v2.cu @@ -2,35 +2,152 @@ #include #include -#define TPB52 8 -#define TPB50 16 - -#include "cuda_lyra2v2_sm3.cuh" +#define TPB52 32 +#define TPB50 32 +#define TPB30 32 +#define TPB20 32 #ifdef __INTELLISENSE__ /* just for vstudio code colors */ -#define __CUDA_ARCH__ 500 +#define __CUDA_ARCH__ 200 #endif -#if __CUDA_ARCH__ >= 500 - #include "cuda_lyra2_vectors.h" +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#if __CUDA_ARCH__ >= 300 +__device__ uint32_t __shfl(uint32_t a, uint32_t b, uint32_t c); +#endif +#endif + #define Nrow 4 #define Ncol 4 #define memshift 3 -__device__ uint2x4 *DMatrix; +__device__ uint2x4 *DState; + +__device__ __forceinline__ uint2 LD4S(const int index) +{ + extern __shared__ uint2 shared_mem[]; + + return shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x]; +} + +__device__ __forceinline__ void ST4S(const int index, const uint2 data) +{ + extern __shared__ uint2 shared_mem[]; + + shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data; +} __device__ __forceinline__ void Gfunc_v5(uint2 &a, uint2 &b, uint2 &c, uint2 &d) { - a += b; d ^= a; d = SWAPUINT2(d); - c += d; b ^= c; b = ROR2(b, 24); - a += b; d ^= a; d = ROR2(d, 16); + a += b; d = eorswap32(a, d); + c += d; b ^= c; b = ROR24(b); + a += b; d ^= a; d = ROR16(d); c += d; b ^= c; b = ROR2(b, 63); } + +#if __CUDA_ARCH__ >= 300 +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + return __shfl(a, b, c); +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c)); +} + +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + a1 = WarpShuffle(a1, b1, c); + a2 = WarpShuffle(a2, b2, c); + a3 = WarpShuffle(a3, b3, c); +} + +#else +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + uint32_t *_ptr = (uint32_t*)shared_mem; + + __threadfence_block(); + uint32_t buf = _ptr[thread]; + + _ptr[thread] = a; + __threadfence_block(); + uint32_t result = _ptr[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + _ptr[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ uint2 WarpShuffle(uint2 a, uint32_t b, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a; + __threadfence_block(); + uint2 result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + + __threadfence_block(); + return result; +} + +__device__ __forceinline__ void WarpShuffle3(uint2 &a1, uint2 &a2, uint2 &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + extern __shared__ uint2 shared_mem[]; + + const uint32_t thread = 
blockDim.x * threadIdx.y + threadIdx.x; + + __threadfence_block(); + uint2 buf = shared_mem[thread]; + + shared_mem[thread] = a1; + __threadfence_block(); + a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a2; + __threadfence_block(); + a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))]; + __threadfence_block(); + shared_mem[thread] = a3; + __threadfence_block(); + a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))]; + + __threadfence_block(); + shared_mem[thread] = buf; + __threadfence_block(); +} + +#endif + + +__device__ __forceinline__ void round_lyra_v35(uint2 s[4]) +{ + Gfunc_v5(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3, 4); + Gfunc_v5(s[0], s[1], s[2], s[3]); + WarpShuffle3(s[1], s[2], s[3], threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1, 4); +} + __device__ __forceinline__ void round_lyra_v5(uint2x4* s) { @@ -45,145 +162,142 @@ void round_lyra_v5(uint2x4* s) Gfunc_v5(s[0].w, s[1].x, s[2].y, s[3].z); } -__device__ __forceinline__ -void reduceDuplex(uint2x4 state[4], const uint32_t thread) + +__device__ __forceinline__ void reduceDuplexRowSetupV2(uint2 state[4]) { - uint2x4 state1[3]; - const uint32_t ps1 = (Nrow * Ncol * memshift * thread); - const uint32_t ps2 = (memshift * (Ncol-1) + memshift * Ncol + Nrow * Ncol * memshift * thread); + int i, j; + uint2 state1[Ncol][3], state0[Ncol][3], state2[3]; - #pragma unroll 4 +#if __CUDA_ARCH__ > 500 +#pragma unroll +#endif for (int i = 0; i < Ncol; i++) { - uint32_t s1 = ps1 + i*memshift; - uint32_t s2 = ps2 - i*memshift; - - #pragma unroll - for (int j = 0; j < 3; j++) - state1[j] = __ldg4(&(DMatrix+s1)[j]); - - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= state1[j]; - - round_lyra_v5(state); - - #pragma unroll - for (int j = 0; j < 3; j++) - state1[j] ^= state[j]; - - #pragma unroll - for (int j = 0; j < 3; j++) - (DMatrix + s2)[j] = state1[j]; +#pragma unroll + for (j = 0; j < 3; j++) + state0[Ncol - i - 1][j] = state[j]; + round_lyra_v35(state); } -} - -__device__ __forceinline__ -void reduceDuplex50(uint2x4 state[4], const uint32_t thread) -{ - const uint32_t ps1 = (Nrow * Ncol * memshift * thread); - const uint32_t ps2 = (memshift * (Ncol - 1) + memshift * Ncol + Nrow * Ncol * memshift * thread); - #pragma unroll 4 - for (int i = 0; i < Ncol; i++) + //#pragma unroll 4 + for (i = 0; i < Ncol; i++) { - const uint32_t s1 = ps1 + i*memshift; - const int32_t s2 = ps2 - i*memshift; +#pragma unroll + for (j = 0; j < 3; j++) + state[j] ^= state0[i][j]; - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= __ldg4(&(DMatrix + s1)[j]); + round_lyra_v35(state); - round_lyra_v5(state); +#pragma unroll + for (j = 0; j < 3; j++) + state1[Ncol - i - 1][j] = state0[i][j]; - #pragma unroll - for (int j = 0; j < 3; j++) - (DMatrix + s2)[j] = __ldg4(&(DMatrix + s1)[j]) ^ state[j]; +#pragma unroll + for (j = 0; j < 3; j++) + state1[Ncol - i - 1][j] ^= state[j]; } -} -__device__ __forceinline__ -void reduceDuplexRowSetupV2(const int rowIn, const int rowInOut, const int rowOut, uint2x4 state[4], const uint32_t thread) -{ - uint2x4 state2[3], state1[3]; - - const uint32_t ps1 = (memshift * Ncol * rowIn + Nrow * Ncol * memshift * thread); - const uint32_t ps2 = (memshift * Ncol * rowInOut + Nrow * Ncol * memshift * thread); - const uint32_t ps3 = (memshift * (Ncol-1) + memshift * Ncol * rowOut + Nrow * Ncol * memshift * thread); - - for (int i = 0; i < Ncol; i++) + for (i = 0; i < Ncol; i++) { - const uint32_t s1 = ps1 + 
i*memshift; - const uint32_t s2 = ps2 + i*memshift; - const uint32_t s3 = ps3 - i*memshift; + const uint32_t s0 = memshift * Ncol * 0 + i * memshift; + const uint32_t s2 = memshift * Ncol * 2 + memshift * (Ncol - 1) - i*memshift; +#pragma unroll + for (j = 0; j < 3; j++) + state[j] ^= state1[i][j] + state0[i][j]; -#if __CUDA_ARCH__ == 500 + round_lyra_v35(state); - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] = state[j] ^ (__ldg4(&(DMatrix + s1)[j]) + __ldg4(&(DMatrix + s2)[j])); +#pragma unroll + for (j = 0; j < 3; j++) + state2[j] = state1[i][j]; - round_lyra_v5(state); - #pragma unroll - for (int j = 0; j < 3; j++) - state1[j] = __ldg4(&(DMatrix + s1)[j]); +#pragma unroll + for (j = 0; j < 3; j++) + state2[j] ^= state[j]; - #pragma unroll - for (int j = 0; j < 3; j++) - state2[j] = __ldg4(&(DMatrix + s2)[j]); +#pragma unroll + for (j = 0; j < 3; j++) + ST4S(s2 + j, state2[j]); - #pragma unroll - for (int j = 0; j < 3; j++) + //一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) { - state1[j] ^= state[j]; - (DMatrix + s3)[j] = state1[j]; + state0[i][0] ^= Data2; + state0[i][1] ^= Data0; + state0[i][2] ^= Data1; } - -#else /* 5.2 */ - - #pragma unroll - for (int j = 0; j < 3; j++) - state1[j] = __ldg4(&(DMatrix + s1)[j]); - #pragma unroll - for (int j = 0; j < 3; j++) - state2[j] = __ldg4(&(DMatrix + s2)[j]); - #pragma unroll - for (int j = 0; j < 3; j++) + else { - uint2x4 tmp = state1[j] + state2[j]; - state[j] ^= tmp; + state0[i][0] ^= Data0; + state0[i][1] ^= Data1; + state0[i][2] ^= Data2; } - round_lyra_v5(state); +#pragma unroll + for (j = 0; j < 3; j++) + ST4S(s0 + j, state0[i][j]); - #pragma unroll - for (int j = 0; j < 3; j++) +#pragma unroll + for (j = 0; j < 3; j++) + state0[i][j] = state2[j]; + + } + + for (i = 0; i < Ncol; i++) + { + const uint32_t s1 = memshift * Ncol * 1 + i*memshift; + const uint32_t s3 = memshift * Ncol * 3 + memshift * (Ncol - 1) - i*memshift; +#pragma unroll + for (j = 0; j < 3; j++) + state[j] ^= state1[i][j] + state0[Ncol - i - 1][j]; + + round_lyra_v35(state); + +#pragma unroll + for (j = 0; j < 3; j++) + state0[Ncol - i - 1][j] ^= state[j]; +#pragma unroll + for (j = 0; j < 3; j++) + ST4S(s3 + j, state0[Ncol - i - 1][j]); + + //一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) { - state1[j] ^= state[j]; - (DMatrix + s3)[j] = state1[j]; + state1[i][0] ^= Data2; + state1[i][1] ^= Data0; + state1[i][2] ^= Data1; } + else + { + state1[i][0] ^= Data0; + state1[i][1] ^= Data1; + state1[i][2] ^= Data2; + } -#endif - ((uint2*)state2)[0] ^= ((uint2*)state)[11]; +#pragma unroll + for (j = 0; j < 3; j++) + ST4S(s1 + j, state1[i][j]); - #pragma unroll - for (int j = 0; j < 11; j++) - ((uint2*)state2)[j+1] ^= ((uint2*)state)[j]; - #pragma unroll - for (int j = 0; j < 3; j++) - (DMatrix + s2)[j] = state2[j]; } } - -__device__ __forceinline__ -void reduceDuplexRowtV2(const int rowIn, const int rowInOut, const int rowOut, uint2x4* state, const uint32_t thread) +__device__ void reduceDuplexRowtV2(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4]) { - uint2x4 state1[3], 
state2[3]; - const uint32_t ps1 = (memshift * Ncol * rowIn + Nrow * Ncol * memshift * thread); - const uint32_t ps2 = (memshift * Ncol * rowInOut + Nrow * Ncol * memshift * thread); - const uint32_t ps3 = (memshift * Ncol * rowOut + Nrow * Ncol * memshift * thread); + uint2 state1[3], state2[3]; + const uint32_t ps1 = memshift * Ncol * rowIn; + const uint32_t ps2 = memshift * Ncol * rowInOut; + const uint32_t ps3 = memshift * Ncol * rowOut; for (int i = 0; i < Ncol; i++) { @@ -191,190 +305,268 @@ void reduceDuplexRowtV2(const int rowIn, const int rowInOut, const int rowOut, u const uint32_t s2 = ps2 + i*memshift; const uint32_t s3 = ps3 + i*memshift; - #pragma unroll +#pragma unroll for (int j = 0; j < 3; j++) - state1[j] = __ldg4(&(DMatrix + s1)[j]); + state1[j] = LD4S(s1 + j); - - #pragma unroll - for (int j = 0; j < 3; j++) - state2[j] = __ldg4(&(DMatrix + s2)[j]); - - #pragma unroll +#pragma unroll for (int j = 0; j < 3; j++) - state1[j] += state2[j]; + state2[j] = LD4S(s2 + j); - #pragma unroll +#pragma unroll for (int j = 0; j < 3; j++) - state[j] ^= state1[j]; - - round_lyra_v5(state); - - ((uint2*)state2)[0] ^= ((uint2*)state)[11]; + state[j] ^= state1[j] + state2[j]; - #pragma unroll - for (int j = 0; j < 11; j++) - ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; + round_lyra_v35(state); -#if __CUDA_ARCH__ == 500 - if (rowInOut != rowOut) - { - #pragma unroll - for (int j = 0; j < 3; j++) - (DMatrix + s3)[j] ^= state[j]; + //一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); - } - if (rowInOut == rowOut) + if (threadIdx.x == 0) { - #pragma unroll - for (int j = 0; j < 3; j++) - state2[j] ^= state[j]; + state2[0] ^= Data2; + state2[1] ^= Data0; + state2[2] ^= Data1; } -#else - if (rowInOut != rowOut) + else { - #pragma unroll - for (int j = 0; j < 3; j++) - (DMatrix + s3)[j] ^= state[j]; - } else { - #pragma unroll - for (int j = 0; j < 3; j++) - state2[j] ^= state[j]; + state2[0] ^= Data0; + state2[1] ^= Data1; + state2[2] ^= Data2; } -#endif - #pragma unroll + +#pragma unroll + for (int j = 0; j < 3; j++) + ST4S(s2 + j, state2[j]); +#pragma unroll for (int j = 0; j < 3; j++) - (DMatrix + s2)[j] = state2[j]; + ST4S(s3 + j, LD4S(s3 + j) ^ state[j]); } } - -#if __CUDA_ARCH__ == 500 -__global__ __launch_bounds__(TPB50, 1) -#else -__global__ __launch_bounds__(TPB52, 1) -#endif -void lyra2v2_gpu_hash_32(const uint32_t threads, uint32_t startNounce, uint2 *g_hash) +__device__ void reduceDuplexRowtV2_4(const int rowInOut, uint2 state[4]) { - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const int rowIn = 2; + const int rowOut = 3; - uint2x4 blake2b_IV[2]; + int i, j; + uint2 state2[3], state1[3], last[3]; + const uint32_t ps1 = memshift * Ncol * rowIn; + const uint32_t ps2 = memshift * Ncol * rowInOut; + const uint32_t ps3 = memshift * Ncol * rowOut; - if (threadIdx.x == 0) { +#pragma unroll + for (int j = 0; j < 3; j++) + last[j] = LD4S(ps2 + j); - ((uint16*)blake2b_IV)[0] = make_uint16( - 0xf3bcc908, 0x6a09e667, 0x84caa73b, 0xbb67ae85, - 0xfe94f82b, 0x3c6ef372, 0x5f1d36f1, 0xa54ff53a, - 0xade682d1, 0x510e527f, 0x2b3e6c1f, 0x9b05688c, - 0xfb41bd6b, 0x1f83d9ab, 0x137e2179, 0x5be0cd19 - ); +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= LD4S(ps1 + j) + last[j]; + + round_lyra_v35(state); + + 
//一個手å‰ã®ã‚¹ãƒ¬ãƒƒãƒ‰ã‹ã‚‰ãƒ‡ãƒ¼ã‚¿ã‚’è²°ã†(åŒæ™‚ã«ä¸€å€‹å…ˆã®ã‚¹ãƒ¬ãƒƒãƒ‰ã«ãƒ‡ãƒ¼ã‚¿ã‚’é€ã‚‹) + uint2 Data0 = state[0]; + uint2 Data1 = state[1]; + uint2 Data2 = state[2]; + WarpShuffle3(Data0, Data1, Data2, threadIdx.x - 1, threadIdx.x - 1, threadIdx.x - 1, 4); + + if (threadIdx.x == 0) + { + last[0] ^= Data2; + last[1] ^= Data0; + last[2] ^= Data1; + } + else + { + last[0] ^= Data0; + last[1] ^= Data1; + last[2] ^= Data2; } - if (thread < threads) + if (rowInOut == rowOut) + { +#pragma unroll + for (j = 0; j < 3; j++) + last[j] ^= state[j]; + } + + for (i = 1; i < Ncol; i++) { - uint2x4 state[4]; + const uint32_t s1 = ps1 + i*memshift; + const uint32_t s2 = ps2 + i*memshift; - ((uint2*)state)[0] = __ldg(&g_hash[thread]); - ((uint2*)state)[1] = __ldg(&g_hash[thread + threads]); - ((uint2*)state)[2] = __ldg(&g_hash[thread + threads*2]); - ((uint2*)state)[3] = __ldg(&g_hash[thread + threads*3]); +#pragma unroll + for (j = 0; j < 3; j++) + state[j] ^= LD4S(s1 + j) + LD4S(s2 + j); - state[1] = state[0]; + round_lyra_v35(state); + } + +#pragma unroll + for (int j = 0; j < 3; j++) + state[j] ^= last[j]; +} - state[2] = ((blake2b_IV)[0]); - state[3] = ((blake2b_IV)[1]); +__constant__ uint28 blake2b_IV[2] = { + 0xf3bcc908lu, 0x6a09e667lu, + 0x84caa73blu, 0xbb67ae85lu, + 0xfe94f82blu, 0x3c6ef372lu, + 0x5f1d36f1lu, 0xa54ff53alu, + 0xade682d1lu, 0x510e527flu, + 0x2b3e6c1flu, 0x9b05688clu, + 0xfb41bd6blu, 0x1f83d9ablu, + 0x137e2179lu, 0x5be0cd19lu +}; + +__constant__ uint28 Mask[2] = { + 0x00000020lu, 0x00000000lu, + 0x00000020lu, 0x00000000lu, + 0x00000020lu, 0x00000000lu, + 0x00000001lu, 0x00000000lu, + 0x00000004lu, 0x00000000lu, + 0x00000004lu, 0x00000000lu, + 0x00000080lu, 0x00000000lu, + 0x00000000lu, 0x01000000lu +}; + +__global__ __launch_bounds__(64, 1) +void lyra2v2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *outputHash) +{ + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + + uint28 state[4]; + + if (thread < threads) + { + state[0].x = state[1].x = __ldg(&outputHash[thread + threads * 0]); + state[0].y = state[1].y = __ldg(&outputHash[thread + threads * 1]); + state[0].z = state[1].z = __ldg(&outputHash[thread + threads * 2]); + state[0].w = state[1].w = __ldg(&outputHash[thread + threads * 3]); + state[2] = blake2b_IV[0]; + state[3] = blake2b_IV[1]; for (int i = 0; i<12; i++) round_lyra_v5(state); - ((uint2*)state)[0].x ^= 0x20; - ((uint2*)state)[1].x ^= 0x20; - ((uint2*)state)[2].x ^= 0x20; - ((uint2*)state)[3].x ^= 0x01; - ((uint2*)state)[4].x ^= 0x04; - ((uint2*)state)[5].x ^= 0x04; - ((uint2*)state)[6].x ^= 0x80; - ((uint2*)state)[7].y ^= 0x01000000; + state[0] ^= Mask[0]; + state[1] ^= Mask[1]; for (int i = 0; i<12; i++) round_lyra_v5(state); - const uint32_t ps1 = (memshift * (Ncol - 1) + Nrow * Ncol * memshift * thread); + DState[blockDim.x * gridDim.x * 0 + blockDim.x * blockIdx.x + threadIdx.x] = state[0]; + DState[blockDim.x * gridDim.x * 1 + blockDim.x * blockIdx.x + threadIdx.x] = state[1]; + DState[blockDim.x * gridDim.x * 2 + blockDim.x * blockIdx.x + threadIdx.x] = state[2]; + DState[blockDim.x * gridDim.x * 3 + blockDim.x * blockIdx.x + threadIdx.x] = state[3]; - for (int i = 0; i < Ncol; i++) - { - const uint32_t s1 = ps1 - memshift * i; - DMatrix[s1] = state[0]; - DMatrix[s1+1] = state[1]; - DMatrix[s1+2] = state[2]; - round_lyra_v5(state); - } + } //thread +} + +#if __CUDA_ARCH__ < 300 +__global__ __launch_bounds__(TPB20, 1) +#elif __CUDA_ARCH__ < 500 +__global__ __launch_bounds__(TPB30, 1) +#elif __CUDA_ARCH__ == 500 +__global__ 
__launch_bounds__(TPB50, 1) +#else +__global__ __launch_bounds__(TPB52, 1) +#endif +void lyra2v2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *outputHash) +{ + const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y; - reduceDuplex50(state, thread); + if (thread < threads) + { + uint2 state[4]; + state[0] = ((uint2*)DState)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[1] = ((uint2*)DState)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[2] = ((uint2*)DState)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; + state[3] = ((uint2*)DState)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x]; - reduceDuplexRowSetupV2(1, 0, 2, state, thread); - reduceDuplexRowSetupV2(2, 1, 3, state, thread); + reduceDuplexRowSetupV2(state); uint32_t rowa; - int prev=3; + int prev = 3; - for (int i = 0; i < 4; i++) + for (int i = 0; i < 3; i++) { - rowa = ((uint2*)state)[0].x & 3; - reduceDuplexRowtV2(prev, rowa, i, state, thread); + rowa = WarpShuffle(state[0].x, 0, 4) & 3; + reduceDuplexRowtV2(prev, rowa, i, state); prev = i; } - const uint32_t shift = (memshift * Ncol * rowa + Nrow * Ncol * memshift * thread); + rowa = WarpShuffle(state[0].x, 0, 4) & 3; + reduceDuplexRowtV2_4(rowa, state); - #pragma unroll - for (int j = 0; j < 3; j++) - state[j] ^= __ldg4(&(DMatrix + shift)[j]); + ((uint2*)DState)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[0]; + ((uint2*)DState)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[1]; + ((uint2*)DState)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[2]; + ((uint2*)DState)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[3]; + } //thread +} + +__global__ __launch_bounds__(64, 1) +void lyra2v2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) +{ + const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x; + + uint28 state[4]; + + if (thread < threads) + { + state[0] = __ldg4(&DState[blockDim.x * gridDim.x * 0 + blockDim.x * blockIdx.x + threadIdx.x]); + state[1] = __ldg4(&DState[blockDim.x * gridDim.x * 1 + blockDim.x * blockIdx.x + threadIdx.x]); + state[2] = __ldg4(&DState[blockDim.x * gridDim.x * 2 + blockDim.x * blockIdx.x + threadIdx.x]); + state[3] = __ldg4(&DState[blockDim.x * gridDim.x * 3 + blockDim.x * blockIdx.x + threadIdx.x]); for (int i = 0; i < 12; i++) round_lyra_v5(state); - g_hash[thread] = ((uint2*)state)[0]; - g_hash[thread + threads] = ((uint2*)state)[1]; - g_hash[thread + threads*2] = ((uint2*)state)[2]; - g_hash[thread + threads*3] = ((uint2*)state)[3]; - } + outputHash[thread + threads * 0] = state[0].x; + outputHash[thread + threads * 1] = state[0].y; + outputHash[thread + threads * 2] = state[0].z; + outputHash[thread + threads * 3] = state[0].w; + + } //thread } -#else -#include "cuda_helper.h" -#if __CUDA_ARCH__ < 200 -__device__ void* DMatrix; -#endif -__global__ void lyra2v2_gpu_hash_32(const uint32_t threads, uint32_t startNounce, uint2 *g_hash) {} -#endif __host__ void lyra2v2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix) { - cuda_get_arch(thr_id); + int dev_id = device_map[thr_id % MAX_GPUS]; // just assign the device pointer allocated in main loop - cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(DState, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice); } __host__ void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t 
startNounce, uint64_t *g_hash, int order) { int dev_id = device_map[thr_id % MAX_GPUS]; + uint32_t tpb = TPB52; if (cuda_arch[dev_id] > 500) tpb = TPB52; else if (cuda_arch[dev_id] == 500) tpb = TPB50; - else if (cuda_arch[dev_id] >= 350) tpb = TPB35; else if (cuda_arch[dev_id] >= 300) tpb = TPB30; else if (cuda_arch[dev_id] >= 200) tpb = TPB20; - dim3 grid((threads + tpb - 1) / tpb); - dim3 block(tpb); + dim3 grid1((threads * 4 + tpb - 1) / tpb); + dim3 block1(4, tpb >> 2); - if (device_sm[dev_id] >= 500 && cuda_arch[dev_id] >= 500) - lyra2v2_gpu_hash_32 <<< grid, block >>> (threads, startNounce, (uint2*)g_hash); - else - lyra2v2_gpu_hash_32_v3 <<< grid, block >>> (threads, startNounce, (uint2*)g_hash); + dim3 grid2((threads + 64 - 1) / 64); + dim3 block2(64); + + if (cuda_arch[dev_id] < 500) + cudaFuncSetCacheConfig(lyra2v2_gpu_hash_32_2, cudaFuncCachePreferShared); + + lyra2v2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, startNounce, (uint2*)g_hash); + + lyra2v2_gpu_hash_32_2 <<< grid1, block1, 48 * sizeof(uint2) * tpb >>> (threads, startNounce, g_hash); + lyra2v2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, startNounce, (uint2*)g_hash); //MyStreamSynchronize(NULL, order, thr_id); } diff --git a/lyra2/cuda_lyra2v2_sm3.cuh b/lyra2/cuda_lyra2v2_sm3.cuh deleted file mode 100644 index 1b20485..0000000 --- a/lyra2/cuda_lyra2v2_sm3.cuh +++ /dev/null @@ -1,338 +0,0 @@ -/* SM 2/3/3.5 Variant for lyra2REv2 */ - -#ifdef __INTELLISENSE__ -/* just for vstudio code colors */ -#undef __CUDA_ARCH__ -#define __CUDA_ARCH__ 350 -#endif - -#define TPB20 64 -#define TPB30 64 -#define TPB35 64 - -#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ < 500 - -#include "cuda_lyra2_vectors.h" - -#define Nrow 4 -#define Ncol 4 - -#define vectype ulonglong4 -#define memshift 4 - -__device__ vectype *DMatrix; - -static __device__ __forceinline__ -void Gfunc_v35(unsigned long long &a, unsigned long long &b, unsigned long long &c, unsigned long long &d) -{ - a += b; d ^= a; d = ROTR64(d, 32); - c += d; b ^= c; b = ROTR64(b, 24); - a += b; d ^= a; d = ROTR64(d, 16); - c += d; b ^= c; b = ROTR64(b, 63); -} - -static __device__ __forceinline__ -void round_lyra_v35(vectype* s) -{ - Gfunc_v35(s[0].x, s[1].x, s[2].x, s[3].x); - Gfunc_v35(s[0].y, s[1].y, s[2].y, s[3].y); - Gfunc_v35(s[0].z, s[1].z, s[2].z, s[3].z); - Gfunc_v35(s[0].w, s[1].w, s[2].w, s[3].w); - - Gfunc_v35(s[0].x, s[1].y, s[2].z, s[3].w); - Gfunc_v35(s[0].y, s[1].z, s[2].w, s[3].x); - Gfunc_v35(s[0].z, s[1].w, s[2].x, s[3].y); - Gfunc_v35(s[0].w, s[1].x, s[2].y, s[3].z); -} - -static __device__ __forceinline__ -void reduceDuplexV3(vectype state[4], uint32_t thread) -{ - vectype state1[3]; - uint32_t ps1 = (Nrow * Ncol * memshift * thread); - uint32_t ps2 = (memshift * (Ncol - 1) * Nrow + memshift * 1 + Nrow * Ncol * memshift * thread); - - #pragma unroll 4 - for (int i = 0; i < Ncol; i++) - { - uint32_t s1 = ps1 + Nrow * i *memshift; - uint32_t s2 = ps2 - Nrow * i *memshift; - - for (int j = 0; j < 3; j++) - state1[j] = __ldg4(&(DMatrix + s1)[j]); - - for (int j = 0; j < 3; j++) - state[j] ^= state1[j]; - round_lyra_v35(state); - - for (int j = 0; j < 3; j++) - state1[j] ^= state[j]; - - for (int j = 0; j < 3; j++) - (DMatrix + s2)[j] = state1[j]; - } -} - -static __device__ __forceinline__ -void reduceDuplexRowSetupV3(const int rowIn, const int rowInOut, const int rowOut, vectype state[4], uint32_t thread) -{ - vectype state2[3], state1[3]; - - uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread); - uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread); - uint32_t ps3 = (Nrow * memshift * (Ncol - 1) + memshift * rowOut + Nrow *
Ncol * memshift * thread); - - for (int i = 0; i < Ncol; i++) - { - uint32_t s1 = ps1 + Nrow*i*memshift; - uint32_t s2 = ps2 + Nrow*i*memshift; - uint32_t s3 = ps3 - Nrow*i*memshift; - - for (int j = 0; j < 3; j++) - state1[j] = __ldg4(&(DMatrix + s1 )[j]); - for (int j = 0; j < 3; j++) - state2[j] = __ldg4(&(DMatrix + s2 )[j]); - for (int j = 0; j < 3; j++) { - vectype tmp = state1[j] + state2[j]; - state[j] ^= tmp; - } - - round_lyra_v35(state); - - for (int j = 0; j < 3; j++) { - state1[j] ^= state[j]; - (DMatrix + s3)[j] = state1[j]; - } - - ((uint2*)state2)[0] ^= ((uint2*)state)[11]; - for (int j = 0; j < 11; j++) - ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; - - for (int j = 0; j < 3; j++) - (DMatrix + s2)[j] = state2[j]; - } -} - -static __device__ __forceinline__ -void reduceDuplexRowtV3(const int rowIn, const int rowInOut, const int rowOut, vectype* state, uint32_t thread) -{ - vectype state1[3], state2[3]; - uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread); - uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread); - uint32_t ps3 = (memshift * rowOut + Nrow * Ncol * memshift * thread); - - #pragma nounroll - for (int i = 0; i < Ncol; i++) - { - uint32_t s1 = ps1 + Nrow * i*memshift; - uint32_t s2 = ps2 + Nrow * i*memshift; - uint32_t s3 = ps3 + Nrow * i*memshift; - - for (int j = 0; j < 3; j++) - state1[j] = __ldg4(&(DMatrix + s1)[j]); - - for (int j = 0; j < 3; j++) - state2[j] = __ldg4(&(DMatrix + s2)[j]); - - for (int j = 0; j < 3; j++) - state1[j] += state2[j]; - - for (int j = 0; j < 3; j++) - state[j] ^= state1[j]; - - round_lyra_v35(state); - - ((uint2*)state2)[0] ^= ((uint2*)state)[11]; - - for (int j = 0; j < 11; j++) - ((uint2*)state2)[j + 1] ^= ((uint2*)state)[j]; - - if (rowInOut != rowOut) { - - for (int j = 0; j < 3; j++) - (DMatrix + s2)[j] = state2[j]; - - for (int j = 0; j < 3; j++) - (DMatrix + s3)[j] ^= state[j]; - - } else { - - for (int j = 0; j < 3; j++) - state2[j] ^= state[j]; - - for (int j = 0; j < 3; j++) - (DMatrix + s2)[j] = state2[j]; - } - } -} - -#if __CUDA_ARCH__ >= 300 -__global__ __launch_bounds__(TPB35, 1) -void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - - vectype state[4]; - vectype blake2b_IV[2]; - vectype padding[2]; - - if (threadIdx.x == 0) { - - ((uint16*)blake2b_IV)[0] = make_uint16( - 0xf3bcc908, 0x6a09e667 , 0x84caa73b, 0xbb67ae85, - 0xfe94f82b, 0x3c6ef372 , 0x5f1d36f1, 0xa54ff53a, - 0xade682d1, 0x510e527f , 0x2b3e6c1f, 0x9b05688c, - 0xfb41bd6b, 0x1f83d9ab , 0x137e2179, 0x5be0cd19 - ); - ((uint16*)padding)[0] = make_uint16( - 0x20, 0x0 , 0x20, 0x0 , 0x20, 0x0 , 0x01, 0x0, - 0x04, 0x0 , 0x04, 0x0 , 0x80, 0x0 , 0x0, 0x01000000 - ); - } - - if (thread < threads) - { - ((uint2*)state)[0] = __ldg(&outputHash[thread]); - ((uint2*)state)[1] = __ldg(&outputHash[thread + threads]); - ((uint2*)state)[2] = __ldg(&outputHash[thread + 2 * threads]); - ((uint2*)state)[3] = __ldg(&outputHash[thread + 3 * threads]); - - state[1] = state[0]; - state[2] = shuffle4(((vectype*)blake2b_IV)[0], 0); - state[3] = shuffle4(((vectype*)blake2b_IV)[1], 0); - - for (int i = 0; i<12; i++) - round_lyra_v35(state); - - state[0] ^= shuffle4(((vectype*)padding)[0], 0); - state[1] ^= shuffle4(((vectype*)padding)[1], 0); - - for (int i = 0; i<12; i++) - round_lyra_v35(state); - - uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread); - - //#pragma unroll 4 - for (int i = 0; i < 4; i++) - { - uint32_t s1 = ps1 - 4 * memshift * 
i; - for (int j = 0; j < 3; j++) - (DMatrix + s1)[j] = (state)[j]; - - round_lyra_v35(state); - } - - reduceDuplexV3(state, thread); - reduceDuplexRowSetupV3(1, 0, 2, state, thread); - reduceDuplexRowSetupV3(2, 1, 3, state, thread); - - uint32_t rowa; - int prev = 3; - for (int i = 0; i < 4; i++) - { - rowa = ((uint2*)state)[0].x & 3; reduceDuplexRowtV3(prev, rowa, i, state, thread); - prev = i; - } - - uint32_t shift = (memshift * rowa + 16 * memshift * thread); - - for (int j = 0; j < 3; j++) - state[j] ^= __ldg4(&(DMatrix + shift)[j]); - - for (int i = 0; i < 12; i++) - round_lyra_v35(state); - - outputHash[thread] = ((uint2*)state)[0]; - outputHash[thread + threads] = ((uint2*)state)[1]; - outputHash[thread + 2 * threads] = ((uint2*)state)[2]; - outputHash[thread + 3 * threads] = ((uint2*)state)[3]; - - } //thread -} -#elif __CUDA_ARCH__ >= 200 -__global__ __launch_bounds__(TPB20, 1) -void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - - vectype state[4]; - vectype blake2b_IV[2]; - vectype padding[2]; - - ((uint16*)blake2b_IV)[0] = make_uint16( - 0xf3bcc908, 0x6a09e667, 0x84caa73b, 0xbb67ae85, - 0xfe94f82b, 0x3c6ef372, 0x5f1d36f1, 0xa54ff53a, - 0xade682d1, 0x510e527f, 0x2b3e6c1f, 0x9b05688c, - 0xfb41bd6b, 0x1f83d9ab, 0x137e2179, 0x5be0cd19 - ); - ((uint16*)padding)[0] = make_uint16( - 0x20, 0x0, 0x20, 0x0, 0x20, 0x0, 0x01, 0x0, - 0x04, 0x0, 0x04, 0x0, 0x80, 0x0, 0x0, 0x01000000 - ); - - if (thread < threads) - { - - ((uint2*)state)[0] = outputHash[thread]; - ((uint2*)state)[1] = outputHash[thread + threads]; - ((uint2*)state)[2] = outputHash[thread + 2 * threads]; - ((uint2*)state)[3] = outputHash[thread + 3 * threads]; - - state[1] = state[0]; - state[2] = ((vectype*)blake2b_IV)[0]; - state[3] = ((vectype*)blake2b_IV)[1]; - - for (int i = 0; i<12; i++) - round_lyra_v35(state); - - state[0] ^= ((vectype*)padding)[0]; - state[1] ^= ((vectype*)padding)[1]; - - for (int i = 0; i<12; i++) - round_lyra_v35(state); - - uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread); - - //#pragma unroll 4 - for (int i = 0; i < 4; i++) - { - uint32_t s1 = ps1 - 4 * memshift * i; - for (int j = 0; j < 3; j++) - (DMatrix + s1)[j] = (state)[j]; - - round_lyra_v35(state); - } - - reduceDuplexV3(state, thread); - reduceDuplexRowSetupV3(1, 0, 2, state, thread); - reduceDuplexRowSetupV3(2, 1, 3, state, thread); - - uint32_t rowa; - int prev = 3; - for (int i = 0; i < 4; i++) - { - rowa = ((uint2*)state)[0].x & 3; reduceDuplexRowtV3(prev, rowa, i, state, thread); - prev = i; - } - - uint32_t shift = (memshift * rowa + 16 * memshift * thread); - - for (int j = 0; j < 3; j++) - state[j] ^= __ldg4(&(DMatrix + shift)[j]); - - for (int i = 0; i < 12; i++) - round_lyra_v35(state); - - outputHash[thread] = ((uint2*)state)[0]; - outputHash[thread + threads] = ((uint2*)state)[1]; - outputHash[thread + 2 * threads] = ((uint2*)state)[2]; - outputHash[thread + 3 * threads] = ((uint2*)state)[3]; - - } //thread -} -#endif - -#else -/* host & sm5+ */ -__global__ void lyra2v2_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) {} -#endif diff --git a/lyra2/lyra2RE.cu b/lyra2/lyra2RE.cu index d74bb16..463fa4a 100644 --- a/lyra2/lyra2RE.cu +++ b/lyra2/lyra2RE.cu @@ -23,7 +23,7 @@ extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNon extern void skein256_cpu_init(int thr_id, uint32_t threads); extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix); 
-extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); +extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, bool gtx750ti); extern void groestl256_cpu_init(int thr_id, uint32_t threads); extern void groestl256_cpu_free(int thr_id); @@ -85,30 +85,49 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; - int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 17 : 16; - uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4; - if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); if (opt_benchmark) - ptarget[7] = 0x000f; + ptarget[7] = 0x00ff; + static bool gtx750ti; + static uint32_t throughput[MAX_GPUS]; if (!init[thr_id]) { - cudaSetDevice(device_map[thr_id]); + int dev_id = device_map[thr_id]; + cudaSetDevice(dev_id); CUDA_LOG_ERROR(); - blake256_cpu_init(thr_id, throughput); - keccak256_cpu_init(thr_id,throughput); - skein256_cpu_init(thr_id, throughput); - groestl256_cpu_init(thr_id, throughput); + int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 17 : 16; + if (device_sm[device_map[thr_id]] == 500) intensity = 15; + int temp = intensity; + throughput[thr_id] = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4; + if (init[thr_id]) throughput[thr_id] = min(throughput[thr_id], max_nonce - first_nonce); - // DMatrix - cudaMalloc(&d_matrix[thr_id], (size_t)16 * 8 * 8 * sizeof(uint64_t) * throughput); - lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]); + cudaDeviceProp props; + cudaGetDeviceProperties(&props, dev_id); - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput)); + if (strstr(props.name, "750 Ti")) gtx750ti = true; + else gtx750ti = false; + + //blake256_cpu_init(thr_id, throughput); + keccak256_cpu_init(thr_id, throughput[thr_id]); + skein256_cpu_init(thr_id, throughput[thr_id]); + groestl256_cpu_init(thr_id, throughput[thr_id]); + + if (device_sm[dev_id] >= 500) + { + size_t matrix_sz = device_sm[dev_id] > 500 ? 
sizeof(uint64_t) * 4 * 4 : sizeof(uint64_t) * 8 * 8 * 3 * 4; + CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput[thr_id])); + lyra2_cpu_init(thr_id, throughput[thr_id], d_matrix[thr_id]); + } + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput[thr_id])); init[thr_id] = true; + if (temp != intensity){ + gpulog(LOG_INFO, thr_id, "Intensity set to %u, %u cuda threads", + intensity, throughput[thr_id]); + } } uint32_t _ALIGN(128) endiandata[20]; @@ -122,15 +141,15 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, int order = 0; uint32_t foundNonce; - blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); - skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + blake256_cpu_hash_80(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++); + keccak256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++); + lyra2_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], gtx750ti); + skein256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++); TRACE("S") - *hashes_done = pdata[19] - first_nonce + throughput; + *hashes_done = pdata[19] - first_nonce + throughput[thr_id]; - foundNonce = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + foundNonce = groestl256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++); if (foundNonce != UINT32_MAX) { uint32_t _ALIGN(64) vhash64[8]; @@ -162,11 +181,11 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, } } - if ((uint64_t)throughput + pdata[19] >= max_nonce) { + if ((uint64_t)throughput[thr_id] + pdata[19] >= max_nonce) { pdata[19] = max_nonce; break; } - pdata[19] += throughput; + pdata[19] += throughput[thr_id]; } while (!work_restart[thr_id].restart); diff --git a/lyra2/lyra2REv2.cu b/lyra2/lyra2REv2.cu index 2308d0c..9a14c4d 100644 --- a/lyra2/lyra2REv2.cu +++ b/lyra2/lyra2REv2.cu @@ -10,6 +10,7 @@ extern "C" { #include "miner.h" #include "cuda_helper.h" +#include static uint64_t *d_hash[MAX_GPUS]; static uint64_t* d_matrix[MAX_GPUS]; @@ -20,6 +21,9 @@ extern void blake256_cpu_setBlock_80(uint32_t *pdata); extern void keccak256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); extern void keccak256_cpu_init(int thr_id, uint32_t threads); extern void keccak256_cpu_free(int thr_id); +extern void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); +extern void blakeKeccakcube256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order); + extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); extern void skein256_cpu_init(int thr_id, uint32_t threads); extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order); @@ -27,10 +31,11 @@ extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t start extern void lyra2v2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order); extern void lyra2v2_cpu_init(int thr_id, uint32_t threads, uint64_t* d_matrix); -extern void bmw256_setTarget(const void 
*ptarget); +//extern void bmw256_setTarget(const void *ptarget); extern void bmw256_cpu_init(int thr_id, uint32_t threads); extern void bmw256_cpu_free(int thr_id); -extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces); +extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces, uint32_t Target); +extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces, uint32_t Target, uint32_t **result); void lyra2v2_hash(void *state, const void *input) { @@ -79,7 +84,7 @@ void lyra2v2_hash(void *state, const void *input) uint32_t* debugbuf = NULL; \ cudaMallocHost(&debugbuf, 32); \ cudaMemcpy(debugbuf, d_hash[thr_id], 32, cudaMemcpyDeviceToHost); \ - printf("lyra2 %s %08x %08x %08x %08x...%08x... \n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \ + printf("lyra2 %s %08x %08x %08x %08x...%08x... \n", algo, swab32(debugbuf[0]), swab32(debugbuf[1]), \ swab32(debugbuf[2]), swab32(debugbuf[3]), swab32(debugbuf[7])); \ cudaFreeHost(debugbuf); \ } \ @@ -89,23 +94,96 @@ void lyra2v2_hash(void *state, const void *input) #endif static bool init[MAX_GPUS] = { 0 }; +static uint32_t throughput[MAX_GPUS] = { 0 }; extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) { uint32_t *pdata = work->data; uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; - int dev_id = device_map[thr_id]; - int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 18; - uint32_t throughput = cuda_default_throughput(dev_id, 1UL << intensity); - if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); - if (opt_benchmark) ptarget[7] = 0x000f; if (!init[thr_id]) { - size_t matrix_sz = 16 * sizeof(uint64_t) * 4 * 3; + int dev_id = device_map[thr_id]; + cudaDeviceProp props; + cudaGetDeviceProperties(&props, dev_id); + + int intensity = 0; + // Pascal + if (strstr(props.name, "1080")) intensity = 22; + else if (strstr(props.name, "1070")) intensity = 21; + // Maxwell + else if (strstr(props.name, "TITAN X")) intensity = 21; + else if (strstr(props.name, "980")) intensity = 21; + else if (strstr(props.name, "970")) intensity = 20; + else if (strstr(props.name, "960")) intensity = 20; + else if (strstr(props.name, "950")) intensity = 19; + else if (strstr(props.name, "750 Ti")) intensity = 19; + else if (strstr(props.name, "750")) intensity = 18; + // Kepler〜Fermi + else if (strstr(props.name, "TITAN Z")) intensity = 20; + else if (strstr(props.name, "TITAN")) intensity = 19; + else if (strstr(props.name, "780")) intensity = 19; + else if (strstr(props.name, "760")) intensity = 18; + else if (strstr(props.name, "730")) intensity = 16; + else if (strstr(props.name, "720")) intensity = 15; + else if (strstr(props.name, "710")) intensity = 16; + else if (strstr(props.name, "690")) intensity = 20; + else if (strstr(props.name, "680")) intensity = 19; + else if (strstr(props.name, "660")) intensity = 18; + else if (strstr(props.name, "650 Ti")) intensity = 18; + else if (strstr(props.name, "640")) intensity = 17; + else if (strstr(props.name, "630")) intensity = 16; + else if (strstr(props.name, "620")) intensity = 15; + + else if (strstr(props.name, "90")) intensity = 18; //590 + else if (strstr(props.name, "80")) intensity = 18; //480 580 + else if (strstr(props.name, "70")) intensity = 18; //470 570 670 770 + else if (strstr(props.name, 
"65")) intensity = 17; //465 + else if (strstr(props.name, "60")) intensity = 17; //460 560 + else if (strstr(props.name, "55")) intensity = 17; //555 + else if (strstr(props.name, "50")) intensity = 17; //450 550Ti 650 + else if (strstr(props.name, "45")) intensity = 16; //545 + else if (strstr(props.name, "40")) intensity = 15; //440 + else if (strstr(props.name, "30")) intensity = 15; //430 530 + else if (strstr(props.name, "20")) intensity = 14; //420 520 + else if (strstr(props.name, "10")) intensity = 14; //510 610 + + if (intensity != 0 && opt_eco_mode) intensity -= 3.0; + + if (intensity == 0) + { + intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 18; + throughput[thr_id] = cuda_default_throughput(dev_id, 1UL << (int)intensity); + } + else + { + //uint32_t adds = 0; + // double d = floor(intensity); + + /* if ((intensity - d) > 0.0) { + adds = (uint32_t)floor((intensity - d) * (1 << (int)(d - 10.0)) * 1024; + throughput = (1 << (int)d) + adds; + gpulog(LOG_INFO, thr_id, "Adding %u threads to intensity %u, %u cuda threads", + adds, (int)d, throughput); + } + else if (gpus_intensity[n] != (1 << (int)intensity)) { + throughput = (1 << (int)intensity); + applog(LOG_INFO, "Intensity set to %u, %u cuda threads", + v, gpus_intensity[n]); + } + */ + uint32_t temp = 1UL << intensity; + throughput[thr_id] = cuda_default_throughput(dev_id, temp); + + if (temp == throughput[thr_id]) + { + gpulog(LOG_INFO, thr_id, "Intensity set to %u, %u cuda threads", + intensity, throughput[thr_id]); + } + } cudaSetDevice(dev_id); if (opt_cudaschedule == -1 && gpu_threads == 1) { cudaDeviceReset(); @@ -113,52 +191,84 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); CUDA_LOG_ERROR(); } + cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); - blake256_cpu_init(thr_id, throughput); - keccak256_cpu_init(thr_id,throughput); - skein256_cpu_init(thr_id, throughput); - bmw256_cpu_init(thr_id, throughput); + //blake256_cpu_init(thr_id, throughput); + //keccak256_cpu_init(thr_id,throughput); + skein256_cpu_init(thr_id, throughput[thr_id]); + bmw256_cpu_init(thr_id, throughput[thr_id]); // SM 3 implentation requires a bit more memory - if (device_sm[dev_id] < 500 || cuda_arch[dev_id] < 500) - matrix_sz = 16 * sizeof(uint64_t) * 4 * 4; - - CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput)); - lyra2v2_cpu_init(thr_id, throughput, d_matrix[thr_id]); + //if (device_sm[dev_id] < 300 || cuda_arch[dev_id] < 300) + // matrix_sz = 16 * sizeof(uint64_t) * 4 * 4; + //else + size_t matrix_sz = sizeof(uint64_t) * 4 * 4; + CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput[thr_id])); + lyra2v2_cpu_init(thr_id, throughput[thr_id], d_matrix[thr_id]); - CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput)); + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput[thr_id])); - api_set_throughput(thr_id, throughput); + api_set_throughput(thr_id, throughput[thr_id]); init[thr_id] = true; } + else throughput[thr_id] = min(throughput[thr_id], max_nonce - first_nonce); uint32_t endiandata[20]; - for (int k=0; k < 20; k++) + for (int k = 0; k < 20; k++) be32enc(&endiandata[k], pdata[k]); blake256_cpu_setBlock_80(pdata); - bmw256_setTarget(ptarget); + //bmw256_setTarget(ptarget); + //uint32_t *vhash64[2]; do { int order = 0; uint32_t foundNonces[2] = { 0, 0 }; - blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + blakeKeccak256_cpu_hash_80(thr_id, 
throughput[thr_id], pdata[19], d_hash[thr_id], order++); + //blakeKeccakcube256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); TRACE("blake :"); - keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + //keccak256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); TRACE("keccak :"); - cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + cubehash256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++); TRACE("cube :"); - lyra2v2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + lyra2v2_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++); TRACE("lyra2 :"); - skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + skein256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++); TRACE("skein :"); - cubehash256_cpu_hash_32(thr_id, throughput,pdata[19], d_hash[thr_id], order++); + cubehash256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], order++); TRACE("cube :"); - bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonces); + bmw256_cpu_hash_32(thr_id, throughput[thr_id], pdata[19], d_hash[thr_id], foundNonces, ptarget[7]); + //bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], foundNonces, ptarget[7], vhash64); + + *hashes_done = pdata[19] - first_nonce + throughput[thr_id]; - *hashes_done = pdata[19] - first_nonce + throughput; + + /*if (foundNonces[1] != 0) + { + if (fulltest(vhash64[0], ptarget)) + { + gpulog(LOG_WARNING, thr_id, "result two foundNonces!"); + pdata[19] = foundNonces[1]; + pdata[21] = foundNonces[0]; + work_set_target_ratio(work, vhash64[0]); + if (bn_hash_target_ratio(vhash64[1], ptarget) > work->shareratio) { + work_set_target_ratio(work, vhash64[1]); + } + return 2; + } + } + if (foundNonces[0] != 0) + { + if (fulltest(vhash64[0], ptarget)) + { + gpulog(LOG_WARNING, thr_id, "result one foundNonce!"); + pdata[19] = foundNonces[0]; + work_set_target_ratio(work, vhash64[0]); + return 1; + } + }*/ if (foundNonces[0] != 0) { @@ -176,25 +286,25 @@ extern "C" int scanhash_lyra2v2(int thr_id, struct work* work, uint32_t max_nonc be32enc(&endiandata[19], foundNonces[1]); lyra2v2_hash(vhash64, endiandata); pdata[21] = foundNonces[1]; + xchg(pdata[19], pdata[21]); if (bn_hash_target_ratio(vhash64, ptarget) > work->shareratio) { work_set_target_ratio(work, vhash64); - xchg(pdata[19], pdata[21]); } res++; } return res; } - else + else if (vhash64[7] > ptarget[7]) { gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonces[0]); } } - if ((uint64_t)throughput + pdata[19] >= max_nonce) { + if ((uint64_t)throughput[thr_id] + pdata[19] >= max_nonce) { pdata[19] = max_nonce; break; } - pdata[19] += throughput; + pdata[19] += throughput[thr_id]; } while (!work_restart[thr_id].restart && !abort_flag); @@ -214,7 +324,7 @@ extern "C" void free_lyra2v2(int thr_id) cudaFree(d_matrix[thr_id]); bmw256_cpu_free(thr_id); - keccak256_cpu_free(thr_id); + //keccak256_cpu_free(thr_id); init[thr_id] = false; diff --git a/miner.h b/miner.h index 1efc579..146aa56 100644 --- a/miner.h +++ b/miner.h @@ -445,6 +445,7 @@ struct option { #endif extern int options_count(); +extern bool opt_eco_mode; extern bool opt_benchmark; extern bool opt_debug; extern bool opt_quiet; @@ -646,6 +647,9 @@ struct work { /* pok getwork txs */ uint32_t tx_count; struct tx txs[POK_MAX_TXS]; + + char *txs2; + char *workid; }; #define 
POK_BOOL_MASK 0x00008000 diff --git a/neoscrypt/cuda_neoscrypt.cu b/neoscrypt/cuda_neoscrypt.cu index e7f4b21..f163c03 100644 --- a/neoscrypt/cuda_neoscrypt.cu +++ b/neoscrypt/cuda_neoscrypt.cu @@ -1,18 +1,33 @@ +// originally from djm34 - github.com/djm34/ccminer-sp-neoscrypt + #include #include -#include "cuda_helper.h" -#include "cuda_vectors.h" /* NOT COMPATIBLE WITH SM 3.0 !!! */ +#include +#include +#include "cuda_vectors.h" + +typedef uint48 uint4x2; + +#include "miner.h" + +#ifdef __INTELLISENSE__ +#define __CUDA_ARCH__ 500 +#define __byte_perm(x,y,c) x +#define atomicExch(p,x) x +#endif + +static uint32_t* d_NNonce[MAX_GPUS]; + +__device__ uint2x4* W; +__device__ uint2x4* Tr; +__device__ uint2x4* Tr2; +__device__ uint2x4* Input; -static uint32_t *d_buffer[MAX_GPUS]; -static uint32_t *d_NNonce[MAX_GPUS]; -__constant__ uint4* W; -__constant__ uint32_t pTarget[8]; +__constant__ uint32_t c_data[64]; +__constant__ uint32_t c_target[2]; __constant__ uint32_t key_init[16]; __constant__ uint32_t input_init[16]; -__constant__ uint32_t c_data[80]; - -/// constants /// static const __constant__ uint8 BLAKE2S_IV_Vec = { 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, @@ -37,26 +52,127 @@ static const uint32_t BLAKE2S_SIGMA_host[10][16] = { { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, }; -static __constant__ uint32_t BLAKE2S_SIGMA[10][16]; +__constant__ uint32_t BLAKE2S_SIGMA[10][16] = { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, +}; -#define FASTKDF_BUFFER_SIZE 256U +#define BLOCK_SIZE 64U +#define BLAKE2S_BLOCK_SIZE 64U +#define BLAKE2S_OUT_SIZE 32U -// Blake2S +#define SALSA(a,b,c,d) { \ + t = rotateL(a+d, 7U); b ^= t; \ + t = rotateL(b+a, 9U); c ^= t; \ + t = rotateL(c+b, 13U); d ^= t; \ + t = rotateL(d+c, 18U); a ^= t; \ +} -#define BLAKE2S_BLOCK_SIZE 64U -#define BLAKE2S_OUT_SIZE 32U -#define BLAKE2S_KEY_SIZE 32U +#define shf_r_clamp32(out,a,b,shift) \ + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(out) : "r"(a), "r"(b), "r"(shift)); -#if __CUDA_ARCH__ >= 500 -#define BLAKE_G(idx0, idx1, a, b, c, d, key) { \ - idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ +__device__ __forceinline__ +static void shift256R4(uint32_t* ret, const uint8 &vec4, const uint32_t shift2) +{ +#if __CUDA_ARCH__ >= 320 + uint32_t shift = 32U - shift2; + asm("shf.r.clamp.b32 %0, 0, %1, %2;" : "=r"(ret[0]) : "r"(vec4.s0), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[1]) : "r"(vec4.s0), "r"(vec4.s1), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[2]) : "r"(vec4.s1), "r"(vec4.s2), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[3]) : "r"(vec4.s2), "r"(vec4.s3), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[4]) : "r"(vec4.s3), "r"(vec4.s4), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[5]) : "r"(vec4.s4), "r"(vec4.s5), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret[6]) : "r"(vec4.s5), "r"(vec4.s6), "r"(shift)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : 
"=r"(ret[7]) : "r"(vec4.s6), "r"(vec4.s7), "r"(shift)); + asm("shr.b32 %0, %1, %2;" : "=r"(ret[8]) : "r"(vec4.s7), "r"(shift)); +#else + // to check + shift256R(ret, vec4, shift2); +#endif +} + +#if __CUDA_ARCH__ >= 300 +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + return __shfl(a, b, c); +} + +__device__ __forceinline__ void WarpShuffle3(uint32_t &a1, uint32_t &a2, uint32_t &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + a1 = WarpShuffle(a1, b1, c); + a2 = WarpShuffle(a2, b2, c); + a3 = WarpShuffle(a3, b3, c); +} + +#else +__device__ __forceinline__ uint32_t WarpShuffle(uint32_t a, uint32_t b, uint32_t c) +{ + __shared__ uint32_t shared_mem[32]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + shared_mem[thread] = a; + __threadfence_block(); + + uint32_t result = shared_mem[(thread&~(c - 1)) + (b&(c - 1))]; + __threadfence_block(); + + return result; +} + +__device__ __forceinline__ void WarpShuffle3(uint32_t &a1, uint32_t &a2, uint32_t &a3, uint32_t b1, uint32_t b2, uint32_t b3, uint32_t c) +{ + __shared__ uint32_t shared_mem[32]; + + const uint32_t thread = blockDim.x * threadIdx.y + threadIdx.x; + + shared_mem[thread] = a1; + __threadfence_block(); + + a1 = shared_mem[(thread&~(c - 1)) + (b1&(c - 1))]; + __threadfence_block(); + + shared_mem[thread] = a2; + __threadfence_block(); + + a2 = shared_mem[(thread&~(c - 1)) + (b2&(c - 1))]; + __threadfence_block(); + + shared_mem[thread] = a3; + __threadfence_block(); + + a3 = shared_mem[(thread&~(c - 1)) + (b3&(c - 1))]; + __threadfence_block(); +} + +#endif + +#define CHACHA_STEP(a,b,c,d) { \ a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateL(b^c, 12); \ + a += b; d = __byte_perm(d^a, 0, 0x2103); \ + c += d; b = rotateL(b^c, 7); \ +} + +#if __CUDA_ARCH__ < 500 + +#define BLAKE(a, b, c, d, key1, key2) { \ + a += key1; \ + a += b; d = rotateL(d^a, 16); \ c += d; b = rotateR(b^c, 12); \ - idx = BLAKE2S_SIGMA[idx0][idx1+1]; a += key[idx]; \ - a += b; d = __byte_perm(d^a, 0, 0x0321); \ + a += key2; \ + a += b; d = rotateR(d^a, 8); \ c += d; b = rotateR(b^c, 7); \ } -#else + #define BLAKE_G(idx0, idx1, a, b, c, d, key) { \ idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ a += b; d = rotateL(d^a, 16); \ @@ -65,39 +181,41 @@ static __constant__ uint32_t BLAKE2S_SIGMA[10][16]; a += b; d = rotateR(d^a, 8); \ c += d; b = rotateR(b^c, 7); \ } -#endif -#if __CUDA_ARCH__ >= 500 #define BLAKE_G_PRE(idx0, idx1, a, b, c, d, key) { \ a += key[idx0]; \ - a += b; d = __byte_perm(d^a, 0, 0x1032); \ + a += b; d = rotateL(d^a, 16); \ c += d; b = rotateR(b^c, 12); \ a += key[idx1]; \ - a += b; d = __byte_perm(d^a, 0, 0x0321); \ + a += b; d = rotateR(d^a, 8); \ c += d; b = rotateR(b^c, 7); \ } -#else -#define BLAKE_G_PRE(idx0, idx1, a, b, c, d, key) { \ + +#define BLAKE_G_PRE0(idx0, idx1, a, b, c, d, key) { \ + a += b; d = rotateL(d^a, 16); \ + c += d; b = rotateR(b^c, 12); \ + a += b; d = rotateR(d^a, 8); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE1(idx0, idx1, a, b, c, d, key) { \ a += key[idx0]; \ a += b; d = rotateL(d^a, 16); \ c += d; b = rotateR(b^c, 12); \ - a += key[idx1]; \ a += b; d = rotateR(d^a, 8); \ c += d; b = rotateR(b^c, 7); \ } -#endif -#define BLAKE_Ghost(idx0, idx1, a, b, c, d, key) { \ - idx = BLAKE2S_SIGMA_host[idx0][idx1]; a += key[idx]; \ - a += b; d = ROTR32(d^a,16); \ - c += d; b = ROTR32(b^c, 12); \ - idx = BLAKE2S_SIGMA_host[idx0][idx1+1]; a += key[idx]; \ - a += b; d = ROTR32(d^a,8); \ - c += d; b = ROTR32(b^c, 7); \ +#define 
BLAKE_G_PRE2(idx0, idx1, a, b, c, d, key) { \ + a += b; d = rotateL(d^a, 16); \ + c += d; b = rotateR(b^c, 12); \ + a += key[idx1]; \ + a += b; d = rotateR(d^a, 8); \ + c += d; b = rotateR(b^c, 7); \ } static __forceinline__ __device__ -void Blake2S(uint32_t * inout, const uint32_t * TheKey) +void Blake2S(uint32_t *out, const uint32_t* const __restrict__ inout, const uint32_t * const __restrict__ TheKey) { uint16 V; uint32_t idx; @@ -112,122 +230,98 @@ void Blake2S(uint32_t * inout, const uint32_t * TheKey) V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; -#if 0 - for (int x = 0; x < 10; ++x) - { - BLAKE_G(x, 0x0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G(x, 0x2, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G(x, 0x4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G(x, 0x6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G(x, 0x8, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G(x, 0xA, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G(x, 0xC, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G(x, 0xE, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - } -#else - // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - BLAKE_G_PRE(0x0, 0x1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x2, 0x3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0x4, 0x5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0x6, 0x7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0x8, 0x9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0xA, 0xB, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0xC, 0xD, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0xE, 0xF, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - BLAKE_G_PRE(0xE, 0xA, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x4, 0x8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0x9, 0xF, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0xD, 0x6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0x1, 0xC, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0x0, 0x2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0xB, 0x7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0x5, 0x3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE0(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - BLAKE_G_PRE(0xB, 0x8, V.lo.s0, 
V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0xC, 0x0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0x5, 0x2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0xF, 0xD, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0xA, 0xE, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0x3, 0x6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0x7, 0x1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0x9, 0x4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE0(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - BLAKE_G_PRE(0x7, 0x9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x3, 0x1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0xD, 0xC, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0xB, 0xE, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0x2, 0x6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0x5, 0xA, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0x4, 0x0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0xF, 0x8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE1(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE1(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, - BLAKE_G_PRE(0x9, 0x0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x5, 0x7, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0x2, 0x4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0xA, 0xF, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0xE, 0x1, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0xB, 0xC, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0x6, 0x8, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0x3, 0xD, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE2(9, 0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(5, 7, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(10, 15, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(14, 1, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(11, 12, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 8, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(3, 13, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, - BLAKE_G_PRE(0x2, 0xC, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x6, 0xA, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - 
BLAKE_G_PRE(0x0, 0xB, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0x8, 0x3, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0x4, 0xD, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0x7, 0x5, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0xF, 0xE, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0x1, 0x9, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE1(2, 12, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 10, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE1(0, 11, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(8, 3, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(4, 13, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(7, 5, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(15, 14, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(1, 9, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, - BLAKE_G_PRE(0xC, 0x5, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x1, 0xF, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0xE, 0xD, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0x4, 0xA, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0x0, 0x7, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0x6, 0x3, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0x9, 0x2, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0x8, 0xB, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE2(12, 5, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(1, 15, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 13, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(4, 10, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 7, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(6, 3, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(9, 2, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(8, 11, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, - BLAKE_G_PRE(0xD, 0xB, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x7, 0xE, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0xC, 0x1, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0x3, 0x9, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0x5, 0x0, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0xF, 0x4, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0x8, 0x6, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0x2, 0xA, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE0(13, 11, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(7, 14, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(12, 1, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(3, 9, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(5, 0, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(15, 4, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 6, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 10, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, - BLAKE_G_PRE(0x6, 0xF, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0xE, 0x9, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0xB, 0x3, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0x0, 0x8, V.lo.s3, V.lo.s7, 
V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0xC, 0x2, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0xD, 0x7, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0x1, 0x4, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0xA, 0x5, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - + BLAKE_G_PRE1(6, 15, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE0(14, 9, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(11, 3, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(0, 8, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(12, 2, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(13, 7, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(1, 4, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(10, 5, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); // { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, - BLAKE_G_PRE(0xA, 0x2, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); - BLAKE_G_PRE(0x8, 0x4, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); - BLAKE_G_PRE(0x7, 0x6, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); - BLAKE_G_PRE(0x1, 0x5, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); - BLAKE_G_PRE(0xF, 0xB, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); - BLAKE_G_PRE(0x9, 0xE, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); - BLAKE_G_PRE(0x3, 0xC, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); - BLAKE_G_PRE(0xD, 0x0, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); -#endif + BLAKE_G_PRE2(10, 2, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 4, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(7, 6, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(1, 5, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(15, 11, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(9, 14, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(3, 12, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(13, 0, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); - V.lo ^= V.hi; - V.lo ^= tmpblock; + V.lo ^= V.hi ^ tmpblock; V.hi = BLAKE2S_IV_Vec; tmpblock = V.lo; @@ -235,86 +329,121 @@ void Blake2S(uint32_t * inout, const uint32_t * TheKey) V.hi.s4 ^= 128; V.hi.s6 = ~V.hi.s6; -#if 0 - for (int x = 0; x < 10; ++x) - { - BLAKE_G(x, 0x0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); - BLAKE_G(x, 0x2, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); - BLAKE_G(x, 0x4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); - BLAKE_G(x, 0x6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); - BLAKE_G(x, 0x8, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); - BLAKE_G(x, 0xA, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); - BLAKE_G(x, 0xC, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); - BLAKE_G(x, 0xE, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); - } -#else - // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, - BLAKE_G_PRE(0x0, 0x1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); - BLAKE_G_PRE(0x2, 0x3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); - BLAKE_G_PRE(0x4, 0x5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); - BLAKE_G_PRE(0x6, 0x7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); - BLAKE_G_PRE(0x8, 0x9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); - BLAKE_G_PRE(0xA, 0xB, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); - BLAKE_G_PRE(0xC, 0xD, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); - BLAKE_G_PRE(0xE, 0xF, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); - + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, 
inout); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, - BLAKE_G_PRE(0xE, 0xA, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); - BLAKE_G_PRE(0x4, 0x8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); - BLAKE_G_PRE(0x9, 0xF, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); - BLAKE_G_PRE(0xD, 0x6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); - BLAKE_G_PRE(0x1, 0xC, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); - BLAKE_G_PRE(0x0, 0x2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); - BLAKE_G_PRE(0xB, 0x7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); - BLAKE_G_PRE(0x5, 0x3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); - + BLAKE_G_PRE(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, - BLAKE_G_PRE(0xB, 0x8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); - BLAKE_G_PRE(0xC, 0x0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); - BLAKE_G_PRE(0x5, 0x2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); - BLAKE_G_PRE(0xF, 0xD, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); - BLAKE_G_PRE(0xA, 0xE, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); - BLAKE_G_PRE(0x3, 0x6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); - BLAKE_G_PRE(0x7, 0x1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); - BLAKE_G_PRE(0x9, 0x4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); - + BLAKE_G_PRE(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, - BLAKE_G_PRE(0x7, 0x9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); - BLAKE_G_PRE(0x3, 0x1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); - BLAKE_G_PRE(0xD, 0xC, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); - BLAKE_G_PRE(0xB, 0xE, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); - BLAKE_G_PRE(0x2, 0x6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); - BLAKE_G_PRE(0x5, 0xA, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); - BLAKE_G_PRE(0x4, 0x0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); - BLAKE_G_PRE(0xF, 0x8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); - - for (int x = 4; x < 10; ++x) + BLAKE_G_PRE(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + 
BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + + for (uint32_t x = 4U; x < 10U; x++) { - BLAKE_G(x, 0x0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); - BLAKE_G(x, 0x2, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); - BLAKE_G(x, 0x4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); - BLAKE_G(x, 0x6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); - BLAKE_G(x, 0x8, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); - BLAKE_G(x, 0xA, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); - BLAKE_G(x, 0xC, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); - BLAKE_G(x, 0xE, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + BLAKE_G(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); } -#endif V.lo ^= V.hi ^ tmpblock; - ((uint8*)inout)[0]=V.lo; + ((uint8*)out)[0] = V.lo; } +#endif -static __forceinline__ __host__ -void Blake2Shost(uint32_t * inout, const uint32_t * inkey) +#if __CUDA_ARCH__ >= 500 + +#define BLAKE_G(idx0, idx1, a, b, c, d, key) { \ + idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + idx = BLAKE2S_SIGMA[idx0][idx1+1]; a += key[idx]; \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE(a, b, c, d, key1,key2) { \ + a += key1; \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += key2; \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE(idx0,idx1, a, b, c, d, key) { \ + a += key[idx0]; \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += key[idx1]; \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE0(idx0,idx1, a, b, c, d, key) { \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE1(idx0,idx1, a, b, c, d, key) { \ + a += key[idx0]; \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +#define BLAKE_G_PRE2(idx0,idx1, a, b, c, d, key) { \ + a += b; d = __byte_perm(d^a, 0, 0x1032); \ + c += d; b = rotateR(b^c, 12); \ + a += key[idx1]; \ + a += b; d = __byte_perm(d^a, 0, 0x0321); \ + c += d; b = rotateR(b^c, 7); \ +} + +static __forceinline__ __device__ +void Blake2S_v2(uint32_t *out, const uint32_t* __restrict__ inout, const uint32_t * __restrict__ TheKey) { uint16 V; - uint32_t idx; uint8 tmpblock; - V.hi = BLAKE2S_IV_Vechost; - V.lo = BLAKE2S_IV_Vechost; + V.hi = BLAKE2S_IV_Vec; + V.lo = BLAKE2S_IV_Vec; V.lo.s0 ^= 0x01012020; // Copy input block for later @@ -322,469 +451,1100 @@ void Blake2Shost(uint32_t * inout, const uint32_t * inkey) V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; - for (int x = 0; x < 10; ++x) - { - BLAKE_Ghost(x, 0x0, V.lo.s0, V.lo.s4, 
V.hi.s0, V.hi.s4, inkey); - BLAKE_Ghost(x, 0x2, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inkey); - BLAKE_Ghost(x, 0x4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inkey); - BLAKE_Ghost(x, 0x6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inkey); - BLAKE_Ghost(x, 0x8, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inkey); - BLAKE_Ghost(x, 0xA, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inkey); - BLAKE_Ghost(x, 0xC, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inkey); - BLAKE_Ghost(x, 0xE, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inkey); - } + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + BLAKE_G_PRE0(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + BLAKE_G_PRE0(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + BLAKE_G_PRE1(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE1(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 }, + BLAKE_G_PRE2(9, 0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE(5, 7, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE0(10, 15, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(14, 1, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(11, 12, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 8, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(3, 13, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 }, + BLAKE_G_PRE1(2, 12, 
V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(6, 10, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE1(0, 11, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE2(8, 3, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE1(4, 13, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(7, 5, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE0(15, 14, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE1(1, 9, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 }, + BLAKE_G_PRE2(12, 5, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(1, 15, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE0(14, 13, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(4, 10, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(0, 7, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE(6, 3, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(9, 2, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE0(8, 11, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 }, + BLAKE_G_PRE0(13, 11, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE1(7, 14, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(12, 1, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(3, 9, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE(5, 0, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(15, 4, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 6, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE(2, 10, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 }, + BLAKE_G_PRE1(6, 15, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE0(14, 9, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE2(11, 3, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE1(0, 8, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE2(12, 2, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE2(13, 7, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE(1, 4, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(10, 5, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); + // { 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 }, + BLAKE_G_PRE2(10, 2, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, TheKey); + BLAKE_G_PRE2(8, 4, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, TheKey); + BLAKE_G_PRE(7, 6, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, TheKey); + BLAKE_G_PRE(1, 5, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, TheKey); + BLAKE_G_PRE0(15, 11, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, TheKey); + BLAKE_G_PRE0(9, 14, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, TheKey); + BLAKE_G_PRE1(3, 12, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, TheKey); + BLAKE_G_PRE2(13, 0, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, TheKey); V.lo ^= V.hi; V.lo ^= tmpblock; - V.hi = BLAKE2S_IV_Vechost; + V.hi = BLAKE2S_IV_Vec; tmpblock = V.lo; V.hi.s4 ^= 128; V.hi.s6 = ~V.hi.s6; - for (int x = 0; x < 10; ++x) - { - BLAKE_Ghost(x, 0x0, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); - BLAKE_Ghost(x, 0x2, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); - BLAKE_Ghost(x, 0x4, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); - BLAKE_Ghost(x, 0x6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); - BLAKE_Ghost(x, 0x8, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); - BLAKE_Ghost(x, 0xA, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); - BLAKE_Ghost(x, 0xC, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); - BLAKE_Ghost(x, 0xE, 
V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, + BLAKE_G_PRE(0, 1, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(2, 3, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(4, 5, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(6, 7, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(8, 9, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(10, 11, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(12, 13, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(14, 15, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, + BLAKE_G_PRE(14, 10, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(4, 8, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(9, 15, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(13, 6, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(1, 12, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(0, 2, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(11, 7, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(5, 3, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 }, + BLAKE_G_PRE(11, 8, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(12, 0, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(5, 2, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(15, 13, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(10, 14, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(3, 6, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(7, 1, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(9, 4, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + // { 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 }, + BLAKE_G_PRE(7, 9, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_G_PRE(3, 1, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_G_PRE(13, 12, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_G_PRE(11, 14, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_G_PRE(2, 6, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_G_PRE(5, 10, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_G_PRE(4, 0, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_G_PRE(15, 8, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[9], inout[0]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[5], inout[7]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[2], inout[4]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[10], inout[15]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[14], inout[1]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[11], inout[12]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[6], inout[8]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[3], inout[13]); + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[2], inout[12]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[6], inout[10]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[0], inout[11]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[8], inout[3]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[4], inout[13]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[7], inout[5]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[15], inout[14]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[1], inout[9]); + + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[12], inout[5]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[1], 
inout[15]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[14], inout[13]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[4], inout[10]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[0], inout[7]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[6], inout[3]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[9], inout[2]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[8], inout[11]); + // 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10, + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[13], inout[11]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[7], inout[14]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[12], inout[1]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[3], inout[9]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[5], inout[0]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[15], inout[4]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[8], inout[6]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[2], inout[10]); + // 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5, + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[6], inout[15]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[14], inout[9]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[11], inout[3]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[0], inout[8]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[12], inout[2]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[13], inout[7]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[1], inout[4]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[10], inout[5]); + // 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0, + BLAKE(V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout[10], inout[2]); + BLAKE(V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout[8], inout[4]); + BLAKE(V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout[7], inout[6]); + BLAKE(V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout[1], inout[5]); + BLAKE(V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout[15], inout[11]); + BLAKE(V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout[9], inout[14]); + BLAKE(V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout[3], inout[12]); + BLAKE(V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout[13], inout[0]); + + V.lo ^= V.hi; + V.lo ^= tmpblock; + + ((uint8*)out)[0] = V.lo; +} + +#endif /* __CUDA_ARCH__ >= 500 */ + +#define SALSA_CORE(state) { \ + uint32_t t; \ + SALSA(state.x, state.y, state.z, state.w); \ + WarpShuffle3(state.y, state.z, state.w, threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1,4); \ + SALSA(state.x, state.w, state.z, state.y); \ + WarpShuffle3(state.y, state.z, state.w, threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3,4); \ +} + +#define CHACHA_CORE_PARALLEL(state) { \ + CHACHA_STEP(state.x, state.y, state.z, state.w); \ + WarpShuffle3(state.y, state.z, state.w, threadIdx.x + 1, threadIdx.x + 2, threadIdx.x + 3,4); \ + CHACHA_STEP(state.x, state.y, state.z, state.w); \ + WarpShuffle3(state.y, state.z, state.w, threadIdx.x + 3, threadIdx.x + 2, threadIdx.x + 1,4); \ +} + +__forceinline__ __device__ +uint4 salsa_small_scalar_rnd(const uint4 X) +{ + uint4 state = X; + +#pragma nounroll + for (int i = 0; i < 10; i++) { + SALSA_CORE(state); } - V.lo ^= V.hi ^ tmpblock; + return (X + state); +} - ((uint8*)inout)[0] = V.lo; +__device__ __forceinline__ +uint4 chacha_small_parallel_rnd(const uint4 X) +{ + uint4 state = X; + +#pragma nounroll + for (int i = 0; i < 10; i++) { + CHACHA_CORE_PARALLEL(state); + } + return (X + state); +} + +__device__ __forceinline__ +void neoscrypt_chacha(uint4 XV[4]) +{ + uint4 temp; + + XV[0] = chacha_small_parallel_rnd(XV[0] ^ 
XV[3]); + temp = chacha_small_parallel_rnd(XV[1] ^ XV[0]); + XV[1] = chacha_small_parallel_rnd(XV[2] ^ temp); + XV[3] = chacha_small_parallel_rnd(XV[3] ^ XV[1]); + XV[2] = temp; } +__device__ __forceinline__ +void neoscrypt_salsa(uint4 XV[4]) +{ + uint4 temp; + + XV[0] = salsa_small_scalar_rnd(XV[0] ^ XV[3]); + temp = salsa_small_scalar_rnd(XV[1] ^ XV[0]); + XV[1] = salsa_small_scalar_rnd(XV[2] ^ temp); + XV[3] = salsa_small_scalar_rnd(XV[3] ^ XV[1]); + XV[2] = temp; +} + + +#if __CUDA_ARCH__ < 500 static __forceinline__ __device__ -void fastkdf256(const uint32_t* password, uint8_t* output) +void fastkdf256_v1(const uint32_t thread, const uint32_t nonce, uint32_t* const s_data) { - uint8_t bufidx = 0; + uint2x4 output[8]; uchar4 bufhelper; - uint8_t A[320],B[288]; + uint32_t* B = (uint32_t*)&s_data[threadIdx.x * 64U]; + uint32_t qbuf, rbuf, bitbuf; + uint32_t input[BLAKE2S_BLOCK_SIZE / 4]; + uint32_t key[BLAKE2S_BLOCK_SIZE / 4] = { 0 }; - ((uintx64*)A)[0] = ((uintx64*)password)[0]; - ((uint816 *)A)[4] = ((uint816 *)password)[0]; + const uint32_t data18 = c_data[18]; + const uint32_t data20 = c_data[0]; - ((uintx64*)B)[0] = ((uintx64*)password)[0]; - ((uint48 *)B)[8] = ((uint48 *)password)[0]; - - uint32_t input[BLAKE2S_BLOCK_SIZE/4]; uint32_t key[BLAKE2S_BLOCK_SIZE / 4] = { 0 }; + ((uintx64*)(B))[0] = ((uintx64*)c_data)[0]; + ((uint32_t*)B)[19] = nonce; + ((uint32_t*)B)[39] = nonce; + ((uint32_t*)B)[59] = nonce; ((uint816*)input)[0] = ((uint816*)input_init)[0]; - ((uint48*)key)[0] = ((uint48*)key_init)[0]; + ((uint4x2*)key)[0] = ((uint4x2*)key_init)[0]; - for (int i = 0; i < 32; ++i) +#pragma unroll 1 + for (int i = 0; i < 31; i++) { - bufhelper = ((uchar4*)input)[0]; - for (int x = 1; x < BLAKE2S_OUT_SIZE / 4; ++x) - bufhelper += ((uchar4*)input)[x]; - bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; - - int qbuf = bufidx/4; - int rbuf = bufidx&3; - int bitbuf = rbuf << 3; - uint32_t shifted[9]; + uint32_t bufidx = 0; +#pragma unroll + for (int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; - shift256R(shifted, ((uint8*)input)[0], bitbuf); + uint32_t shifted[9]; + shift256R4(shifted, ((uint8*)input)[0], bitbuf); - for (int k = 0; k < 9; ++k) { - ((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k]; + uint32_t temp[9]; + //#pragma unroll + for (int k = 0; k < 9; k++) + { + uint32_t indice = (k + qbuf) & 0x3f; + temp[k] = B[indice] ^ shifted[k]; + B[indice] = temp[k]; + } +#if __CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__) + uint32_t a = c_data[qbuf & 0x3f], b; + //#pragma unroll + for (int k = 0; k<16; k += 2) + { + b = c_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = c_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); } - if (bufidx < BLAKE2S_KEY_SIZE) {((uint8*)B)[8] = ((uint8*)B)[0];} - else if (bufidx > FASTKDF_BUFFER_SIZE-BLAKE2S_OUT_SIZE) {((uint8*)B)[0] = ((uint8*)B)[8];} - - if (i<31) { - for (int k = 0; k > 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; } - for (int i = 4*qleft; i < 4*qleft+rleft; ++i) { - output[i] = (B + bufidx)[i] ^ A[i]; + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + +#if __CUDA_ARCH__ >= 320 + 
for (int i = 0; i<64; i++) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(((uint32_t*)output)[i]) : "r"(B[(qbuf + i) & 0x3f]), "r"(B[(qbuf + i + 1) & 0x3f]), "r"(bitbuf)); +#endif + + ((ulonglong4*)output)[0] ^= ((ulonglong4*)input)[0]; + ((uintx64*)output)[0] ^= ((uintx64*)c_data)[0]; + ((uint32_t*)output)[19] ^= nonce; + ((uint32_t*)output)[39] ^= nonce; + ((uint32_t*)output)[59] ^= nonce; + + for (int i = 0; i<8; i++) + (Input + 8U * thread)[i] = output[i]; +} +#endif + +#if __CUDA_ARCH__ >= 500 +static __forceinline__ __device__ +void fastkdf256_v2(const uint32_t thread, const uint32_t nonce, uint32_t* const s_data) +{ + const uint32_t data18 = c_data[18]; + const uint32_t data20 = c_data[0]; + uint32_t input[16]; + uint32_t key[16] = { 0 }; + uint32_t qbuf, rbuf, bitbuf; + + uint32_t* B = (uint32_t*)&s_data[threadIdx.x * 64U]; + ((uintx64*)(B))[0] = ((uintx64*)c_data)[0]; + + B[19] = nonce; + B[39] = nonce; + B[59] = nonce; + + { + uint32_t bufidx = 0; +#pragma unroll + for (int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input_init[x] & 0x00ff00ff) + ((input_init[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + uint32_t temp[9]; + + uint32_t shifted; + uint32_t shift = 32U - bitbuf; + asm("shl.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input_init[0]), "r"(bitbuf)); + temp[0] = B[(0 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[0]), "r"(input_init[1]), "r"(shift)); + temp[1] = B[(1 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[1]), "r"(input_init[2]), "r"(shift)); + temp[2] = B[(2 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[2]), "r"(input_init[3]), "r"(shift)); + temp[3] = B[(3 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[3]), "r"(input_init[4]), "r"(shift)); + temp[4] = B[(4 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[4]), "r"(input_init[5]), "r"(shift)); + temp[5] = B[(5 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[5]), "r"(input_init[6]), "r"(shift)); + temp[6] = B[(6 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input_init[6]), "r"(input_init[7]), "r"(shift)); + temp[7] = B[(7 + qbuf) & 0x3f] ^ shifted; + asm("shr.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input_init[7]), "r"(shift)); + temp[8] = B[(8 + qbuf) & 0x3f] ^ shifted; + + uint32_t a = c_data[qbuf & 0x3f], b; + +#pragma unroll + for (int k = 0; k<16; k += 2) + { + b = c_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = c_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19 - qbuf % 20U; + if (noncepos <= 16U && qbuf < 60U) + { + if (noncepos) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if (noncepos != 16U) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : 
"=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + + Blake2S_v2(input, input, key); + +#pragma unroll + for (int k = 0; k < 9; k++) + B[(k + qbuf) & 0x3f] = temp[k]; } - for (int i = qleft*4+rleft; i < (qleft+1)*4; ++i) { - ((uint8_t *)output)[i] = ((uint8_t *)B)[i - left] ^ ((uint8_t *)A)[i]; + + for (int i = 1; i < 31; i++) + { + uint32_t bufidx = 0; +#pragma unroll + for (int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + uint32_t temp[9]; + + uint32_t shifted; + uint32_t shift = 32U - bitbuf; + asm("shl.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input[0]), "r"(bitbuf)); + temp[0] = B[(0 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[0]), "r"(input[1]), "r"(shift)); + temp[1] = B[(1 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[1]), "r"(input[2]), "r"(shift)); + temp[2] = B[(2 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[2]), "r"(input[3]), "r"(shift)); + temp[3] = B[(3 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[3]), "r"(input[4]), "r"(shift)); + temp[4] = B[(4 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[4]), "r"(input[5]), "r"(shift)); + temp[5] = B[(5 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[5]), "r"(input[6]), "r"(shift)); + temp[6] = B[(6 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[6]), "r"(input[7]), "r"(shift)); + temp[7] = B[(7 + qbuf) & 0x3f] ^ shifted; + asm("shr.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input[7]), "r"(shift)); + temp[8] = B[(8 + qbuf) & 0x3f] ^ shifted; + + uint32_t a = c_data[qbuf & 0x3f], b; + +#pragma unroll + for (int k = 0; k<16; k += 2) + { + b = c_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = c_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19 - qbuf % 20U; + if (noncepos <= 16U && qbuf < 60U) + { + if (noncepos) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if (noncepos != 16U) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } + + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : 
"r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + + Blake2S_v2(input, input, key); + +#pragma unroll + for (int k = 0; k < 9; k++) + B[(k + qbuf) & 0x3f] = temp[k]; + } + + { + uint32_t bufidx = 0; +#pragma unroll + for (int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; } - for (int i = qleft+1; i < FASTKDF_BUFFER_SIZE/4; ++i) { - ((uchar4 *)output)[i] = make_uchar4(B[4*i - left],B[4*i+1-left], - B[4*i+2-left],B[4*i+3-left]) ^ ((uchar4 *)A)[i]; + + uint2x4 output[8]; + for (int i = 0; i<64; i++) { + const uint32_t a = (qbuf + i) & 0x3f, b = (qbuf + i + 1) & 0x3f; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(((uint32_t*)output)[i]) : "r"(B[a]), "r"(B[b]), "r"(bitbuf)); } + + output[0] ^= ((uint2x4*)input)[0]; +#pragma unroll + for (int i = 0; i<8; i++) + output[i] ^= ((uint2x4*)c_data)[i]; + + ((uint32_t*)output)[19] ^= nonce; + ((uint32_t*)output)[39] ^= nonce; + ((uint32_t*)output)[59] ^= nonce;; + ((ulonglong16 *)(Input + 8U * thread))[0] = ((ulonglong16*)output)[0]; } +#endif +#if __CUDA_ARCH__ < 500 static __forceinline__ __device__ -void fastkdf32(const uint32_t * password, const uint32_t * salt, uint32_t * output) +uint32_t fastkdf32_v1(uint32_t thread, const uint32_t nonce, uint32_t* const salt, uint32_t* const s_data) { - uint8_t bufidx = 0; - uchar4 bufhelper; + const uint32_t cdata7 = c_data[7]; + const uint32_t data18 = c_data[18]; + const uint32_t data20 = c_data[0]; - uint8_t A[320]; - uint8_t B[288]; + uint32_t* B0 = (uint32_t*)&s_data[threadIdx.x * 64U]; + ((uintx64*)B0)[0] = ((uintx64*)salt)[0]; - // Initialize the password buffer - ((uintx64*)A)[0] = ((uintx64*)password)[0]; - ((uint816*)A)[4] = ((uint816*)password)[0]; - ((uintx64*)B)[0] = ((uintx64*)salt)[0]; - ((uintx64*)B)[1] = ((uintx64*)salt)[0]; + uint32_t input[BLAKE2S_BLOCK_SIZE / 4]; + ((uint816*)input)[0] = ((uint816*)c_data)[0]; - uint32_t input[BLAKE2S_BLOCK_SIZE/4]; - uint32_t key[BLAKE2S_BLOCK_SIZE/4] = { 0 }; + uint32_t key[BLAKE2S_BLOCK_SIZE / 4]; + ((uint4x2*)key)[0] = ((uint4x2*)salt)[0]; + ((uint4*)key)[2] = make_uint4(0, 0, 0, 0); + ((uint4*)key)[3] = make_uint4(0, 0, 0, 0); - ((uint816*)input)[0] = ((uint816*)password)[0]; - ((uint48*)key)[0] = ((uint48*)salt)[0]; + uint32_t qbuf, rbuf, bitbuf; + uint32_t temp[9]; - for (int i = 0; i < 32; ++i) +#pragma nounroll + for (int i = 0; i < 31; i++) { - Blake2S((uint32_t*)input, key); + Blake2S(input, input, key); - bufidx = 0; - bufhelper = ((uchar4*)input)[0]; + uint32_t bufidx = 0; +#pragma unroll + for (int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + uint32_t shifted[9]; - for (int x = 1; x < 
BLAKE2S_OUT_SIZE / 4; ++x) - bufhelper += ((uchar4*)input)[x]; + shift256R4(shifted, ((uint8*)input)[0], bitbuf); - bufidx = bufhelper.x + bufhelper.y + bufhelper.z + bufhelper.w; - int qbuf = bufidx / 4; - int rbuf = bufidx & 3; - int bitbuf = rbuf << 3; - uint32_t shifted[9]; + for (int k = 0; k < 9; k++) { + temp[k] = B0[(k + qbuf) & 0x3f]; + } - shift256R(shifted, ((uint8*)input)[0], bitbuf); + ((uint2x4*)temp)[0] ^= ((uint2x4*)shifted)[0]; + temp[8] ^= shifted[8]; - for (int k = 0; k < 9; ++k) { - ((uint32_t *)B)[k + qbuf] ^= ((uint32_t *)shifted)[k]; +#if __CUDA_ARCH__ >= 320 || !defined(__CUDA_ARCH__) + uint32_t a = c_data[qbuf & 0x3f], b; + //#pragma unroll + for (int k = 0; k<16; k += 2) + { + b = c_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = c_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); } - if (i<31) { - if (bufidx < BLAKE2S_KEY_SIZE) {((uint8*)B)[8] = ((uint8*)B)[0];} - else if (bufidx > FASTKDF_BUFFER_SIZE - BLAKE2S_OUT_SIZE) {((uint8*)B)[0] = ((uint8*)B)[8];} - - for (uint8_t k = 0; k < BLAKE2S_BLOCK_SIZE/4; k++) { - ((uchar4*)(input))[k] = make_uchar4( - (A + bufidx)[4 * k], (A + bufidx)[4 * k + 1], - (A + bufidx)[4 * k + 2], (A + bufidx)[4 * k + 3] - ); - } - for (uint8_t k = 0; k < BLAKE2S_KEY_SIZE / 4; k++) { - ((uchar4*)(key))[k] = make_uchar4( - (B + bufidx)[4 * k], (B + bufidx)[4 * k + 1], - (B + bufidx)[4 * k + 2], (B + bufidx)[4 * k + 3] - ); - } + const uint32_t noncepos = 19U - qbuf % 20U; + if (noncepos <= 16U && qbuf < 60U) + { + if (noncepos != 0) asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if (noncepos != 16U) asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); } - } - uchar4 unfucked[1]; - unfucked[0] = make_uchar4(B[28 + bufidx], B[29 + bufidx],B[30 + bufidx], B[31 + bufidx]); - ((uint32_t*)output)[7] = ((uint32_t*)unfucked)[0] ^ ((uint32_t*)A)[7]; -} + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); +#else + //#error SM 3.0 code missing here +#endif + for (int k = 0; k < 9; k++) { + B0[(k + qbuf) & 0x3f] = temp[k]; + } + } + Blake2S(input, input, key); -#define SALSA(a,b,c,d) { \ - t =a+d; b^=rotateL(t, 7); \ - t =b+a; c^=rotateL(t, 9); \ - t =c+b; d^=rotateL(t, 13); \ - t =d+c; a^=rotateL(t, 18); \ -} + uint32_t bufidx = 0; +#pragma unroll + for (int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; -#define SALSA_CORE(state) { \ - 
SALSA(state.s0,state.s4,state.s8,state.sc); \ - SALSA(state.s5,state.s9,state.sd,state.s1); \ - SALSA(state.sa,state.se,state.s2,state.s6); \ - SALSA(state.sf,state.s3,state.s7,state.sb); \ - SALSA(state.s0,state.s1,state.s2,state.s3); \ - SALSA(state.s5,state.s6,state.s7,state.s4); \ - SALSA(state.sa,state.sb,state.s8,state.s9); \ - SALSA(state.sf,state.sc,state.sd,state.se); \ -} + for (int k = 7; k < 9; k++) { + temp[k] = B0[(k + qbuf) & 0x3f]; + } -#if __CUDA_ARCH__ >=500 -#define CHACHA_STEP(a,b,c,d) { \ - a += b; d = __byte_perm(d^a,0,0x1032); \ - c += d; b = rotateL(b^c, 12); \ - a += b; d = __byte_perm(d^a,0,0x2103); \ - c += d; b = rotateL(b^c, 7); \ -} + uint32_t output; +#if __CUDA_ARCH__ >= 320 + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(output) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); #else -#define CHACHA_STEP(a,b,c,d) { \ - a += b; d = rotateL(d^a,16); \ - c += d; b = rotateL(b^c, 12); \ - a += b; d = rotateL(d^a,8); \ - c += d; b = rotateL(b^c, 7); \ + output = (MAKE_ULONGLONG(temp[7], temp[8]) >> bitbuf); // to check maybe 7/8 reversed +#endif + output ^= input[7] ^ cdata7; + return output; } #endif -#define CHACHA_CORE_PARALLEL(state) { \ - CHACHA_STEP(state.lo.s0, state.lo.s4, state.hi.s0, state.hi.s4); \ - CHACHA_STEP(state.lo.s1, state.lo.s5, state.hi.s1, state.hi.s5); \ - CHACHA_STEP(state.lo.s2, state.lo.s6, state.hi.s2, state.hi.s6); \ - CHACHA_STEP(state.lo.s3, state.lo.s7, state.hi.s3, state.hi.s7); \ - CHACHA_STEP(state.lo.s0, state.lo.s5, state.hi.s2, state.hi.s7); \ - CHACHA_STEP(state.lo.s1, state.lo.s6, state.hi.s3, state.hi.s4); \ - CHACHA_STEP(state.lo.s2, state.lo.s7, state.hi.s0, state.hi.s5); \ - CHACHA_STEP(state.lo.s3, state.lo.s4, state.hi.s1, state.hi.s6); \ -} +#if __CUDA_ARCH__ >= 500 +static __forceinline__ __device__ +uint32_t fastkdf32_v3(uint32_t thread, const uint32_t nonce, uint32_t* const salt, uint32_t* const s_data) +{ + const uint32_t cdata7 = c_data[7]; + const uint32_t data18 = c_data[18]; + const uint32_t data20 = c_data[0]; + uint32_t* B0 = (uint32_t*)&s_data[threadIdx.x * 64U]; + ((uintx64*)B0)[0] = ((uintx64*)salt)[0]; -static __forceinline__ __device__ uint16 salsa_small_scalar_rnd(const uint16 &X) -{ - uint16 state = X; - uint32_t t; + uint32_t input[BLAKE2S_BLOCK_SIZE / 4]; + ((uint816*)input)[0] = ((uint816*)c_data)[0]; - for (int i = 0; i < 10; ++i) { SALSA_CORE(state);} + uint32_t key[BLAKE2S_BLOCK_SIZE / 4]; + ((uint4x2*)key)[0] = ((uint4x2*)salt)[0]; + ((uint4*)key)[2] = make_uint4(0, 0, 0, 0); + ((uint4*)key)[3] = make_uint4(0, 0, 0, 0); - return(X + state); -} + uint32_t qbuf, rbuf, bitbuf; + uint32_t temp[9]; -static __device__ __forceinline__ uint16 chacha_small_parallel_rnd(const uint16 &X) -{ - uint16 st = X; +#pragma nounroll + for (int i = 0; i < 31; i++) + { + Blake2S_v2(input, input, key); + + uint32_t bufidx = 0; +#pragma unroll + for (int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + uint32_t shifted; + uint32_t shift = 32U - bitbuf; + asm("shl.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input[0]), "r"(bitbuf)); + temp[0] = B0[(0 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[0]), "r"(input[1]), "r"(shift)); + temp[1] = B0[(1 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[1]), 
"r"(input[2]), "r"(shift)); + temp[2] = B0[(2 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[2]), "r"(input[3]), "r"(shift)); + temp[3] = B0[(3 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[3]), "r"(input[4]), "r"(shift)); + temp[4] = B0[(4 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[4]), "r"(input[5]), "r"(shift)); + temp[5] = B0[(5 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[5]), "r"(input[6]), "r"(shift)); + temp[6] = B0[(6 + qbuf) & 0x3f] ^ shifted; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(shifted) : "r"(input[6]), "r"(input[7]), "r"(shift)); + temp[7] = B0[(7 + qbuf) & 0x3f] ^ shifted; + asm("shr.b32 %0, %1, %2;" : "=r"(shifted) : "r"(input[7]), "r"(shift)); + temp[8] = B0[(8 + qbuf) & 0x3f] ^ shifted; + + uint32_t a = c_data[qbuf & 0x3f], b; +#pragma unroll + for (int k = 0; k<16; k += 2) + { + b = c_data[(qbuf + k + 1) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k]) : "r"(a), "r"(b), "r"(bitbuf)); + a = c_data[(qbuf + k + 2) & 0x3f]; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[k + 1]) : "r"(b), "r"(a), "r"(bitbuf)); + } + + const uint32_t noncepos = 19U - qbuf % 20U; + if (noncepos <= 16U && qbuf < 60U) + { + if (noncepos != 0) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos - 1]) : "r"(data18), "r"(nonce), "r"(bitbuf)); + if (noncepos != 16U) + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(input[noncepos]) : "r"(nonce), "r"(data20), "r"(bitbuf)); + } - for (int i = 0; i < 10; ++i) {CHACHA_CORE_PARALLEL(st);} - return(X + st); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[0]) : "r"(temp[0]), "r"(temp[1]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[1]) : "r"(temp[1]), "r"(temp[2]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[2]) : "r"(temp[2]), "r"(temp[3]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[3]) : "r"(temp[3]), "r"(temp[4]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[4]) : "r"(temp[4]), "r"(temp[5]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[5]) : "r"(temp[5]), "r"(temp[6]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[6]) : "r"(temp[6]), "r"(temp[7]), "r"(bitbuf)); + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(key[7]) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + +#pragma unroll + for (int k = 0; k < 9; k++) + { + B0[(k + qbuf) & 0x3f] = temp[k]; + } + } + + Blake2S_v2(input, input, key); + + uint32_t bufidx = 0; +#pragma unroll + for (int x = 0; x < BLAKE2S_OUT_SIZE / 4; ++x) + { + uint32_t bufhelper = (input[x] & 0x00ff00ff) + ((input[x] & 0xff00ff00) >> 8); + bufhelper = bufhelper + (bufhelper >> 16); + bufidx += bufhelper; + } + bufidx &= 0x000000ff; + qbuf = bufidx >> 2; + rbuf = bufidx & 3; + bitbuf = rbuf << 3; + + temp[7] = B0[(qbuf + 7) & 0x3f]; + temp[8] = B0[(qbuf + 8) & 0x3f]; + + uint32_t output; + asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(output) : "r"(temp[7]), "r"(temp[8]), "r"(bitbuf)); + output ^= input[7] ^ cdata7; + return output; } +#endif -static __device__ __forceinline__ void neoscrypt_chacha(uint16 *XV) -{ - XV[0] ^= XV[3]; - uint16 temp; - XV[0] = chacha_small_parallel_rnd(XV[0]); XV[1] ^= XV[0]; - temp = chacha_small_parallel_rnd(XV[1]); XV[2] ^= temp; - XV[1] = chacha_small_parallel_rnd(XV[2]); XV[3] ^= XV[1]; - XV[3] = chacha_small_parallel_rnd(XV[3]); - XV[2] = temp; 
+#define BLAKE_Ghost(idx0, idx1, a, b, c, d, key) { \ + idx = BLAKE2S_SIGMA_host[idx0][idx1]; a += key[idx]; \ + a += b; d = ROTR32(d^a,16); \ + c += d; b = ROTR32(b^c, 12); \ + idx = BLAKE2S_SIGMA_host[idx0][idx1+1]; a += key[idx]; \ + a += b; d = ROTR32(d^a,8); \ + c += d; b = ROTR32(b^c, 7); \ } -static __device__ __forceinline__ void neoscrypt_salsa(uint16 *XV) +static void Blake2Shost(uint32_t * inout, const uint32_t * inkey) { - XV[0] ^= XV[3]; - uint16 temp; + uint16 V; + uint32_t idx; + uint8 tmpblock; - XV[0] = salsa_small_scalar_rnd(XV[0]); XV[1] ^= XV[0]; - temp = salsa_small_scalar_rnd(XV[1]); XV[2] ^= temp; - XV[1] = salsa_small_scalar_rnd(XV[2]); XV[3] ^= XV[1]; - XV[3] = salsa_small_scalar_rnd(XV[3]); - XV[2] = temp; -} + V.hi = BLAKE2S_IV_Vechost; + V.lo = BLAKE2S_IV_Vechost; + V.lo.s0 ^= 0x01012020; + // Copy input block for later + tmpblock = V.lo; -#define SHIFT 130 + V.hi.s4 ^= BLAKE2S_BLOCK_SIZE; -__global__ __launch_bounds__(128, 1) -void neoscrypt_gpu_hash_k0(uint32_t threads, uint32_t startNonce, int stratum) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); -// if (thread < threads) + for (int x = 0; x < 10; ++x) { - uint32_t data[80]; - uint16 X[4]; - uint32_t shift = thread * SHIFT * 16; - const uint32_t nonce = startNonce + thread; - - for (int i = 0; i<20; i++) { - ((uint4*)data)[i] = ((uint4 *)c_data)[i]; - } //ld.local.v4 - data[19] = (stratum) ? cuda_swab32(nonce) : nonce; //freaking morons !!! - data[39] = data[19]; - data[59] = data[19]; - - fastkdf256(data, (uint8_t*)X); - - ((uintx64 *)(W + shift))[0] = ((uintx64 *)X)[0]; -// ((ulonglong16 *)(W + shift))[0] = ((ulonglong16 *)X)[0]; + BLAKE_Ghost(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inkey); + BLAKE_Ghost(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inkey); + BLAKE_Ghost(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inkey); + BLAKE_Ghost(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inkey); + BLAKE_Ghost(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inkey); + BLAKE_Ghost(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inkey); + BLAKE_Ghost(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inkey); + BLAKE_Ghost(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inkey); } + + V.lo ^= V.hi; + V.lo ^= tmpblock; + + V.hi = BLAKE2S_IV_Vechost; + tmpblock = V.lo; + + V.hi.s4 ^= 128; + V.hi.s6 = ~V.hi.s6; + + for (int x = 0; x < 10; ++x) + { + BLAKE_Ghost(x, 0x00, V.lo.s0, V.lo.s4, V.hi.s0, V.hi.s4, inout); + BLAKE_Ghost(x, 0x02, V.lo.s1, V.lo.s5, V.hi.s1, V.hi.s5, inout); + BLAKE_Ghost(x, 0x04, V.lo.s2, V.lo.s6, V.hi.s2, V.hi.s6, inout); + BLAKE_Ghost(x, 0x06, V.lo.s3, V.lo.s7, V.hi.s3, V.hi.s7, inout); + BLAKE_Ghost(x, 0x08, V.lo.s0, V.lo.s5, V.hi.s2, V.hi.s7, inout); + BLAKE_Ghost(x, 0x0A, V.lo.s1, V.lo.s6, V.hi.s3, V.hi.s4, inout); + BLAKE_Ghost(x, 0x0C, V.lo.s2, V.lo.s7, V.hi.s0, V.hi.s5, inout); + BLAKE_Ghost(x, 0x0E, V.lo.s3, V.lo.s4, V.hi.s1, V.hi.s6, inout); + } + + V.lo ^= V.hi ^ tmpblock; + + ((uint8*)inout)[0] = V.lo; } -__global__ __launch_bounds__(128, 1) -void neoscrypt_gpu_hash_k01(uint32_t threads, uint32_t startNonce) + +#define SHIFT 128U +#define TPB 32 +#define TPB2 64 + +__global__ +__launch_bounds__(TPB2, 1) +void neoscrypt_gpu_hash_start(const int stratum, const uint32_t startNonce) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); -// if (thread < threads) - { - uint16 X[4]; - uint32_t shift = thread * SHIFT * 16; - ((uintx64 *)X)[0]= ldg256(&(W + shift)[0]); + __shared__ uint32_t s_data[64 * TPB2]; - //#pragma unroll - for (int i = 0; i < 128; ++i) - { - 
neoscrypt_chacha(X); - ((ulonglong16 *)(W + shift))[i+1] = ((ulonglong16 *)X)[0]; -// ((uintx64 *)(W + shift))[i + 1] = ((uintx64 *)X)[0]; - } - } + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t nonce = startNonce + thread; + const uint32_t ZNonce = (stratum) ? cuda_swab32(nonce) : nonce; //freaking morons !!! + + __syncthreads(); +#if __CUDA_ARCH__ < 500 + fastkdf256_v1(thread, ZNonce, s_data); +#else + fastkdf256_v2(thread, ZNonce, s_data); +#endif } -__global__ __launch_bounds__(128, 1) -void neoscrypt_gpu_hash_k2(uint32_t threads, uint32_t startNonce) +__global__ +__launch_bounds__(TPB, 1) +void neoscrypt_gpu_hash_chacha1() { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); -// if (thread < threads) - { - uint16 X[4]; - uint32_t shift = thread * SHIFT * 16; - ((uintx64 *)X)[0] = ldg256(&(W + shift)[2048]); + const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y); + const uint32_t shift = SHIFT * 8U * (thread & 8191); + const uint32_t shiftTr = 8U * thread; - for (int t = 0; t < 128; t++) - { - int idx = X[3].lo.s0 & 0x7F; - ((uintx64 *)X)[0] ^= ldg256(&(W + shift)[idx << 4]); - neoscrypt_chacha(X); + uint4 X[4]; + for (int i = 0; i < 4; i++) + { + X[i].x = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + 0 * 4 + threadIdx.x); + X[i].y = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + 1 * 4 + threadIdx.x); + X[i].z = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + 2 * 4 + threadIdx.x); + X[i].w = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + 3 * 4 + threadIdx.x); + } - } - ((uintx64 *)(W + shift))[129] = ((uintx64*)X)[0]; // best checked +#pragma nounroll + for (int i = 0; i < 128; i++) + { + uint32_t offset = shift + i * 8U; + for (int j = 0; j < 4; j++) + ((uint4*)(W + offset))[j * 4 + threadIdx.x] = X[j]; + neoscrypt_chacha(X); + } +#pragma nounroll + for (int t = 0; t < 128; t++) + { + uint32_t offset = shift + (WarpShuffle(X[3].x, 0, 4) & 0x7F) * 8U; + for (int j = 0; j < 4; j++) + X[j] ^= ((uint4*)(W + offset))[j * 4 + threadIdx.x]; + neoscrypt_chacha(X); + } +#pragma unroll + for (int i = 0; i < 4; i++) + { + *((uint32_t*)&(Tr + shiftTr)[i * 2] + 0 * 4 + threadIdx.x) = X[i].x; + *((uint32_t*)&(Tr + shiftTr)[i * 2] + 1 * 4 + threadIdx.x) = X[i].y; + *((uint32_t*)&(Tr + shiftTr)[i * 2] + 2 * 4 + threadIdx.x) = X[i].z; + *((uint32_t*)&(Tr + shiftTr)[i * 2] + 3 * 4 + threadIdx.x) = X[i].w; } } -__global__ __launch_bounds__(128, 1) -void neoscrypt_gpu_hash_k3(uint32_t threads, uint32_t startNonce) +__global__ +__launch_bounds__(TPB, 1) +void neoscrypt_gpu_hash_salsa1() { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); -// if (thread < threads) + const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y); + const uint32_t shift = SHIFT * 8U * (thread & 8191); + const uint32_t shiftTr = 8U * thread; + + uint4 Z[4]; + for (int i = 0; i < 4; i++) { - uint32_t shift = thread * SHIFT * 16; - uint16 Z[4]; + Z[i].x = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + ((0 + threadIdx.x) & 3) * 4 + threadIdx.x); + Z[i].y = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + ((1 + threadIdx.x) & 3) * 4 + threadIdx.x); + Z[i].z = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + ((2 + threadIdx.x) & 3) * 4 + threadIdx.x); + Z[i].w = __ldg((uint32_t*)&(Input + shiftTr)[i * 2] + ((3 + threadIdx.x) & 3) * 4 + threadIdx.x); + } - ((uintx64*)Z)[0] = ldg256(&(W + shift)[0]); +#pragma nounroll + for (int i = 0; i < 128; i++) + { + uint32_t offset = shift + i * 8U; + for (int j = 0; j < 4; j++) + ((uint4*)(W + offset))[j * 4 + threadIdx.x] = Z[j]; + 
neoscrypt_salsa(Z); + } - //#pragma unroll - for (int i = 0; i < 128; ++i) { - neoscrypt_salsa(Z); - ((ulonglong16 *)(W + shift))[i+1] = ((ulonglong16 *)Z)[0]; -// ((uintx64 *)(W + shift))[i + 1] = ((uintx64 *)Z)[0]; - } +#pragma nounroll + for (int t = 0; t < 128; t++) + { + uint32_t offset = shift + (WarpShuffle(Z[3].x, 0, 4) & 0x7F) * 8U; + for (int j = 0; j < 4; j++) + Z[j] ^= ((uint4*)(W + offset))[j * 4 + threadIdx.x]; + neoscrypt_salsa(Z); + } +#pragma unroll + for (int i = 0; i < 4; i++) + { + *((uint32_t*)&(Tr2 + shiftTr)[i * 2] + ((0 + threadIdx.x) & 3) * 4 + threadIdx.x) = Z[i].x; + *((uint32_t*)&(Tr2 + shiftTr)[i * 2] + ((1 + threadIdx.x) & 3) * 4 + threadIdx.x) = Z[i].y; + *((uint32_t*)&(Tr2 + shiftTr)[i * 2] + ((2 + threadIdx.x) & 3) * 4 + threadIdx.x) = Z[i].z; + *((uint32_t*)&(Tr2 + shiftTr)[i * 2] + ((3 + threadIdx.x) & 3) * 4 + threadIdx.x) = Z[i].w; } } -__global__ __launch_bounds__(128, 1) -void neoscrypt_gpu_hash_k4(uint32_t threads, uint32_t startNonce, uint32_t *nonceRes, int stratum) +__global__ +__launch_bounds__(TPB2, 8) +void neoscrypt_gpu_hash_ending(const int stratum, const uint32_t startNonce, uint32_t *resNonces) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - const uint32_t nonce = startNonce + thread; + __shared__ uint32_t s_data[64 * TPB2]; - uint32_t shift = thread * SHIFT * 16; - uint16 Z[4]; - uint32_t outbuf[8]; - uint32_t data[80]; + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t shiftTr = thread * 8U; + const uint32_t nonce = startNonce + thread; + const uint32_t ZNonce = (stratum) ? cuda_swab32(nonce) : nonce; - for (int i=0; i<20; i++) { - ((uint4*)data)[i] = ((uint4 *)c_data)[i]; - } + __syncthreads(); - data[19] = (stratum) ? cuda_swab32(nonce) : nonce; - data[39] = data[19]; - data[59] = data[19]; - ((uintx64 *)Z)[0] = ldg256(&(W + shift)[2048]); - for (int t = 0; t < 128; t++) - { - int idx = Z[3].lo.s0 & 0x7F; - ((uintx64 *)Z)[0] ^= ldg256(&(W + shift)[idx << 4]); - neoscrypt_salsa(Z); - } - ((uintx64 *)Z)[0] ^= ldg256(&(W + shift)[2064]); - fastkdf32(data, (uint32_t*)Z, outbuf); -#if __CUDA_ARCH__ < 320 - // workaround required when using SM 3.0 shift256R() func (tested on SM 5.0) - if (thread == 0) - printf("", outbuf[7]); + uint2x4 Z[8]; +#pragma unroll + for (int i = 0; i<8; i++) + Z[i] = __ldg4(&(Tr2 + shiftTr)[i]) ^ __ldg4(&(Tr + shiftTr)[i]); + +#if __CUDA_ARCH__ < 500 + uint32_t outbuf = fastkdf32_v1(thread, ZNonce, (uint32_t*)Z, s_data); +#else + uint32_t outbuf = fastkdf32_v3(thread, ZNonce, (uint32_t*)Z, s_data); #endif - if (outbuf[7] <= pTarget[7]) { - atomicMin(nonceRes, nonce); // init val is UINT32_MAX - } + + if (outbuf <= c_target[1]) + { + resNonces[0] = nonce; + //uint32_t tmp = atomicExch(resNonces, nonce); + //if(tmp != UINT32_MAX) + // resNonces[1] = tmp; } } +static __thread uint32_t *hash1 = NULL; +static __thread uint32_t *Trans1 = NULL; +static __thread uint32_t *Trans2 = NULL; // 2 streams +static __thread uint32_t *Trans3 = NULL; // 2 streams + __host__ -void neoscrypt_cpu_init(int thr_id, uint32_t threads) +void neoscrypt_init_2stream(int thr_id, uint32_t threads) { - cuda_get_arch(thr_id); - cudaMalloc(&d_NNonce[thr_id], sizeof(uint32_t)); - CUDA_SAFE_CALL(cudaMalloc(&d_buffer[thr_id], (size_t) 256 * SHIFT * threads)); - cudaMemcpyToSymbol(W, &d_buffer[thr_id], sizeof(uint4*), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(BLAKE2S_SIGMA, BLAKE2S_SIGMA_host, sizeof(BLAKE2S_SIGMA_host), 0, cudaMemcpyHostToDevice); + 
CUDA_SAFE_CALL(cudaMalloc(&d_NNonce[thr_id], 2 * sizeof(uint32_t))); + CUDA_SAFE_CALL(cudaMalloc(&hash1, 32 * 128 * sizeof(uint64_t) * min(8192, threads))); + CUDA_SAFE_CALL(cudaMalloc(&Trans1, 32 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&Trans2, 32 * sizeof(uint64_t) * threads)); + CUDA_SAFE_CALL(cudaMalloc(&Trans3, 32 * sizeof(uint64_t) * threads)); + + CUDA_SAFE_CALL(cudaMemcpyToSymbol(W, &hash1, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(Tr, &Trans1, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(Tr2, &Trans2, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(Input, &Trans3, sizeof(uint2x4*), 0, cudaMemcpyHostToDevice)); } __host__ -void neoscrypt_cpu_free(int thr_id) +void neoscrypt_free_2stream(int thr_id) { cudaFree(d_NNonce[thr_id]); - cudaFree(d_buffer[thr_id]); + + cudaFree(hash1); + cudaFree(Trans1); + cudaFree(Trans2); + cudaFree(Trans3); + } __host__ -uint32_t neoscrypt_cpu_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, int have_stratum, int order) +void neoscrypt_hash_k4_2stream(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum) { - uint32_t result[MAX_GPUS]; - memset(result, 0xff, sizeof(result)); - cudaMemset(d_NNonce[thr_id], 0xff, sizeof(uint32_t)); + CUDA_SAFE_CALL(cudaMemset(d_NNonce[thr_id], 0xff, 2 * sizeof(uint32_t))); - const uint32_t threadsperblock = 128; + const int threadsperblock = TPB; dim3 grid((threads + threadsperblock - 1) / threadsperblock); dim3 block(threadsperblock); - neoscrypt_gpu_hash_k0 <<< grid, block >>>(threads, startNounce, have_stratum); - neoscrypt_gpu_hash_k01 <<< grid, block >>>(threads, startNounce); - neoscrypt_gpu_hash_k2 <<< grid, block >>>(threads, startNounce); - neoscrypt_gpu_hash_k3 <<< grid, block >>>(threads, startNounce); - neoscrypt_gpu_hash_k4 <<< grid, block >>>(threads, startNounce, d_NNonce[thr_id], have_stratum); + const int threadsperblock2 = TPB2; + dim3 grid2((threads + threadsperblock2 - 1) / threadsperblock2); + dim3 block2(threadsperblock2); - MyStreamSynchronize(NULL, order, thr_id); - cudaMemcpy(&result[thr_id], d_NNonce[thr_id], sizeof(uint32_t), cudaMemcpyDeviceToHost); + dim3 grid3((threads * 4 + threadsperblock - 1) / threadsperblock); + dim3 block3(4, threadsperblock >> 2); - return result[thr_id]; + neoscrypt_gpu_hash_start <<< grid2, block2 >>> (stratum, startNounce); //fastkdf + + neoscrypt_gpu_hash_salsa1 <<< grid3, block3 >>> (); + neoscrypt_gpu_hash_chacha1 <<< grid3, block3 >>> (); + + neoscrypt_gpu_hash_ending <<< grid2, block2 >>> (stratum, startNounce, d_NNonce[thr_id]); //fastkdf+end + + CUDA_SAFE_CALL(cudaMemcpy(resNonces, d_NNonce[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); } __host__ -void neoscrypt_setBlockTarget(uint32_t* pdata, const void *target) +void neoscrypt_setBlockTarget(uint32_t* const pdata, uint32_t* const target) { - unsigned char PaddedMessage[80*4]; //bring balance to the force + uint32_t PaddedMessage[64]; uint32_t input[16], key[16] = { 0 }; - memcpy(PaddedMessage, pdata, 80); - memcpy(PaddedMessage + 80, pdata, 80); - memcpy(PaddedMessage + 160, pdata, 80); - memcpy(PaddedMessage + 240, pdata, 80); + for (int i = 0; i < 19; i++) + { + PaddedMessage[i] = pdata[i]; + PaddedMessage[i + 20] = pdata[i]; + PaddedMessage[i + 40] = pdata[i]; + } + for (int i = 0; i<4; i++) + PaddedMessage[i + 60] = pdata[i]; + + PaddedMessage[19] = 0; + PaddedMessage[39] = 0; + PaddedMessage[59] = 0; ((uint16*)input)[0] = ((uint16*)pdata)[0]; ((uint8*)key)[0] = 
((uint8*)pdata)[0]; - Blake2Shost(input,key); + Blake2Shost(input, key); - cudaMemcpyToSymbol(pTarget, target, 32, 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(input_init, input, sizeof(input), 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(key_init, key, sizeof(key), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(input_init, input, 64, 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(key_init, key, 64, 0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(c_data, PaddedMessage, 80*4, 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_target, &target[6], 2 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_data, PaddedMessage, 64 * sizeof(uint32_t), 0, cudaMemcpyHostToDevice); + CUDA_SAFE_CALL(cudaGetLastError()); } diff --git a/neoscrypt/cuda_vectors.h b/neoscrypt/cuda_vectors.h index 08fc0ee..3799b74 100644 --- a/neoscrypt/cuda_vectors.h +++ b/neoscrypt/cuda_vectors.h @@ -482,7 +482,7 @@ static __forceinline__ __device__ uint32_t rotateR(uint32_t vec4, uint32_t shift // require a uint32_t[9] ret array // note: djm neoscrypt implementation is near the limits of gpu capabilities // and weird behaviors can happen when tuning device functions code... -__device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift) +__device__ __forceinline__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift) { uint8_t *v = (uint8_t*) &vec4.s0; uint8_t *r = (uint8_t*) ret; @@ -496,7 +496,7 @@ __device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift) #else // same for SM 3.5+, really faster ? -__device__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift) +__device__ __forceinline__ void shift256R(uint32_t* ret, const uint8 &vec4, uint32_t shift) { uint32_t truc = 0, truc2 = cuda_swab32(vec4.s7), truc3 = 0; asm("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(truc) : "r"(truc3), "r"(truc2), "r"(shift)); diff --git a/neoscrypt/neoscrypt.cpp b/neoscrypt/neoscrypt.cpp index b0cb1cb..ab59f19 100644 --- a/neoscrypt/neoscrypt.cpp +++ b/neoscrypt/neoscrypt.cpp @@ -1,11 +1,14 @@ #include -#include "miner.h" -#include "neoscrypt/neoscrypt.h" +#include +#include -extern void neoscrypt_setBlockTarget(uint32_t * data, const void *ptarget); -extern void neoscrypt_cpu_init(int thr_id, uint32_t threads); -extern void neoscrypt_cpu_free(int thr_id); -extern uint32_t neoscrypt_cpu_hash_k4(int thr_id, uint32_t threads, uint32_t startNounce, int have_stratum, int order); +#include "neoscrypt.h" + +extern void neoscrypt_setBlockTarget(uint32_t* const data, uint32_t* const ptarget); + +extern void neoscrypt_init_2stream(int thr_id, uint32_t threads); +extern void neoscrypt_free_2stream(int thr_id); +extern void neoscrypt_hash_k4_2stream(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *resNonces, bool stratum); static bool init[MAX_GPUS] = { 0 }; @@ -18,6 +21,17 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign int dev_id = device_map[thr_id]; int intensity = is_windows() ? 
18 : 19; + // Pascal + if (strstr(device_name[dev_id], "GTX 10")) intensity = 22; + // Maxwell + else if (strstr(device_name[dev_id], "TITAN X")) intensity = 21; + else if (strstr(device_name[dev_id], "980")) intensity = 21; + else if (strstr(device_name[dev_id], "970")) intensity = 20; + else if (strstr(device_name[dev_id], "960")) intensity = 20; + else if (strstr(device_name[dev_id], "950")) intensity = 19; + else if (strstr(device_name[dev_id], "750 Ti")) intensity = 19; + else if (strstr(device_name[dev_id], "750")) intensity = 19; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); throughput = throughput / 32; /* set for max intensity ~= 20 */ api_set_throughput(thr_id, throughput); @@ -31,16 +45,20 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign { cudaDeviceSynchronize(); cudaSetDevice(dev_id); - cudaDeviceSetCacheConfig(cudaFuncCachePreferL1); - cudaGetLastError(); // reset errors if device is not "reset" + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + cudaGetLastError(); // reset errors if device is not "reset" + } if (device_sm[dev_id] <= 300) { - applog(LOG_ERR, "Sorry neoscrypt is not supported on SM 3.0 devices"); + gpulog(LOG_ERR, thr_id, "Sorry neoscrypt is not supported on SM 3.0 devices"); proper_exit(EXIT_CODE_CUDA_ERROR); } - applog(LOG_INFO, "GPU #%d: Using %d cuda threads", dev_id, throughput); - neoscrypt_cpu_init(thr_id, throughput); + gpulog(LOG_INFO, thr_id, "Using %d cuda threads", throughput); + neoscrypt_init_2stream(thr_id, throughput); init[thr_id] = true; } @@ -48,34 +66,39 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign if (have_stratum) { for (int k = 0; k < 20; k++) be32enc(&endiandata[k], pdata[k]); - } else { + } + else { for (int k = 0; k < 20; k++) endiandata[k] = pdata[k]; } - neoscrypt_setBlockTarget(endiandata,ptarget); + neoscrypt_setBlockTarget(endiandata, ptarget); do { - uint32_t foundNonce = neoscrypt_cpu_hash_k4(thr_id, throughput, pdata[19], have_stratum, 0); - if (foundNonce != UINT32_MAX) - { - uint32_t _ALIGN(64) vhash64[8]; + uint32_t foundNonces[2] = { UINT32_MAX, UINT32_MAX }; + neoscrypt_hash_k4_2stream(thr_id, throughput, pdata[19], foundNonces, have_stratum); - *hashes_done = pdata[19] - first_nonce + 1; + *hashes_done = pdata[19] - first_nonce + throughput; + + if (foundNonces[0] != UINT32_MAX) + { + uint32_t _ALIGN(64) vhash[8]; if (have_stratum) { - be32enc(&endiandata[19], foundNonce); - } else { - endiandata[19] = foundNonce; + be32enc(&endiandata[19], foundNonces[0]); + } + else { + endiandata[19] = foundNonces[0]; } - neoscrypt((uchar*)vhash64, (uchar*) endiandata, 0x80000620U); + neoscrypt((uchar*)vhash, (uchar*)endiandata, 0x80000620U); - if (vhash64[7] <= ptarget[7] && fulltest(vhash64, ptarget)) { - work_set_target_ratio(work, vhash64); - pdata[19] = foundNonce; + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + work_set_target_ratio(work, vhash); + pdata[19] = foundNonces[0]; return 1; - } else { - gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce); + } + else { + gpulog(LOG_WARNING, thr_id, "nonce %08x does not validate on CPU!", foundNonces[0]); } } @@ -100,8 +123,9 @@ void free_neoscrypt(int thr_id) cudaThreadSynchronize(); - neoscrypt_cpu_free(thr_id); + neoscrypt_free_2stream(thr_id); init[thr_id] = false; cudaDeviceSynchronize(); } + diff --git a/nvml.cpp b/nvml.cpp index 
74f2994..2d3a12d 100644 --- a/nvml.cpp +++ b/nvml.cpp @@ -49,7 +49,7 @@ uint32_t limit_prev[MAX_GPUS] = { 0 }; static void *wrap_dlopen(const char *filename) { HMODULE h = LoadLibrary(filename); if (!h && opt_debug) { - applog(LOG_DEBUG, "dlopen(%d): failed to load %s", + applog(LOG_DEBUG, "dlopen(%d): failed to load %s", GetLastError(), filename); } return (void*)h; @@ -68,7 +68,7 @@ uint32_t limit_prev[MAX_GPUS] = { 0 }; static void *wrap_dlopen(const char *filename) { void *h = dlopen(filename, RTLD_NOW); if (h == NULL && opt_debug) { - applog(LOG_DEBUG, "dlopen(%d): failed to load %s", + applog(LOG_DEBUG, "dlopen(%d): failed to load %s", errno, filename); } return (void*)h; diff --git a/quark/cuda_quark_blake512_sp.cuh b/quark/cuda_quark_blake512_sp.cuh index 069620a..64f84fa 100644 --- a/quark/cuda_quark_blake512_sp.cuh +++ b/quark/cuda_quark_blake512_sp.cuh @@ -21,12 +21,7 @@ static __device__ __forceinline__ uint2 cuda_swap(uint2 v) { v.y = t; return v; } -static __device__ __forceinline__ uint2 eorswap32(uint2 u, uint2 v) { - uint2 result; - result.y = u.x ^ v.x; - result.x = u.y ^ v.y; - return result; -} + __constant__ uint2 c_512_u2[16] = { diff --git a/util.cpp b/util.cpp index df32394..ae08cd5 100644 --- a/util.cpp +++ b/util.cpp @@ -559,7 +559,7 @@ static json_t *json_rpc_call(CURL *curl, const char *url, res_val = json_object_get(val, "result"); err_val = json_object_get(val, "error"); - if (!res_val || json_is_null(res_val) || + if (!res_val || //json_is_null(res_val) || (err_val && !json_is_null(err_val))) { char *s = NULL;