From dad01105578318b1b3fb4a3575e222e5ffdae670 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sun, 8 May 2016 17:31:20 +0200 Subject: [PATCH] x17 cleanup haval256 is now 2x faster, but sha512 perf depends a lot on cuda version... --- Makefile.am | 2 +- ccminer.vcxproj | 2 +- ccminer.vcxproj.filters | 4 +- cuda_helper.h | 112 +++++------ x17/cuda_x17_haval256.cu | 351 ++++++++++++++++++++++++++++++++++ x17/cuda_x17_haval512.cu | 400 --------------------------------------- x17/cuda_x17_sha512.cu | 244 +++++++++--------------- x17/x17.cu | 18 +- 8 files changed, 504 insertions(+), 629 deletions(-) create mode 100644 x17/cuda_x17_haval256.cu delete mode 100644 x17/cuda_x17_haval512.cu diff --git a/Makefile.am b/Makefile.am index 9b80922..51a998c 100644 --- a/Makefile.am +++ b/Makefile.am @@ -57,7 +57,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \ x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \ x15/whirlpool.cu \ - x17/x17.cu x17/cuda_x17_haval512.cu x17/cuda_x17_sha512.cu \ + x17/x17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \ x11/c11.cu x11/s3.cu x11/sib.cu x11/cuda_streebog.cu # scrypt diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 18dec13..0460c24 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -508,7 +508,7 @@ - + 80 diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 42c1287..ed942f3 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -595,7 +595,7 @@ Source Files\CUDA\x15 - + Source Files\CUDA\x17 @@ -728,4 +728,4 @@ Ressources - \ No newline at end of file + diff --git a/cuda_helper.h b/cuda_helper.h index ef9ba82..1358892 100644 --- a/cuda_helper.h +++ b/cuda_helper.h @@ -133,8 +133,16 @@ __device__ __forceinline__ uint64_t cuda_swab64(uint64_t x) { // Input: 77665544 33221100 // Output: 00112233 44556677 - uint64_t result = __byte_perm((uint32_t) x, 0, 0x0123); - return (result << 32) | __byte_perm(_HIDWORD(x), 0, 0x0123); + uint64_t result; + //result = __byte_perm((uint32_t) x, 0, 0x0123); + //return (result << 32) + __byte_perm(_HIDWORD(x), 0, 0x0123); + asm("{ .reg .b32 x, y; // swab64\n\t" + "mov.b64 {x,y}, %1;\n\t" + "prmt.b32 x, x, 0, 0x0123;\n\t" + "prmt.b32 y, y, 0, 0x0123;\n\t" + "mov.b64 %0, {y,x};\n\t" + "}\n" : "=l"(result): "l"(x)); + return result; } #else /* host */ @@ -198,7 +206,7 @@ __device__ __forceinline__ uint64_t xor1(uint64_t a, uint64_t b) { uint64_t result; - asm("xor.b64 %0, %1, %2;" : "=l"(result) : "l"(a), "l"(b)); + asm("xor.b64 %0, %1, %2; // xor1" : "=l"(result) : "l"(a), "l"(b)); return result; } #else @@ -211,7 +219,7 @@ __device__ __forceinline__ uint64_t xor3(uint64_t a, uint64_t b, uint64_t c) { uint64_t result; - asm("xor.b64 %0, %2, %3;\n\t" + asm("xor.b64 %0, %2, %3; // xor3\n\t" "xor.b64 %0, %0, %1;\n\t" /* output : input registers */ : "=l"(result) : "l"(a), "l"(b), "l"(c)); @@ -246,49 +254,31 @@ uint64_t xandx(uint64_t a, uint64_t b, uint64_t c) { #ifdef __CUDA_ARCH__ uint64_t result; - asm("{\n\t" + asm("{ // xandx \n\t" ".reg .u64 n;\n\t" "xor.b64 %0, %2, %3;\n\t" "and.b64 n, %0, %1;\n\t" - "xor.b64 %0, n, %3;" - "}\n" - : "=l"(result) : "l"(a), "l"(b), "l"(c)); + "xor.b64 %0, n, %3;\n\t" + "}\n" : "=l"(result) : "l"(a), "l"(b), "l"(c)); return result; #else return ((b^c) & a) ^ c; #endif } -// device asm for x17 -__device__ __forceinline__ -uint64_t sph_t64(uint64_t x) -{ -#ifdef __CUDA_ARCH__ - uint64_t result; - asm("{\n\t" - "and.b64 %0,%1,0xFFFFFFFFFFFFFFFF;\n\t" - "}\n" - : "=l"(result) : "l"(x)); - return result; -#else - return x; -#endif -} - // device asm for x17 __device__ __forceinline__ uint64_t andor(uint64_t a, uint64_t b, uint64_t c) { #ifdef __CUDA_ARCH__ uint64_t result; - asm("{\n\t" + asm("{ // andor\n\t" ".reg .u64 m,n;\n\t" "and.b64 m, %1, %2;\n\t" " or.b64 n, %1, %2;\n\t" "and.b64 %0, n, %3;\n\t" - " or.b64 %0, %0, m ;\n\t" - "}\n" - : "=l"(result) : "l"(a), "l"(b), "l"(c)); + " or.b64 %0, %0, m;\n\t" + "}\n" : "=l"(result) : "l"(a), "l"(b), "l"(c)); return result; #else return ((a | b) & c) | (a & b); @@ -302,7 +292,6 @@ uint64_t shr_t64(uint64_t x, uint32_t n) #ifdef __CUDA_ARCH__ uint64_t result; asm("shr.b64 %0,%1,%2;\n\t" - "and.b64 %0,%0,0xFFFFFFFFFFFFFFFF;\n\t" /* useful ? */ : "=l"(result) : "l"(x), "r"(n)); return result; #else @@ -316,7 +305,6 @@ uint64_t shl_t64(uint64_t x, uint32_t n) #ifdef __CUDA_ARCH__ uint64_t result; asm("shl.b64 %0,%1,%2;\n\t" - "and.b64 %0,%0,0xFFFFFFFFFFFFFFFF;\n\t" /* useful ? */ : "=l"(result) : "l"(x), "r"(n)); return result; #else @@ -370,15 +358,14 @@ __device__ __forceinline__ uint64_t ROTR64(const uint64_t x, const int offset) { uint64_t result; - asm("{\n\t" + asm("{ // ROTR64 \n\t" ".reg .b64 lhs;\n\t" ".reg .u32 roff;\n\t" "shr.b64 lhs, %1, %2;\n\t" "sub.u32 roff, 64, %2;\n\t" "shl.b64 %0, %1, roff;\n\t" "add.u64 %0, %0, lhs;\n\t" - "}\n" - : "=l"(result) : "l"(x), "r"(offset)); + "}\n" : "=l"(result) : "l"(x), "r"(offset)); return result; } #else @@ -405,15 +392,14 @@ __device__ __forceinline__ uint64_t ROTL64(const uint64_t x, const int offset) { uint64_t result; - asm("{\n\t" + asm("{ // ROTL64 \n\t" ".reg .b64 lhs;\n\t" ".reg .u32 roff;\n\t" "shl.b64 lhs, %1, %2;\n\t" "sub.u32 roff, 64, %2;\n\t" "shr.b64 %0, %1, roff;\n\t" "add.u64 %0, lhs, %0;\n\t" - "}\n" - : "=l"(result) : "l"(x), "r"(offset)); + "}\n" : "=l"(result) : "l"(x), "r"(offset)); return result; } #elif __CUDA_ARCH__ >= 320 && USE_ROT_ASM_OPT == 3 @@ -421,7 +407,7 @@ __device__ uint64_t ROTL64(const uint64_t x, const int offset) { uint64_t res; - asm("{\n\t" + asm("{ // ROTL64 \n\t" ".reg .u32 tl,th,vl,vh;\n\t" ".reg .pred p;\n\t" "mov.b64 {tl,th}, %1;\n\t" @@ -430,8 +416,7 @@ uint64_t ROTL64(const uint64_t x, const int offset) "setp.lt.u32 p, %2, 32;\n\t" "@!p mov.b64 %0, {vl,vh};\n\t" "@p mov.b64 %0, {vh,vl};\n\t" - "}" - : "=l"(res) : "l"(x) , "r"(offset) + "}\n" : "=l"(res) : "l"(x) , "r"(offset) ); return res; } @@ -498,11 +483,10 @@ static __device__ __forceinline__ void operator^= (uint2 &a, uint2 b) { a = a ^ static __device__ __forceinline__ uint2 operator+ (uint2 a, uint2 b) { #ifdef __CUDA_ARCH__ uint2 result; - asm("{\n\t" - "add.cc.u32 %0,%2,%4; \n\t" - "addc.u32 %1,%3,%5; \n\t" - "}\n\t" - : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); + asm("{ // uint2 a+b \n\t" + "add.cc.u32 %0, %2, %4; \n\t" + "addc.u32 %1, %3, %5; \n\t" + "}\n" : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); return result; #else return vectorize(devectorize(a) + devectorize(b)); @@ -514,11 +498,10 @@ static __device__ __forceinline__ void operator+= (uint2 &a, uint2 b) { a = a + static __device__ __forceinline__ uint2 operator- (uint2 a, uint2 b) { #if defined(__CUDA_ARCH__) && CUDA_VERSION < 7000 uint2 result; - asm("{\n\t" - "sub.cc.u32 %0,%2,%4; \n\t" - "subc.u32 %1,%3,%5; \n\t" - "}\n\t" - : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); + asm("{ // uint2 a-b \n\t" + "sub.cc.u32 %0, %2, %4; \n\t" + "subc.u32 %1, %3, %5; \n\t" + "}\n" : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); return result; #else return vectorize(devectorize(a) - devectorize(b)); @@ -534,13 +517,12 @@ static __device__ __forceinline__ uint2 operator* (uint2 a, uint2 b) { #ifdef __CUDA_ARCH__ uint2 result; - asm("{\n\t" - "mul.lo.u32 %0,%2,%4; \n\t" - "mul.hi.u32 %1,%2,%4; \n\t" - "mad.lo.cc.u32 %1,%3,%4,%1; \n\t" - "madc.lo.u32 %1,%3,%5,%1; \n\t" - "}\n\t" - : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); + asm("{ // uint2 a*b \n\t" + "mul.lo.u32 %0, %2, %4; \n\t" + "mul.hi.u32 %1, %2, %4; \n\t" + "mad.lo.cc.u32 %1, %3, %4, %1; \n\t" + "madc.lo.u32 %1, %3, %5, %1; \n\t" + "}\n" : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(b.x), "r"(b.y)); return result; #else // incorrect but unused host equiv @@ -646,17 +628,15 @@ static uint2 SHL2(uint2 a, int offset) #if __CUDA_ARCH__ > 300 uint2 result; if (offset < 32) { - asm("{\n\t" - "shf.l.clamp.b32 %1,%2,%3,%4; \n\t" - "shl.b32 %0,%2,%4; \n\t" - "}\n\t" - : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); + asm("{ // SHL2 (l) \n\t" + "shf.l.clamp.b32 %1, %2, %3, %4; \n\t" + "shl.b32 %0, %2, %4; \n\t" + "}\n" : "=r"(result.x), "=r"(result.y) : "r"(a.x), "r"(a.y), "r"(offset)); } else { - asm("{\n\t" - "shf.l.clamp.b32 %1,%2,%3,%4; \n\t" - "shl.b32 %0,%2,%4; \n\t" - "}\n\t" - : "=r"(result.x), "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); + asm("{ // SHL2 (h) \n\t" + "shf.l.clamp.b32 %1, %2, %3, %4; \n\t" + "shl.b32 %0, %2, %4; \n\t" + "}\n" : "=r"(result.x), "=r"(result.y) : "r"(a.y), "r"(a.x), "r"(offset)); } return result; #else diff --git a/x17/cuda_x17_haval256.cu b/x17/cuda_x17_haval256.cu new file mode 100644 index 0000000..8573e64 --- /dev/null +++ b/x17/cuda_x17_haval256.cu @@ -0,0 +1,351 @@ +/* + * haval-256 kernel implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2014 djm34 + * 2016 tpruvot + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + */ +#include +#include + +#include "cuda_helper.h" + +#define F1(x6, x5, x4, x3, x2, x1, x0) \ + (((x1) & ((x0) ^ (x4))) ^ ((x2) & (x5)) ^ ((x3) & (x6)) ^ (x0)) + +#define F2(x6, x5, x4, x3, x2, x1, x0) \ + (((x2) & (((x1) & ~(x3)) ^ ((x4) & (x5)) ^ (x6) ^ (x0))) \ + ^ ((x4) & ((x1) ^ (x5))) ^ ((x3 & (x5)) ^ (x0))) + +#define F3(x6, x5, x4, x3, x2, x1, x0) \ + (((x3) & (((x1) & (x2)) ^ (x6) ^ (x0))) \ + ^ ((x1) & (x4)) ^ ((x2) & (x5)) ^ (x0)) + +#define F4(x6, x5, x4, x3, x2, x1, x0) \ + (((x3) & (((x1) & (x2)) ^ ((x4) | (x6)) ^ (x5))) \ + ^ ((x4) & ((~(x2) & (x5)) ^ (x1) ^ (x6) ^ (x0))) \ + ^ ((x2) & (x6)) ^ (x0)) + +#define F5(x6, x5, x4, x3, x2, x1, x0) \ + (((x0) & ~(((x1) & (x2) & (x3)) ^ (x5))) \ + ^ ((x1) & (x4)) ^ ((x2) & (x5)) ^ ((x3) & (x6))) + +#define FP5_1(x6, x5, x4, x3, x2, x1, x0) \ + F1(x3, x4, x1, x0, x5, x2, x6) +#define FP5_2(x6, x5, x4, x3, x2, x1, x0) \ + F2(x6, x2, x1, x0, x3, x4, x5) +#define FP5_3(x6, x5, x4, x3, x2, x1, x0) \ + F3(x2, x6, x0, x4, x3, x1, x5) +#define FP5_4(x6, x5, x4, x3, x2, x1, x0) \ + F4(x1, x5, x3, x2, x0, x4, x6) +#define FP5_5(x6, x5, x4, x3, x2, x1, x0) \ + F5(x2, x5, x0, x6, x4, x3, x1) + +#define STEP(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) { \ + uint32_t t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \ + (x7) = (uint32_t)(ROTR32(t, 7) + ROTR32((x7), 11) + (w) + (c)); \ +} + +#define PASS1(n, in) { \ + STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[ 0], 0U); \ + STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[ 1], 0U); \ + STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[ 2], 0U); \ + STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[ 3], 0U); \ + STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[ 4], 0U); \ + STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[ 5], 0U); \ + STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[ 6], 0U); \ + STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[ 7], 0U); \ + \ + STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[ 8], 0U); \ + STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[ 9], 0U); \ + STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[10], 0U); \ + STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[11], 0U); \ + STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[12], 0U); \ + STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[13], 0U); \ + STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[14], 0U); \ + STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[15], 0U); \ + \ + STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[16], 0U); \ + STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[17], 0U); \ + STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[18], 0U); \ + STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[19], 0U); \ + STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[20], 0U); \ + STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[21], 0U); \ + STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[22], 0U); \ + STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[23], 0U); \ + \ + STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[24], 0U); \ + STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[25], 0U); \ + STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[26], 0U); \ + STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[27], 0U); \ + STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[28], 0U); \ + STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[29], 0U); \ + STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[30], 0U); \ + STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[31], 0U); \ +} + +#define PASS2(n, in) { \ + STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[ 5], 0x452821E6); \ + STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[14], 0x38D01377); \ + STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[26], 0xBE5466CF); \ + STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[18], 0x34E90C6C); \ + STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[11], 0xC0AC29B7); \ + STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[28], 0xC97C50DD); \ + STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[ 7], 0x3F84D5B5); \ + STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[16], 0xB5470917); \ + \ + STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[ 0], 0x9216D5D9); \ + STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[23], 0x8979FB1B); \ + STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[20], 0xD1310BA6); \ + STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[22], 0x98DFB5AC); \ + STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[ 1], 0x2FFD72DB); \ + STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[10], 0xD01ADFB7); \ + STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[ 4], 0xB8E1AFED); \ + STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[ 8], 0x6A267E96); \ + \ + STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[30], 0xBA7C9045); \ + STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[ 3], 0xF12C7F99); \ + STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[21], 0x24A19947); \ + STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[ 9], 0xB3916CF7); \ + STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[17], 0x0801F2E2); \ + STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[24], 0x858EFC16); \ + STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[29], 0x636920D8); \ + STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[ 6], 0x71574E69); \ + \ + STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[19], 0xA458FEA3); \ + STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[12], 0xF4933D7E); \ + STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[15], 0x0D95748F); \ + STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[13], 0x728EB658); \ + STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[ 2], 0x718BCD58); \ + STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[25], 0x82154AEE); \ + STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[31], 0x7B54A41D); \ + STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[27], 0xC25A59B5); \ +} + +#define PASS3(n, in) { \ + STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[19], 0x9C30D539); \ + STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[ 9], 0x2AF26013); \ + STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[ 4], 0xC5D1B023); \ + STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[20], 0x286085F0); \ + STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[28], 0xCA417918); \ + STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[17], 0xB8DB38EF); \ + STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[ 8], 0x8E79DCB0); \ + STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[22], 0x603A180E); \ + \ + STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[29], 0x6C9E0E8B); \ + STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[14], 0xB01E8A3E); \ + STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[25], 0xD71577C1); \ + STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[12], 0xBD314B27); \ + STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[24], 0x78AF2FDA); \ + STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[30], 0x55605C60); \ + STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[16], 0xE65525F3); \ + STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[26], 0xAA55AB94); \ + \ + STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[31], 0x57489862); \ + STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[15], 0x63E81440); \ + STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[ 7], 0x55CA396A); \ + STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[ 3], 0x2AAB10B6); \ + STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[ 1], 0xB4CC5C34); \ + STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[ 0], 0x1141E8CE); \ + STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[18], 0xA15486AF); \ + STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[27], 0x7C72E993); \ + \ + STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[13], 0xB3EE1411); \ + STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[ 6], 0x636FBC2A); \ + STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[21], 0x2BA9C55D); \ + STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[10], 0x741831F6); \ + STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[23], 0xCE5C3E16); \ + STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[11], 0x9B87931E); \ + STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[ 5], 0xAFD6BA33); \ + STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[ 2], 0x6C24CF5C); \ +} + +#define PASS4(n, in) { \ + STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[24], 0x7A325381); \ + STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[ 4], 0x28958677); \ + STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[ 0], 0x3B8F4898); \ + STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[14], 0x6B4BB9AF); \ + STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[ 2], 0xC4BFE81B); \ + STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[ 7], 0x66282193); \ + STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[28], 0x61D809CC); \ + STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[23], 0xFB21A991); \ + \ + STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[26], 0x487CAC60); \ + STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[ 6], 0x5DEC8032); \ + STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[30], 0xEF845D5D); \ + STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[20], 0xE98575B1); \ + STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[18], 0xDC262302); \ + STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[25], 0xEB651B88); \ + STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[19], 0x23893E81); \ + STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[ 3], 0xD396ACC5); \ + \ + STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[22], 0x0F6D6FF3); \ + STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[11], 0x83F44239); \ + STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[31], 0x2E0B4482); \ + STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[21], 0xA4842004); \ + STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[ 8], 0x69C8F04A); \ + STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[27], 0x9E1F9B5E); \ + STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[12], 0x21C66842); \ + STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[ 9], 0xF6E96C9A); \ + \ + STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[ 1], 0x670C9C61); \ + STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[29], 0xABD388F0); \ + STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[ 5], 0x6A51A0D2); \ + STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[15], 0xD8542F68); \ + STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[17], 0x960FA728); \ + STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[10], 0xAB5133A3); \ + STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[16], 0x6EEF0B6C); \ + STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[13], 0x137A3BE4); \ +} + +#define PASS5(n, in) { \ + STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[27], 0xBA3BF050); \ + STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[ 3], 0x7EFB2A98); \ + STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[21], 0xA1F1651D); \ + STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[26], 0x39AF0176); \ + STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[17], 0x66CA593E); \ + STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[11], 0x82430E88); \ + STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[20], 0x8CEE8619); \ + STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[29], 0x456F9FB4); \ + \ + STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[19], 0x7D84A5C3); \ + STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[ 0], 0x3B8B5EBE); \ + STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[12], 0xE06F75D8); \ + STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[ 7], 0x85C12073); \ + STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[13], 0x401A449F); \ + STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[ 8], 0x56C16AA6); \ + STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[31], 0x4ED3AA62); \ + STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[10], 0x363F7706); \ + \ + STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[ 5], 0x1BFEDF72); \ + STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[ 9], 0x429B023D); \ + STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[14], 0x37D0D724); \ + STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[30], 0xD00A1248); \ + STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[18], 0xDB0FEAD3); \ + STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[ 6], 0x49F1C09B); \ + STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[28], 0x075372C9); \ + STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[24], 0x80991B7B); \ + \ + STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[ 2], 0x25D479D8); \ + STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[23], 0xF6E8DEF7); \ + STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[16], 0xE3FE501A); \ + STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[22], 0xB6794C3B); \ + STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[ 4], 0x976CE0BD); \ + STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[ 1], 0x04C006BA); \ + STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[25], 0xC1A94FB6); \ + STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[15], 0x409F60C4); \ +} + +__global__ /* __launch_bounds__(256, 6) */ +void x17_haval256_gpu_hash_64(uint32_t threads, uint64_t *g_hash) +{ + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint64_t *pHash = &g_hash[thread*8U]; + + uint32_t s0, s1, s2, s3, s4, s5, s6, s7; + const uint32_t u0 = s0 = 0x243F6A88; + const uint32_t u1 = s1 = 0x85A308D3; + const uint32_t u2 = s2 = 0x13198A2E; + const uint32_t u3 = s3 = 0x03707344; + const uint32_t u4 = s4 = 0xA4093822; + const uint32_t u5 = s5 = 0x299F31D0; + const uint32_t u6 = s6 = 0x082EFA98; + const uint32_t u7 = s7 = 0xEC4E6C89; + + union { + uint32_t h4[16]; + uint64_t h8[8]; + } hash; + + #pragma unroll + for (int i=0; i<8; i++) { + hash.h8[i] = pHash[i]; + } + +///////// input big ///////////////////// + + uint32_t buf[32]; + + #pragma unroll + for (int i=0; i<16; i++) + buf[i] = hash.h4[i]; + + buf[16] = 0x00000001; + + #pragma unroll + for (int i=17; i<29; i++) + buf[i] = 0; + + buf[29] = 0x40290000; + buf[30] = 0x00000200; + buf[31] = 0; + + PASS1(5, buf); + PASS2(5, buf); + PASS3(5, buf); + PASS4(5, buf); + PASS5(5, buf); + + hash.h4[0] = s0 + u0; + hash.h4[1] = s1 + u1; + hash.h4[2] = s2 + u2; + hash.h4[3] = s3 + u3; + hash.h4[4] = s4 + u4; + hash.h4[5] = s5 + u5; + hash.h4[6] = s6 + u6; + hash.h4[7] = s7 + u7; + + pHash[0] = hash.h8[0]; + pHash[1] = hash.h8[1]; + pHash[2] = hash.h8[2]; + pHash[3] = hash.h8[3]; +#ifdef NEED_HASH_512 + pHash[4] = hash.h8[4]; + pHash[5] = hash.h8[5]; + pHash[6] = hash.h8[6]; + pHash[7] = hash.h8[7]; +#endif + } +} + +__host__ +void x17_haval256_cpu_init(int thr_id, uint32_t threads) +{ +} + +__host__ +void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order) +{ + const uint32_t threadsperblock = 256; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + x17_haval256_gpu_hash_64 <<>> (threads, (uint64_t*)d_hash); + + //MyStreamSynchronize(NULL, order, thr_id); +} diff --git a/x17/cuda_x17_haval512.cu b/x17/cuda_x17_haval512.cu deleted file mode 100644 index c354782..0000000 --- a/x17/cuda_x17_haval512.cu +++ /dev/null @@ -1,400 +0,0 @@ -/* - * Haval-512 for X17 - * - * Built on cbuchner1's implementation, actual hashing code - * heavily based on phm's sgminer - * - */ - -/* - * Haval-512 kernel implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2014 djm34 - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - * @author phm - */ -#include -#include - -#define USE_SHARED 1 - -#define SPH_T64(x) ((x) & 0xFFFFFFFFFFFFFFFFULL) - -#include "cuda_helper.h" - -#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) -#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) - -static __constant__ uint32_t initVector[8]; - -static const uint32_t c_initVector[8] = { - SPH_C32(0x243F6A88), - SPH_C32(0x85A308D3), - SPH_C32(0x13198A2E), - SPH_C32(0x03707344), - SPH_C32(0xA4093822), - SPH_C32(0x299F31D0), - SPH_C32(0x082EFA98), - SPH_C32(0xEC4E6C89) -}; - -#define PASS1(n, in) { \ - STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[ 0], SPH_C32(0x00000000)); \ - STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[ 1], SPH_C32(0x00000000)); \ - STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[ 2], SPH_C32(0x00000000)); \ - STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[ 3], SPH_C32(0x00000000)); \ - STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[ 4], SPH_C32(0x00000000)); \ - STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[ 5], SPH_C32(0x00000000)); \ - STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[ 6], SPH_C32(0x00000000)); \ - STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[ 7], SPH_C32(0x00000000)); \ - \ - STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[ 8], SPH_C32(0x00000000)); \ - STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[ 9], SPH_C32(0x00000000)); \ - STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[10], SPH_C32(0x00000000)); \ - STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[11], SPH_C32(0x00000000)); \ - STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[12], SPH_C32(0x00000000)); \ - STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[13], SPH_C32(0x00000000)); \ - STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[14], SPH_C32(0x00000000)); \ - STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[15], SPH_C32(0x00000000)); \ - \ - STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[16], SPH_C32(0x00000000)); \ - STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[17], SPH_C32(0x00000000)); \ - STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[18], SPH_C32(0x00000000)); \ - STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[19], SPH_C32(0x00000000)); \ - STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[20], SPH_C32(0x00000000)); \ - STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[21], SPH_C32(0x00000000)); \ - STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[22], SPH_C32(0x00000000)); \ - STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[23], SPH_C32(0x00000000)); \ - \ - STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, in[24], SPH_C32(0x00000000)); \ - STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, in[25], SPH_C32(0x00000000)); \ - STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, in[26], SPH_C32(0x00000000)); \ - STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, in[27], SPH_C32(0x00000000)); \ - STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, in[28], SPH_C32(0x00000000)); \ - STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, in[29], SPH_C32(0x00000000)); \ - STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, in[30], SPH_C32(0x00000000)); \ - STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, in[31], SPH_C32(0x00000000)); \ -} - -#define PASS2(n, in) { \ - STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[ 5], SPH_C32(0x452821E6)); \ - STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[14], SPH_C32(0x38D01377)); \ - STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[26], SPH_C32(0xBE5466CF)); \ - STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[18], SPH_C32(0x34E90C6C)); \ - STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[11], SPH_C32(0xC0AC29B7)); \ - STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[28], SPH_C32(0xC97C50DD)); \ - STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[ 7], SPH_C32(0x3F84D5B5)); \ - STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[16], SPH_C32(0xB5470917)); \ - \ - STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[ 0], SPH_C32(0x9216D5D9)); \ - STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[23], SPH_C32(0x8979FB1B)); \ - STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[20], SPH_C32(0xD1310BA6)); \ - STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[22], SPH_C32(0x98DFB5AC)); \ - STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[ 1], SPH_C32(0x2FFD72DB)); \ - STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[10], SPH_C32(0xD01ADFB7)); \ - STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[ 4], SPH_C32(0xB8E1AFED)); \ - STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[ 8], SPH_C32(0x6A267E96)); \ - \ - STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[30], SPH_C32(0xBA7C9045)); \ - STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[ 3], SPH_C32(0xF12C7F99)); \ - STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[21], SPH_C32(0x24A19947)); \ - STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[ 9], SPH_C32(0xB3916CF7)); \ - STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[17], SPH_C32(0x0801F2E2)); \ - STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[24], SPH_C32(0x858EFC16)); \ - STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[29], SPH_C32(0x636920D8)); \ - STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[ 6], SPH_C32(0x71574E69)); \ - \ - STEP(n, 2, s7, s6, s5, s4, s3, s2, s1, s0, in[19], SPH_C32(0xA458FEA3)); \ - STEP(n, 2, s6, s5, s4, s3, s2, s1, s0, s7, in[12], SPH_C32(0xF4933D7E)); \ - STEP(n, 2, s5, s4, s3, s2, s1, s0, s7, s6, in[15], SPH_C32(0x0D95748F)); \ - STEP(n, 2, s4, s3, s2, s1, s0, s7, s6, s5, in[13], SPH_C32(0x728EB658)); \ - STEP(n, 2, s3, s2, s1, s0, s7, s6, s5, s4, in[ 2], SPH_C32(0x718BCD58)); \ - STEP(n, 2, s2, s1, s0, s7, s6, s5, s4, s3, in[25], SPH_C32(0x82154AEE)); \ - STEP(n, 2, s1, s0, s7, s6, s5, s4, s3, s2, in[31], SPH_C32(0x7B54A41D)); \ - STEP(n, 2, s0, s7, s6, s5, s4, s3, s2, s1, in[27], SPH_C32(0xC25A59B5)); \ -} - -#define PASS3(n, in) { \ - STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[19], SPH_C32(0x9C30D539)); \ - STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[ 9], SPH_C32(0x2AF26013)); \ - STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[ 4], SPH_C32(0xC5D1B023)); \ - STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[20], SPH_C32(0x286085F0)); \ - STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[28], SPH_C32(0xCA417918)); \ - STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[17], SPH_C32(0xB8DB38EF)); \ - STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[ 8], SPH_C32(0x8E79DCB0)); \ - STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[22], SPH_C32(0x603A180E)); \ - \ - STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[29], SPH_C32(0x6C9E0E8B)); \ - STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[14], SPH_C32(0xB01E8A3E)); \ - STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[25], SPH_C32(0xD71577C1)); \ - STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[12], SPH_C32(0xBD314B27)); \ - STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[24], SPH_C32(0x78AF2FDA)); \ - STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[30], SPH_C32(0x55605C60)); \ - STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[16], SPH_C32(0xE65525F3)); \ - STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[26], SPH_C32(0xAA55AB94)); \ - \ - STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[31], SPH_C32(0x57489862)); \ - STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[15], SPH_C32(0x63E81440)); \ - STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[ 7], SPH_C32(0x55CA396A)); \ - STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[ 3], SPH_C32(0x2AAB10B6)); \ - STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[ 1], SPH_C32(0xB4CC5C34)); \ - STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[ 0], SPH_C32(0x1141E8CE)); \ - STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[18], SPH_C32(0xA15486AF)); \ - STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[27], SPH_C32(0x7C72E993)); \ - \ - STEP(n, 3, s7, s6, s5, s4, s3, s2, s1, s0, in[13], SPH_C32(0xB3EE1411)); \ - STEP(n, 3, s6, s5, s4, s3, s2, s1, s0, s7, in[ 6], SPH_C32(0x636FBC2A)); \ - STEP(n, 3, s5, s4, s3, s2, s1, s0, s7, s6, in[21], SPH_C32(0x2BA9C55D)); \ - STEP(n, 3, s4, s3, s2, s1, s0, s7, s6, s5, in[10], SPH_C32(0x741831F6)); \ - STEP(n, 3, s3, s2, s1, s0, s7, s6, s5, s4, in[23], SPH_C32(0xCE5C3E16)); \ - STEP(n, 3, s2, s1, s0, s7, s6, s5, s4, s3, in[11], SPH_C32(0x9B87931E)); \ - STEP(n, 3, s1, s0, s7, s6, s5, s4, s3, s2, in[ 5], SPH_C32(0xAFD6BA33)); \ - STEP(n, 3, s0, s7, s6, s5, s4, s3, s2, s1, in[ 2], SPH_C32(0x6C24CF5C)); \ -} - -#define PASS4(n, in) { \ - STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[24], SPH_C32(0x7A325381)); \ - STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[ 4], SPH_C32(0x28958677)); \ - STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[ 0], SPH_C32(0x3B8F4898)); \ - STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[14], SPH_C32(0x6B4BB9AF)); \ - STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[ 2], SPH_C32(0xC4BFE81B)); \ - STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[ 7], SPH_C32(0x66282193)); \ - STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[28], SPH_C32(0x61D809CC)); \ - STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[23], SPH_C32(0xFB21A991)); \ - \ - STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[26], SPH_C32(0x487CAC60)); \ - STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[ 6], SPH_C32(0x5DEC8032)); \ - STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[30], SPH_C32(0xEF845D5D)); \ - STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[20], SPH_C32(0xE98575B1)); \ - STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[18], SPH_C32(0xDC262302)); \ - STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[25], SPH_C32(0xEB651B88)); \ - STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[19], SPH_C32(0x23893E81)); \ - STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[ 3], SPH_C32(0xD396ACC5)); \ - \ - STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[22], SPH_C32(0x0F6D6FF3)); \ - STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[11], SPH_C32(0x83F44239)); \ - STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[31], SPH_C32(0x2E0B4482)); \ - STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[21], SPH_C32(0xA4842004)); \ - STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[ 8], SPH_C32(0x69C8F04A)); \ - STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[27], SPH_C32(0x9E1F9B5E)); \ - STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[12], SPH_C32(0x21C66842)); \ - STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[ 9], SPH_C32(0xF6E96C9A)); \ - \ - STEP(n, 4, s7, s6, s5, s4, s3, s2, s1, s0, in[ 1], SPH_C32(0x670C9C61)); \ - STEP(n, 4, s6, s5, s4, s3, s2, s1, s0, s7, in[29], SPH_C32(0xABD388F0)); \ - STEP(n, 4, s5, s4, s3, s2, s1, s0, s7, s6, in[ 5], SPH_C32(0x6A51A0D2)); \ - STEP(n, 4, s4, s3, s2, s1, s0, s7, s6, s5, in[15], SPH_C32(0xD8542F68)); \ - STEP(n, 4, s3, s2, s1, s0, s7, s6, s5, s4, in[17], SPH_C32(0x960FA728)); \ - STEP(n, 4, s2, s1, s0, s7, s6, s5, s4, s3, in[10], SPH_C32(0xAB5133A3)); \ - STEP(n, 4, s1, s0, s7, s6, s5, s4, s3, s2, in[16], SPH_C32(0x6EEF0B6C)); \ - STEP(n, 4, s0, s7, s6, s5, s4, s3, s2, s1, in[13], SPH_C32(0x137A3BE4)); \ -} - -#define PASS5(n, in) { \ - STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[27], SPH_C32(0xBA3BF050)); \ - STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[ 3], SPH_C32(0x7EFB2A98)); \ - STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[21], SPH_C32(0xA1F1651D)); \ - STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[26], SPH_C32(0x39AF0176)); \ - STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[17], SPH_C32(0x66CA593E)); \ - STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[11], SPH_C32(0x82430E88)); \ - STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[20], SPH_C32(0x8CEE8619)); \ - STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[29], SPH_C32(0x456F9FB4)); \ - \ - STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[19], SPH_C32(0x7D84A5C3)); \ - STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[ 0], SPH_C32(0x3B8B5EBE)); \ - STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[12], SPH_C32(0xE06F75D8)); \ - STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[ 7], SPH_C32(0x85C12073)); \ - STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[13], SPH_C32(0x401A449F)); \ - STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[ 8], SPH_C32(0x56C16AA6)); \ - STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[31], SPH_C32(0x4ED3AA62)); \ - STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[10], SPH_C32(0x363F7706)); \ - \ - STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[ 5], SPH_C32(0x1BFEDF72)); \ - STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[ 9], SPH_C32(0x429B023D)); \ - STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[14], SPH_C32(0x37D0D724)); \ - STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[30], SPH_C32(0xD00A1248)); \ - STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[18], SPH_C32(0xDB0FEAD3)); \ - STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[ 6], SPH_C32(0x49F1C09B)); \ - STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[28], SPH_C32(0x075372C9)); \ - STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[24], SPH_C32(0x80991B7B)); \ - \ - STEP(n, 5, s7, s6, s5, s4, s3, s2, s1, s0, in[ 2], SPH_C32(0x25D479D8)); \ - STEP(n, 5, s6, s5, s4, s3, s2, s1, s0, s7, in[23], SPH_C32(0xF6E8DEF7)); \ - STEP(n, 5, s5, s4, s3, s2, s1, s0, s7, s6, in[16], SPH_C32(0xE3FE501A)); \ - STEP(n, 5, s4, s3, s2, s1, s0, s7, s6, s5, in[22], SPH_C32(0xB6794C3B)); \ - STEP(n, 5, s3, s2, s1, s0, s7, s6, s5, s4, in[ 4], SPH_C32(0x976CE0BD)); \ - STEP(n, 5, s2, s1, s0, s7, s6, s5, s4, s3, in[ 1], SPH_C32(0x04C006BA)); \ - STEP(n, 5, s1, s0, s7, s6, s5, s4, s3, s2, in[25], SPH_C32(0xC1A94FB6)); \ - STEP(n, 5, s0, s7, s6, s5, s4, s3, s2, s1, in[15], SPH_C32(0x409F60C4)); \ -} - -#define F1(x6, x5, x4, x3, x2, x1, x0) \ - (((x1) & ((x0) ^ (x4))) ^ ((x2) & (x5)) ^ ((x3) & (x6)) ^ (x0)) - -#define F2(x6, x5, x4, x3, x2, x1, x0) \ - (((x2) & (((x1) & ~(x3)) ^ ((x4) & (x5)) ^ (x6) ^ (x0))) \ - ^ ((x4) & ((x1) ^ (x5))) ^ ((x3 & (x5)) ^ (x0))) - -#define F3(x6, x5, x4, x3, x2, x1, x0) \ - (((x3) & (((x1) & (x2)) ^ (x6) ^ (x0))) \ - ^ ((x1) & (x4)) ^ ((x2) & (x5)) ^ (x0)) - -#define F4(x6, x5, x4, x3, x2, x1, x0) \ - (((x3) & (((x1) & (x2)) ^ ((x4) | (x6)) ^ (x5))) \ - ^ ((x4) & ((~(x2) & (x5)) ^ (x1) ^ (x6) ^ (x0))) \ - ^ ((x2) & (x6)) ^ (x0)) - -#define F5(x6, x5, x4, x3, x2, x1, x0) \ - (((x0) & ~(((x1) & (x2) & (x3)) ^ (x5))) \ - ^ ((x1) & (x4)) ^ ((x2) & (x5)) ^ ((x3) & (x6))) - -#define FP5_1(x6, x5, x4, x3, x2, x1, x0) \ - F1(x3, x4, x1, x0, x5, x2, x6) -#define FP5_2(x6, x5, x4, x3, x2, x1, x0) \ - F2(x6, x2, x1, x0, x3, x4, x5) -#define FP5_3(x6, x5, x4, x3, x2, x1, x0) \ - F3(x2, x6, x0, x4, x3, x1, x5) -#define FP5_4(x6, x5, x4, x3, x2, x1, x0) \ - F4(x1, x5, x3, x2, x0, x4, x6) -#define FP5_5(x6, x5, x4, x3, x2, x1, x0) \ - F5(x2, x5, x0, x6, x4, x3, x1) - - -#define STEP(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) { \ - uint32_t t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \ - (x7) = SPH_T32(SPH_ROTR32(t, 7) + SPH_ROTR32((x7), 11) \ - + (w) + (c)); \ - } - - -__global__ -void x17_haval256_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) -{ - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t nounce = g_nonceVector ? g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; - uint32_t *inpHash = (uint32_t*)&g_hash[8 * hashPosition]; - union { - uint8_t h1[64]; - uint32_t h4[16]; - uint64_t h8[8]; - } hash; - - uint32_t u0, u1, u2, u3, u4, u5, u6, u7; - uint32_t s0,s1,s2,s3,s4,s5,s6,s7; - uint32_t buf[32]; - - s0 = initVector[0]; - s1 = initVector[1]; - s2 = initVector[2]; - s3 = initVector[3]; - s4 = initVector[4]; - s5 = initVector[5]; - s6 = initVector[6]; - s7 = initVector[7]; - - u0 = s0; - u1 = s1; - u2 = s2; - u3 = s3; - u4 = s4; - u5 = s5; - u6 = s6; - u7 = s7; - - #pragma unroll - for (int i=0; i<16; i++) { - hash.h4[i]= inpHash[i]; - } - -///////// input big ///////////////////// - - #pragma unroll - for (int i=0; i<32; i++) { - if (i<16) { - buf[i]=hash.h4[i]; - } else { - buf[i]=0; - } - } - - buf[16]=0x00000001; - buf[29]=0x40290000; - buf[30]=0x00000200; - - PASS1(5, buf); - PASS2(5, buf); - PASS3(5, buf); - PASS4(5, buf); - PASS5(5, buf); - - s0 = SPH_T32(s0 + u0); - s1 = SPH_T32(s1 + u1); - s2 = SPH_T32(s2 + u2); - s3 = SPH_T32(s3 + u3); - s4 = SPH_T32(s4 + u4); - s5 = SPH_T32(s5 + u5); - s6 = SPH_T32(s6 + u6); - s7 = SPH_T32(s7 + u7); - - hash.h4[0]=s0; - hash.h4[1]=s1; - hash.h4[2]=s2; - hash.h4[3]=s3; - hash.h4[4]=s4; - hash.h4[5]=s5; - hash.h4[6]=s6; - hash.h4[7]=s7; - - #pragma unroll 16 - for (int u = 0; u < 16; u ++) - inpHash[u] = hash.h4[u]; - } // threads -} - -__host__ -void x17_haval256_cpu_init(int thr_id, uint32_t threads) -{ - cudaMemcpyToSymbol(initVector,c_initVector,sizeof(c_initVector),0, cudaMemcpyHostToDevice); -} - -__host__ -void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) -{ - const uint32_t threadsperblock = 256; // Alignment mit mixtab Grösse. NICHT ÄNDERN - - // berechne wie viele Thread Blocks wir brauchen - dim3 grid((threads + threadsperblock-1)/threadsperblock); - dim3 block(threadsperblock); - - size_t shared_size = 0; - - x17_haval256_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); - - MyStreamSynchronize(NULL, order, thr_id); -} diff --git a/x17/cuda_x17_sha512.cu b/x17/cuda_x17_sha512.cu index e67a1f0..088385e 100644 --- a/x17/cuda_x17_sha512.cu +++ b/x17/cuda_x17_sha512.cu @@ -1,14 +1,10 @@ -/** - * sha512 djm34 - * (cleaned by tpruvot) - */ - /* - * sha-512 kernel implementation. + * sha-512 cuda kernel implementation. * * ==========================(LICENSE BEGIN)============================ * * Copyright (c) 2014 djm34 + * 2016 tpruvot * * Permission is hereby granted, free of charge, to any person obtaining * a copy of this software and associated documentation files (the @@ -30,206 +26,146 @@ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. * * ===========================(LICENSE END)============================= - * - * @author phm */ #include -#define USE_SHARED 1 -#define SPH_C64(x) ((uint64_t)(x ## ULL)) +#define NEED_HASH_512 #include "cuda_helper.h" #define SWAP64(u64) cuda_swab64(u64) -#define SPH_ROTL32(x, n) SPH_T32(((x) << (n)) | ((x) >> (32 - (n)))) -#define SPH_ROTR32(x, n) SPH_ROTL32(x, (32 - (n))) - -static __constant__ uint64_t H_512[8]; - -static const uint64_t H512[8] = { - SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B), - SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1), - SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F), - SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179) +static __constant__ uint64_t c_WB[80]; + +static const uint64_t WB[80] = { + 0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC, + 0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118, + 0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2, + 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694, + 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65, + 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5, + 0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4, + 0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70, + 0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF, + 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B, + 0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30, + 0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8, + 0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8, + 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3, + 0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC, + 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B, + 0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178, + 0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B, + 0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C, + 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817 }; -static __constant__ uint64_t K_512[80]; - -static const uint64_t K512[80] = { - SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD), - SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC), - SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019), - SPH_C64(0x923F82A4AF194F9B), SPH_C64(0xAB1C5ED5DA6D8118), - SPH_C64(0xD807AA98A3030242), SPH_C64(0x12835B0145706FBE), - SPH_C64(0x243185BE4EE4B28C), SPH_C64(0x550C7DC3D5FFB4E2), - SPH_C64(0x72BE5D74F27B896F), SPH_C64(0x80DEB1FE3B1696B1), - SPH_C64(0x9BDC06A725C71235), SPH_C64(0xC19BF174CF692694), - SPH_C64(0xE49B69C19EF14AD2), SPH_C64(0xEFBE4786384F25E3), - SPH_C64(0x0FC19DC68B8CD5B5), SPH_C64(0x240CA1CC77AC9C65), - SPH_C64(0x2DE92C6F592B0275), SPH_C64(0x4A7484AA6EA6E483), - SPH_C64(0x5CB0A9DCBD41FBD4), SPH_C64(0x76F988DA831153B5), - SPH_C64(0x983E5152EE66DFAB), SPH_C64(0xA831C66D2DB43210), - SPH_C64(0xB00327C898FB213F), SPH_C64(0xBF597FC7BEEF0EE4), - SPH_C64(0xC6E00BF33DA88FC2), SPH_C64(0xD5A79147930AA725), - SPH_C64(0x06CA6351E003826F), SPH_C64(0x142929670A0E6E70), - SPH_C64(0x27B70A8546D22FFC), SPH_C64(0x2E1B21385C26C926), - SPH_C64(0x4D2C6DFC5AC42AED), SPH_C64(0x53380D139D95B3DF), - SPH_C64(0x650A73548BAF63DE), SPH_C64(0x766A0ABB3C77B2A8), - SPH_C64(0x81C2C92E47EDAEE6), SPH_C64(0x92722C851482353B), - SPH_C64(0xA2BFE8A14CF10364), SPH_C64(0xA81A664BBC423001), - SPH_C64(0xC24B8B70D0F89791), SPH_C64(0xC76C51A30654BE30), - SPH_C64(0xD192E819D6EF5218), SPH_C64(0xD69906245565A910), - SPH_C64(0xF40E35855771202A), SPH_C64(0x106AA07032BBD1B8), - SPH_C64(0x19A4C116B8D2D0C8), SPH_C64(0x1E376C085141AB53), - SPH_C64(0x2748774CDF8EEB99), SPH_C64(0x34B0BCB5E19B48A8), - SPH_C64(0x391C0CB3C5C95A63), SPH_C64(0x4ED8AA4AE3418ACB), - SPH_C64(0x5B9CCA4F7763E373), SPH_C64(0x682E6FF3D6B2B8A3), - SPH_C64(0x748F82EE5DEFB2FC), SPH_C64(0x78A5636F43172F60), - SPH_C64(0x84C87814A1F0AB72), SPH_C64(0x8CC702081A6439EC), - SPH_C64(0x90BEFFFA23631E28), SPH_C64(0xA4506CEBDE82BDE9), - SPH_C64(0xBEF9A3F7B2C67915), SPH_C64(0xC67178F2E372532B), - SPH_C64(0xCA273ECEEA26619C), SPH_C64(0xD186B8C721C0C207), - SPH_C64(0xEADA7DD6CDE0EB1E), SPH_C64(0xF57D4F7FEE6ED178), - SPH_C64(0x06F067AA72176FBA), SPH_C64(0x0A637DC5A2C898A6), - SPH_C64(0x113F9804BEF90DAE), SPH_C64(0x1B710B35131C471B), - SPH_C64(0x28DB77F523047D84), SPH_C64(0x32CAAB7B40C72493), - SPH_C64(0x3C9EBE0A15C9BEBC), SPH_C64(0x431D67C49C100D4C), - SPH_C64(0x4CC5D4BECB3E42B6), SPH_C64(0x597F299CFC657E2A), - SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817) -}; - -#define SHA3_STEP(ord,r,i) { \ - uint64_t T1, T2; \ - int a = 8-ord; \ - T1 = SPH_T64(r[(7+a)%8] + BSG5_1(r[(4+a)%8]) + CH(r[(4+a)%8], r[(5+a)%8], r[(6+a)%8]) + K_512[i] + W[i]); \ - T2 = SPH_T64(BSG5_0(r[(0+a)%8]) + MAJ(r[(0+a)%8], r[(1+a)%8], r[(2+a)%8])); \ - r[(3+a)%8] = SPH_T64(r[(3+a)%8] + T1); \ - r[(7+a)%8] = SPH_T64(T1 + T2); \ - } +#define BSG5_0(x) xor3(ROTR64(x,28), ROTR64(x,34), ROTR64(x,39)) +#define SSG5_0(x) xor3(ROTR64(x, 1), ROTR64(x ,8), shr_t64(x,7)) +#define SSG5_1(x) xor3(ROTR64(x,19), ROTR64(x,61), shr_t64(x,6)) -#define SHA3_STEP2(truc,ord,r,i) { \ - uint64_t T1, T2; \ - int a = 8-ord; \ - T1 = Tone(truc,r,W,a,i); \ - T2 = SPH_T64(BSG5_0(r[(0+a)%8]) + MAJ(r[(0+a)%8], r[(1+a)%8], r[(2+a)%8])); \ - r[(3+a)%8] = SPH_T64(r[(3+a)%8] + T1); \ - r[(7+a)%8] = SPH_T64(T1 + T2); \ - } -//#define BSG5_0(x) (ROTR64(x, 28) ^ ROTR64(x, 34) ^ ROTR64(x, 39)) -#define BSG5_0(x) xor3(ROTR64(x, 28),ROTR64(x, 34),ROTR64(x, 39)) - -//#define BSG5_1(x) (ROTR64(x, 14) ^ ROTR64(x, 18) ^ ROTR64(x, 41)) -#define BSG5_1(x) xor3(ROTR64(x, 14),ROTR64(x, 18),ROTR64(x, 41)) - -//#define SSG5_0(x) (ROTR64(x, 1) ^ ROTR64(x, 8) ^ SPH_T64((x) >> 7)) -#define SSG5_0(x) xor3(ROTR64(x, 1),ROTR64(x, 8),shr_t64(x,7)) - -//#define SSG5_1(x) (ROTR64(x, 19) ^ ROTR64(x, 61) ^ SPH_T64((x) >> 6)) -#define SSG5_1(x) xor3(ROTR64(x, 19),ROTR64(x, 61),shr_t64(x,6)) - -//#define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z)) -#define CH(x, y, z) xandx(x,y,z) //#define MAJ(X, Y, Z) (((X) & (Y)) | (((X) | (Y)) & (Z))) #define MAJ(x, y, z) andor(x,y,z) __device__ __forceinline__ -uint64_t Tone(const uint64_t* sharedMemory, uint64_t r[8], uint64_t W[80], uint32_t a, uint32_t i) +uint64_t Tone(uint64_t* K, uint64_t* r, uint64_t* W, const int a, const int i) { - uint64_t h = r[(7+a)%8]; - uint64_t e = r[(4+a)%8]; - uint64_t f = r[(5+a)%8]; - uint64_t g = r[(6+a)%8]; - //uint64_t BSG51 = ROTR64(e, 14) ^ ROTR64(e, 18) ^ ROTR64(e, 41); - uint64_t BSG51 = xor3(ROTR64(e, 14),ROTR64(e, 18),ROTR64(e, 41)); - //uint64_t CHl = (((f) ^ (g)) & (e)) ^ (g); - uint64_t CHl = xandx(e,f,g); - uint64_t result = SPH_T64(h+BSG51+CHl+sharedMemory[i]+W[i]); - return result; + //asm("// TONE \n"); + const uint64_t e = r[(a+4) & 7]; + uint64_t BSG51 = xor3(ROTR64(e, 14), ROTR64(e, 18), ROTR64(e, 41)); + const uint64_t f = r[(a+5) & 7]; + const uint64_t g = r[(a+6) & 7]; + uint64_t CHl = ((f ^ g) & e) ^ g; // xandx(e, f, g); + return (r[(a+7) & 7] + BSG51 + CHl + K[i] + W[i]); } -__global__ -void x17_sha512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *g_nonceVector) +#define SHA3_STEP(K, r, W, ord, i) { \ + const int a = (8 - ord) & 7; \ + uint64_t T1 = Tone(K, r, W, a, i); \ + r[(a+3) & 7] += T1; \ + uint64_t T2 = (BSG5_0(r[a]) + MAJ(r[a], r[(a+1) & 7], r[(a+2) & 7])); \ + r[(a+7) & 7] = T1 + T2; \ +} + +__global__ /*__launch_bounds__(256, 4)*/ +void x17_sha512_gpu_hash_64(const uint32_t threads, uint64_t *g_hash) { - uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); if (thread < threads) { - uint32_t nounce = (g_nonceVector != NULL) ? g_nonceVector[thread] : (startNounce + thread); - int hashPosition = nounce - startNounce; - uint32_t *inpHash = (uint32_t*)&g_hash[8 * hashPosition]; - union { - uint8_t h1[64]; - uint32_t h4[16]; - uint64_t h8[8]; - } hash; + uint64_t *pHash = &g_hash[thread*8U]; + uint64_t W[80]; #pragma unroll - for (int i=0;i<16;i++) { - hash.h4[i]= inpHash[i]; + for (int i = 0; i < 8; i ++) { + W[i] = SWAP64(pHash[i]); } - uint64_t W[80]; - uint64_t r[8]; + W[8] = 0x8000000000000000; - #pragma unroll 71 - for (int i=9;i<80;i++) { - W[i]=0; + #pragma unroll 69 + for (int i = 9; i<78; i++) { + W[i] = 0U; } + W[15] = 0x0000000000000200; - #pragma unroll - for (int i = 0; i < 8; i ++) { - W[i] = SWAP64(hash.h8[i]); - r[i] = H_512[i]; + #pragma unroll 64 + for (int i = 16; i < 80; i ++) { + W[i] = SSG5_1(W[i-2]) + W[i-7]; + W[i] += SSG5_0(W[i-15]) + W[i-16]; } - W[8] = 0x8000000000000000; - W[15]= 0x0000000000000200; + const uint64_t IV512[8] = { + 0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, + 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1, + 0x510E527FADE682D1, 0x9B05688C2B3E6C1F, + 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179 + }; - #pragma unroll 64 - for (int i = 16; i < 80; i ++) - W[i] = SPH_T64(SSG5_1(W[i - 2]) + W[i - 7] - + SSG5_0(W[i - 15]) + W[i - 16]); + uint64_t r[8]; + #pragma unroll + for (int i = 0; i < 8; i ++) { + r[i] = IV512[i]; + } +#if CUDART_VERSION >= 7050 #pragma unroll 10 +#endif for (int i = 0; i < 80; i += 8) { - #pragma unroll 8 - for (int ord=0;ord<8;ord++) { - SHA3_STEP2(K_512,ord,r,i+ord); + #pragma unroll + for (int ord = 0; ord < 8; ord++) { + SHA3_STEP(c_WB, r, W, ord, i+ord); } } - #pragma unroll 8 - for (int i = 0; i < 8; i++) { - r[i] = SPH_T64(r[i] + H_512[i]); - } - - #pragma unroll 8 - for(int i=0;i<8;i++) { - hash.h8[i] = SWAP64(r[i]); + #pragma unroll + for (int u = 0; u < 4; u ++) { + pHash[u] = SWAP64(r[u] + IV512[u]); } - #pragma unroll 16 - for (int u = 0; u < 16; u ++) { - inpHash[u] = hash.h4[u]; +#ifdef NEED_HASH_512 + #pragma unroll + for (int u = 4; u < 8; u ++) { + pHash[u] = SWAP64(r[u] + IV512[u]); } +#endif } } __host__ void x17_sha512_cpu_init(int thr_id, uint32_t threads) { - cudaMemcpyToSymbol(K_512,K512,80*sizeof(uint64_t),0, cudaMemcpyHostToDevice); - cudaMemcpyToSymbol(H_512,H512,sizeof(H512),0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_WB, WB, 80*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); } __host__ -void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order) +void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order) { const uint32_t threadsperblock = 256; dim3 grid((threads + threadsperblock-1)/threadsperblock); dim3 block(threadsperblock); - size_t shared_size =0; - x17_sha512_gpu_hash_64<<>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); - MyStreamSynchronize(NULL, order, thr_id); + x17_sha512_gpu_hash_64 <<>> (threads, (uint64_t*)d_hash); + + //MyStreamSynchronize(NULL, order, thr_id); } diff --git a/x17/x17.cu b/x17/x17.cu index fcfdf42..92d5de9 100644 --- a/x17/x17.cu +++ b/x17/x17.cu @@ -49,10 +49,10 @@ extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t sta extern void x15_whirlpool_cpu_free(int thr_id); extern void x17_sha512_cpu_init(int thr_id, uint32_t threads); -extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); extern void x17_haval256_cpu_init(int thr_id, uint32_t threads); -extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order); // X17 Hashfunktion @@ -206,6 +206,8 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u quark_blake512_cpu_setBlock_80(thr_id, endiandata); cuda_check_cpu_setTarget(ptarget); + int warn = 0; + do { int order = 0; @@ -224,8 +226,8 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); - x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], order++); *hashes_done = pdata[19] - first_nonce + throughput; @@ -252,7 +254,13 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u pdata[19] = foundNonce; return res; } else { - gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce); + // x11+ coins could do some random error, but not on retry + if (!warn) { + warn++; continue; + } else { + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", foundNonce); + warn = 0; + } } }