From 223077d11a20cf37eb6f11c41470479b258ea270 Mon Sep 17 00:00:00 2001 From: Tanguy Pruvot Date: Sat, 16 Jul 2016 21:26:42 +0200 Subject: [PATCH] merged sha256d/ripemd drop ripemd.cu and remove unused kernels --- Makefile.am | 2 +- ccminer.vcxproj | 5 +- ccminer.vcxproj.filters | 3 - lbry/cuda_ripemd160.cu | 402 --------------------------------- lbry/cuda_sha256_lbry.cu | 463 ++++++++++++++++++++++++++------------- lbry/lbry.cu | 16 +- 6 files changed, 315 insertions(+), 576 deletions(-) delete mode 100644 lbry/cuda_ripemd160.cu diff --git a/Makefile.am b/Makefile.am index 7a8fd47..8de4412 100644 --- a/Makefile.am +++ b/Makefile.am @@ -51,7 +51,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \ sph/hamsi.c sph/hamsi_helper.c sph/streebog.c \ sph/shabal.c sph/whirlpool.c sph/sha2big.c sph/haval.c \ sph/ripemd.c sph/sph_sha2.c \ - lbry/lbry.cu lbry/cuda_ripemd160.cu lbry/cuda_sha256_lbry.cu lbry/cuda_sha512_lbry.cu \ + lbry/lbry.cu lbry/cuda_sha256_lbry.cu lbry/cuda_sha512_lbry.cu \ qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \ x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \ x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \ diff --git a/ccminer.vcxproj b/ccminer.vcxproj index b304366..ce4c130 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -39,7 +39,7 @@ - + @@ -424,7 +424,6 @@ 92 - @@ -530,7 +529,7 @@ - + diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index ad3842f..e0d0351 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -730,9 +730,6 @@ Source Files\CUDA\lbry - - Source Files\CUDA\lbry - Source Files\CUDA\lbry diff --git a/lbry/cuda_ripemd160.cu b/lbry/cuda_ripemd160.cu deleted file mode 100644 index bc4406e..0000000 --- a/lbry/cuda_ripemd160.cu +++ /dev/null @@ -1,402 +0,0 @@ -/* - * ripemd-160 kernel implementation. - * - * ==========================(LICENSE BEGIN)============================ - * - * Copyright (c) 2014, 2016 djm34, tpruvot - * - * Permission is hereby granted, free of charge, to any person obtaining - * a copy of this software and associated documentation files (the - * "Software"), to deal in the Software without restriction, including - * without limitation the rights to use, copy, modify, merge, publish, - * distribute, sublicense, and/or sell copies of the Software, and to - * permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - * - * ===========================(LICENSE END)============================= - * - */ -#include -#include -#include - -#include - -static __constant__ uint32_t c_IV[5] = { - 0x67452301u, 0xEFCDAB89u, 0x98BADCFEu, 0x10325476u, 0xC3D2E1F0u -}; - -__device__ __forceinline__ -uint32_t xor3b(const uint32_t a, const uint32_t b, const uint32_t c) { - uint32_t result; -#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 - asm ("lop3.b32 %0, %1, %2, %3, 0x96; // xor3b" //0x96 = 0xF0 ^ 0xCC ^ 0xAA - : "=r"(result) : "r"(a), "r"(b),"r"(c)); -#else - result = a^b^c; -#endif - return result; -} - -//__host__ -//uint64_t xornot64(uint64_t a, uint64_t b, uint64_t c) { -// return c ^ (a | !b); -//} - -__forceinline__ __device__ -uint64_t xornot64(uint64_t a, uint64_t b, uint64_t c) -{ - uint64_t result; - asm("{ .reg .u64 m,n; // xornot64\n\t" - "not.b64 m,%2; \n\t" - "or.b64 n, %1,m;\n\t" - "xor.b64 %0, n,%3;\n\t" - "}\n\t" - : "=l"(result) : "l"(a), "l"(b), "l"(c)); - return result; -} - -//__host__ -//uint64_t xornt64(uint64_t a, uint64_t b, uint64_t c) { -// return a ^ (b | !c); -//} - -__device__ __forceinline__ -uint64_t xornt64(uint64_t a, uint64_t b, uint64_t c) -{ - uint64_t result; - asm("{ .reg .u64 m,n; // xornt64\n\t" - "not.b64 m,%3; \n\t" - "or.b64 n, %2,m;\n\t" - "xor.b64 %0, %1,n;\n\t" - "}\n\t" - : "=l"(result) : "l"(a), "l"(b), "l"(c)); - return result; -} - -/* - * Round functions for RIPEMD-128 and RIPEMD-160. - */ -#if 1 -#define F1(x, y, z) ((x) ^ (y) ^ (z)) -#define F2(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) -#define F3(x, y, z) (((x) | ~(y)) ^ (z)) -#define F4(x, y, z) ((((x) ^ (y)) & (z)) ^ (y)) -#define F5(x, y, z) ((x) ^ ((y) | ~(z))) -#else -#define F1(x, y, z) xor3b(x,y,z) -#define F2(x, y, z) xandx(x,y,z) -#define F3(x, y, z) xornot64(x,y,z) -#define F4(x, y, z) xandx(z,x,y) -#define F5(x, y, z) xornt64(x,y,z) -#endif - -/* - * Round constants for RIPEMD-160. - */ -#define K11 0x00000000u -#define K12 0x5A827999u -#define K13 0x6ED9EBA1u -#define K14 0x8F1BBCDCu -#define K15 0xA953FD4Eu - -#define K21 0x50A28BE6u -#define K22 0x5C4DD124u -#define K23 0x6D703EF3u -#define K24 0x7A6D76E9u -#define K25 0x00000000u - -#define RR(a, b, c, d, e, f, s, r, k) { \ - a = SPH_T32(ROTL32(SPH_T32(a + f(b, c, d) + r + k), s) + e); \ - c = ROTL32(c, 10); \ -} - -#define ROUND1(a, b, c, d, e, f, s, r, k) \ - RR(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k) - -#define ROUND2(a, b, c, d, e, f, s, r, k) \ - RR(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k) - -#define RIPEMD160_ROUND_BODY(in, h) { \ - uint32_t A1, B1, C1, D1, E1; \ - uint32_t A2, B2, C2, D2, E2; \ - uint32_t tmp; \ -\ - A1 = A2 = h[0]; \ - B1 = B2 = h[1]; \ - C1 = C2 = h[2]; \ - D1 = D2 = h[3]; \ - E1 = E2 = h[4]; \ -\ - ROUND1(A, B, C, D, E, F1, 11, in[ 0], 1); \ - ROUND1(E, A, B, C, D, F1, 14, in[ 1], 1); \ - ROUND1(D, E, A, B, C, F1, 15, in[ 2], 1); \ - ROUND1(C, D, E, A, B, F1, 12, in[ 3], 1); \ - ROUND1(B, C, D, E, A, F1, 5, in[ 4], 1); \ - ROUND1(A, B, C, D, E, F1, 8, in[ 5], 1); \ - ROUND1(E, A, B, C, D, F1, 7, in[ 6], 1); \ - ROUND1(D, E, A, B, C, F1, 9, in[ 7], 1); \ - ROUND1(C, D, E, A, B, F1, 11, in[ 8], 1); \ - ROUND1(B, C, D, E, A, F1, 13, in[ 9], 1); \ - ROUND1(A, B, C, D, E, F1, 14, in[10], 1); \ - ROUND1(E, A, B, C, D, F1, 15, in[11], 1); \ - ROUND1(D, E, A, B, C, F1, 6, in[12], 1); \ - ROUND1(C, D, E, A, B, F1, 7, in[13], 1); \ - ROUND1(B, C, D, E, A, F1, 9, in[14], 1); \ - ROUND1(A, B, C, D, E, F1, 8, in[15], 1); \ -\ - ROUND1(E, A, B, C, D, F2, 7, in[ 7], 2); \ - ROUND1(D, E, A, B, C, F2, 6, in[ 4], 2); \ - ROUND1(C, D, E, A, B, F2, 8, in[13], 2); \ - ROUND1(B, C, D, E, A, F2, 13, in[ 1], 2); \ - ROUND1(A, B, C, D, E, F2, 11, in[10], 2); \ - ROUND1(E, A, B, C, D, F2, 9, in[ 6], 2); \ - ROUND1(D, E, A, B, C, F2, 7, in[15], 2); \ - ROUND1(C, D, E, A, B, F2, 15, in[ 3], 2); \ - ROUND1(B, C, D, E, A, F2, 7, in[12], 2); \ - ROUND1(A, B, C, D, E, F2, 12, in[ 0], 2); \ - ROUND1(E, A, B, C, D, F2, 15, in[ 9], 2); \ - ROUND1(D, E, A, B, C, F2, 9, in[ 5], 2); \ - ROUND1(C, D, E, A, B, F2, 11, in[ 2], 2); \ - ROUND1(B, C, D, E, A, F2, 7, in[14], 2); \ - ROUND1(A, B, C, D, E, F2, 13, in[11], 2); \ - ROUND1(E, A, B, C, D, F2, 12, in[ 8], 2); \ -\ - ROUND1(D, E, A, B, C, F3, 11, in[ 3], 3); \ - ROUND1(C, D, E, A, B, F3, 13, in[10], 3); \ - ROUND1(B, C, D, E, A, F3, 6, in[14], 3); \ - ROUND1(A, B, C, D, E, F3, 7, in[ 4], 3); \ - ROUND1(E, A, B, C, D, F3, 14, in[ 9], 3); \ - ROUND1(D, E, A, B, C, F3, 9, in[15], 3); \ - ROUND1(C, D, E, A, B, F3, 13, in[ 8], 3); \ - ROUND1(B, C, D, E, A, F3, 15, in[ 1], 3); \ - ROUND1(A, B, C, D, E, F3, 14, in[ 2], 3); \ - ROUND1(E, A, B, C, D, F3, 8, in[ 7], 3); \ - ROUND1(D, E, A, B, C, F3, 13, in[ 0], 3); \ - ROUND1(C, D, E, A, B, F3, 6, in[ 6], 3); \ - ROUND1(B, C, D, E, A, F3, 5, in[13], 3); \ - ROUND1(A, B, C, D, E, F3, 12, in[11], 3); \ - ROUND1(E, A, B, C, D, F3, 7, in[ 5], 3); \ - ROUND1(D, E, A, B, C, F3, 5, in[12], 3); \ -\ - ROUND1(C, D, E, A, B, F4, 11, in[ 1], 4); \ - ROUND1(B, C, D, E, A, F4, 12, in[ 9], 4); \ - ROUND1(A, B, C, D, E, F4, 14, in[11], 4); \ - ROUND1(E, A, B, C, D, F4, 15, in[10], 4); \ - ROUND1(D, E, A, B, C, F4, 14, in[ 0], 4); \ - ROUND1(C, D, E, A, B, F4, 15, in[ 8], 4); \ - ROUND1(B, C, D, E, A, F4, 9, in[12], 4); \ - ROUND1(A, B, C, D, E, F4, 8, in[ 4], 4); \ - ROUND1(E, A, B, C, D, F4, 9, in[13], 4); \ - ROUND1(D, E, A, B, C, F4, 14, in[ 3], 4); \ - ROUND1(C, D, E, A, B, F4, 5, in[ 7], 4); \ - ROUND1(B, C, D, E, A, F4, 6, in[15], 4); \ - ROUND1(A, B, C, D, E, F4, 8, in[14], 4); \ - ROUND1(E, A, B, C, D, F4, 6, in[ 5], 4); \ - ROUND1(D, E, A, B, C, F4, 5, in[ 6], 4); \ - ROUND1(C, D, E, A, B, F4, 12, in[ 2], 4); \ -\ - ROUND1(B, C, D, E, A, F5, 9, in[ 4], 5); \ - ROUND1(A, B, C, D, E, F5, 15, in[ 0], 5); \ - ROUND1(E, A, B, C, D, F5, 5, in[ 5], 5); \ - ROUND1(D, E, A, B, C, F5, 11, in[ 9], 5); \ - ROUND1(C, D, E, A, B, F5, 6, in[ 7], 5); \ - ROUND1(B, C, D, E, A, F5, 8, in[12], 5); \ - ROUND1(A, B, C, D, E, F5, 13, in[ 2], 5); \ - ROUND1(E, A, B, C, D, F5, 12, in[10], 5); \ - ROUND1(D, E, A, B, C, F5, 5, in[14], 5); \ - ROUND1(C, D, E, A, B, F5, 12, in[ 1], 5); \ - ROUND1(B, C, D, E, A, F5, 13, in[ 3], 5); \ - ROUND1(A, B, C, D, E, F5, 14, in[ 8], 5); \ - ROUND1(E, A, B, C, D, F5, 11, in[11], 5); \ - ROUND1(D, E, A, B, C, F5, 8, in[ 6], 5); \ - ROUND1(C, D, E, A, B, F5, 5, in[15], 5); \ - ROUND1(B, C, D, E, A, F5, 6, in[13], 5); \ -\ - ROUND2(A, B, C, D, E, F5, 8, in[ 5], 1); \ - ROUND2(E, A, B, C, D, F5, 9, in[14], 1); \ - ROUND2(D, E, A, B, C, F5, 9, in[ 7], 1); \ - ROUND2(C, D, E, A, B, F5, 11, in[ 0], 1); \ - ROUND2(B, C, D, E, A, F5, 13, in[ 9], 1); \ - ROUND2(A, B, C, D, E, F5, 15, in[ 2], 1); \ - ROUND2(E, A, B, C, D, F5, 15, in[11], 1); \ - ROUND2(D, E, A, B, C, F5, 5, in[ 4], 1); \ - ROUND2(C, D, E, A, B, F5, 7, in[13], 1); \ - ROUND2(B, C, D, E, A, F5, 7, in[ 6], 1); \ - ROUND2(A, B, C, D, E, F5, 8, in[15], 1); \ - ROUND2(E, A, B, C, D, F5, 11, in[ 8], 1); \ - ROUND2(D, E, A, B, C, F5, 14, in[ 1], 1); \ - ROUND2(C, D, E, A, B, F5, 14, in[10], 1); \ - ROUND2(B, C, D, E, A, F5, 12, in[ 3], 1); \ - ROUND2(A, B, C, D, E, F5, 6, in[12], 1); \ -\ - ROUND2(E, A, B, C, D, F4, 9, in[ 6], 2); \ - ROUND2(D, E, A, B, C, F4, 13, in[11], 2); \ - ROUND2(C, D, E, A, B, F4, 15, in[ 3], 2); \ - ROUND2(B, C, D, E, A, F4, 7, in[ 7], 2); \ - ROUND2(A, B, C, D, E, F4, 12, in[ 0], 2); \ - ROUND2(E, A, B, C, D, F4, 8, in[13], 2); \ - ROUND2(D, E, A, B, C, F4, 9, in[ 5], 2); \ - ROUND2(C, D, E, A, B, F4, 11, in[10], 2); \ - ROUND2(B, C, D, E, A, F4, 7, in[14], 2); \ - ROUND2(A, B, C, D, E, F4, 7, in[15], 2); \ - ROUND2(E, A, B, C, D, F4, 12, in[ 8], 2); \ - ROUND2(D, E, A, B, C, F4, 7, in[12], 2); \ - ROUND2(C, D, E, A, B, F4, 6, in[ 4], 2); \ - ROUND2(B, C, D, E, A, F4, 15, in[ 9], 2); \ - ROUND2(A, B, C, D, E, F4, 13, in[ 1], 2); \ - ROUND2(E, A, B, C, D, F4, 11, in[ 2], 2); \ -\ - ROUND2(D, E, A, B, C, F3, 9, in[15], 3); \ - ROUND2(C, D, E, A, B, F3, 7, in[ 5], 3); \ - ROUND2(B, C, D, E, A, F3, 15, in[ 1], 3); \ - ROUND2(A, B, C, D, E, F3, 11, in[ 3], 3); \ - ROUND2(E, A, B, C, D, F3, 8, in[ 7], 3); \ - ROUND2(D, E, A, B, C, F3, 6, in[14], 3); \ - ROUND2(C, D, E, A, B, F3, 6, in[ 6], 3); \ - ROUND2(B, C, D, E, A, F3, 14, in[ 9], 3); \ - ROUND2(A, B, C, D, E, F3, 12, in[11], 3); \ - ROUND2(E, A, B, C, D, F3, 13, in[ 8], 3); \ - ROUND2(D, E, A, B, C, F3, 5, in[12], 3); \ - ROUND2(C, D, E, A, B, F3, 14, in[ 2], 3); \ - ROUND2(B, C, D, E, A, F3, 13, in[10], 3); \ - ROUND2(A, B, C, D, E, F3, 13, in[ 0], 3); \ - ROUND2(E, A, B, C, D, F3, 7, in[ 4], 3); \ - ROUND2(D, E, A, B, C, F3, 5, in[13], 3); \ -\ - ROUND2(C, D, E, A, B, F2, 15, in[ 8], 4); \ - ROUND2(B, C, D, E, A, F2, 5, in[ 6], 4); \ - ROUND2(A, B, C, D, E, F2, 8, in[ 4], 4); \ - ROUND2(E, A, B, C, D, F2, 11, in[ 1], 4); \ - ROUND2(D, E, A, B, C, F2, 14, in[ 3], 4); \ - ROUND2(C, D, E, A, B, F2, 14, in[11], 4); \ - ROUND2(B, C, D, E, A, F2, 6, in[15], 4); \ - ROUND2(A, B, C, D, E, F2, 14, in[ 0], 4); \ - ROUND2(E, A, B, C, D, F2, 6, in[ 5], 4); \ - ROUND2(D, E, A, B, C, F2, 9, in[12], 4); \ - ROUND2(C, D, E, A, B, F2, 12, in[ 2], 4); \ - ROUND2(B, C, D, E, A, F2, 9, in[13], 4); \ - ROUND2(A, B, C, D, E, F2, 12, in[ 9], 4); \ - ROUND2(E, A, B, C, D, F2, 5, in[ 7], 4); \ - ROUND2(D, E, A, B, C, F2, 15, in[10], 4); \ - ROUND2(C, D, E, A, B, F2, 8, in[14], 4); \ -\ - ROUND2(B, C, D, E, A, F1, 8, in[12], 5); \ - ROUND2(A, B, C, D, E, F1, 5, in[15], 5); \ - ROUND2(E, A, B, C, D, F1, 12, in[10], 5); \ - ROUND2(D, E, A, B, C, F1, 9, in[ 4], 5); \ - ROUND2(C, D, E, A, B, F1, 12, in[ 1], 5); \ - ROUND2(B, C, D, E, A, F1, 5, in[ 5], 5); \ - ROUND2(A, B, C, D, E, F1, 14, in[ 8], 5); \ - ROUND2(E, A, B, C, D, F1, 6, in[ 7], 5); \ - ROUND2(D, E, A, B, C, F1, 8, in[ 6], 5); \ - ROUND2(C, D, E, A, B, F1, 13, in[ 2], 5); \ - ROUND2(B, C, D, E, A, F1, 6, in[13], 5); \ - ROUND2(A, B, C, D, E, F1, 5, in[14], 5); \ - ROUND2(E, A, B, C, D, F1, 15, in[ 0], 5); \ - ROUND2(D, E, A, B, C, F1, 13, in[ 3], 5); \ - ROUND2(C, D, E, A, B, F1, 11, in[ 9], 5); \ - ROUND2(B, C, D, E, A, F1, 11, in[11], 5); \ -\ - tmp = (h[1] + C1 + D2); \ - h[1] = (h[2] + D1 + E2); \ - h[2] = (h[3] + E1 + A2); \ - h[3] = (h[4] + A1 + B2); \ - h[4] = (h[0] + B1 + C2); \ - h[0] = tmp; \ -} - -__global__ -void lbry_ripemd160_gpu_hash_32x2(const uint32_t threads, uint64_t *g_hash) -{ - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t *hash = (uint32_t*) (&g_hash[thread * 8U]); - - uint32_t in[16]; - #pragma unroll - for (int i=0; i<8; i++) - in[i] = (hash[i]); - in[8] = 0x80; - - #pragma unroll - for (int i=9;i<16;i++) in[i] = 0; - - in[14] = 0x100; // size in bits - - uint32_t h[5]; - #pragma unroll - for (int i=0; i<5; i++) - h[i] = c_IV[i]; - - RIPEMD160_ROUND_BODY(in, h); - - #pragma unroll - for (int i=0; i<5; i++) - hash[i] = h[i]; - -#ifdef PAD_ZEROS - // 20 bytes hash on 32 output space - hash[5] = 0; - hash[6] = 0; - hash[7] = 0; -#endif - // second 32 bytes block hash - hash += 8; - - #pragma unroll - for (int i=0; i<8; i++) - in[i] = (hash[i]); - in[8] = 0x80; - - #pragma unroll - for (int i=9;i<16;i++) in[i] = 0; - - in[14] = 0x100; // size in bits - - #pragma unroll - for (int i=0; i<5; i++) - h[i] = c_IV[i]; - - RIPEMD160_ROUND_BODY(in, h); - - #pragma unroll - for (int i=0; i<5; i++) - hash[i] = h[i]; - -#ifdef PAD_ZEROS - // 20 bytes hash on 32 output space - hash[5] = 0; - hash[6] = 0; - hash[7] = 0; -#endif - } -} - -__host__ -void lbry_ripemd160_hash_32x2(int thr_id, uint32_t threads, uint32_t *g_Hash, cudaStream_t stream) -{ - const uint32_t threadsperblock = 128; - - dim3 grid(threads/threadsperblock); - dim3 block(threadsperblock); - - lbry_ripemd160_gpu_hash_32x2 <<>> (threads, (uint64_t*) g_Hash); -} - -void lbry_ripemd160_init(int thr_id) -{ - //cudaMemcpyToSymbol(c_IV, IV, sizeof(IV), 0, cudaMemcpyHostToDevice); -} diff --git a/lbry/cuda_sha256_lbry.cu b/lbry/cuda_sha256_lbry.cu index c75325e..52cddcd 100644 --- a/lbry/cuda_sha256_lbry.cu +++ b/lbry/cuda_sha256_lbry.cu @@ -21,6 +21,10 @@ static __thread uint32_t* d_resNonces; __constant__ static uint32_t __align__(8) c_target[2]; __device__ uint64_t d_target[1]; +#ifdef __INTELLISENSE__ +#define atomicExch(p,y) y +#endif + // ------------------------------------------------------------------------------------------------ static const uint32_t cpu_H256[8] = { @@ -324,8 +328,8 @@ uint64_t cuda_swab32ll(uint64_t x) { return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x))); } +#if 0 __global__ -/*__launch_bounds__(256,3)*/ void lbry_sha256_gpu_hash_112(const uint32_t threads, const uint32_t startNonce, const bool swabNonce, uint64_t *outputHash) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); @@ -359,7 +363,6 @@ void lbry_sha256_gpu_hash_112(const uint32_t threads, const uint32_t startNonce, } __global__ -/*__launch_bounds__(256,3)*/ void lbry_sha256_gpu_hash_32(uint32_t threads, uint64_t *Hash512) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); @@ -396,8 +399,31 @@ void lbry_sha256_gpu_hash_32(uint32_t threads, uint64_t *Hash512) } } +__host__ +void lbry_sha256_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash, bool swabNonce, cudaStream_t stream) +{ + const int threadsperblock = 256; + + dim3 grid(threads/threadsperblock); + dim3 block(threadsperblock); + + lbry_sha256_gpu_hash_112 <<>> (threads, startNonce, swabNonce, (uint64_t*) d_outputHash); + cudaGetLastError(); +} + +__host__ +void lbry_sha256_hash_32(int thr_id, uint32_t threads, uint32_t *d_Hash, cudaStream_t stream) +{ + const int threadsperblock = 256; + + dim3 grid(threads/threadsperblock); + dim3 block(threadsperblock); + + lbry_sha256_gpu_hash_32 <<>> (threads, (uint64_t*) d_Hash); +} +#endif + __global__ -/*__launch_bounds__(256,3)*/ void lbry_sha256d_gpu_hash_112(const uint32_t threads, const uint32_t startNonce, const bool swabNonce, uint64_t *outputHash) { const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); @@ -447,106 +473,15 @@ void lbry_sha256d_gpu_hash_112(const uint32_t threads, const uint32_t startNonce } } -__global__ -/*__launch_bounds__(256,3)*/ -void lbry_sha256_gpu_hash_20x2(uint32_t threads, uint64_t *Hash512) -{ - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t __align__(8) buf[8]; // align for vectorize - #pragma unroll - for (int i=0;i<8;i++) buf[i] = c_H256[i]; - - uint32_t* input = (uint32_t*) (&Hash512[thread * 8U]); - - uint32_t dat[16]; - #pragma unroll - for (int i=0;i<5;i++) dat[i] = cuda_swab32(input[i]); - #pragma unroll - for (int i=0;i<5;i++) dat[i+5] = cuda_swab32(input[i+8]); - dat[10] = 0x80000000; - #pragma unroll - for (int i=11;i<15;i++) dat[i] = 0; - dat[15] = 0x140; - - sha256_round_body(dat, buf, c_K); - - // output - uint2* output = (uint2*) input; - #pragma unroll - for (int i=0;i<4;i++) { - //output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i])); - output[i] = vectorize(((uint64_t*)buf)[i]); - } -#ifdef PAD_ZEROS - #pragma unroll - for (int i=4; i<8; i++) output[i] = vectorize(0); -#endif - } -} - -__global__ -/*__launch_bounds__(256,3)*/ -void lbry_sha256d_gpu_hash_20x2(uint32_t threads, uint64_t *Hash512) +__host__ +void lbry_sha256d_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash, bool swabNonce, cudaStream_t stream) { - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - extern __shared__ uint32_t s_K[]; - if (threadIdx.x < 64U) s_K[threadIdx.x] = c_K[threadIdx.x]; - if (thread < threads) - { - uint32_t __align__(8) buf[8]; // align for vectorize - #pragma unroll - for (int i=0;i<8;i++) buf[i] = c_H256[i]; - - uint32_t* input = (uint32_t*) (&Hash512[thread * 8U]); - - uint32_t dat[16]; - #pragma unroll - for (int i=0; i<5; i++) dat[i] = cuda_swab32(input[i]); - #pragma unroll - for (int i=0; i<5; i++) dat[i+5] = cuda_swab32(input[i+8]); - dat[10] = 0x80000000; - #pragma unroll - for (int i=11;i<15;i++) dat[i] = 0; - dat[15] = 0x140; - - sha256_round_body(dat, buf, s_K); - - // second sha256 - - #pragma unroll - for (int i=0; i<8; i++) dat[i] = buf[i]; - dat[8] = 0x80000000; - #pragma unroll - for (int i=9; i<15; i++) dat[i] = 0; - dat[15] = 0x100; - - #pragma unroll - for (int i=0;i<8;i++) buf[i] = c_H256[i]; - - sha256_round_body(dat, buf, s_K); - - // output - uint2* output = (uint2*) input; + const int threadsperblock = 256; -#ifdef FULL_HASH - #pragma unroll - for (int i=0;i<4;i++) { - output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i])); - //output[i] = vectorize(((uint64_t*)buf)[i]); - } -# ifdef PAD_ZEROS - #pragma unroll - for (int i=4; i<8; i++) output[i] = vectorize(0); -# endif + dim3 grid(threads/threadsperblock); + dim3 block(threadsperblock); -#else - //input[6] = cuda_swab32(buf[6]); - //input[7] = cuda_swab32(buf[7]); - output[3] = vectorize(cuda_swab32ll(((uint64_t*)buf)[3])); -#endif - } + lbry_sha256d_gpu_hash_112 <<>> (threads, startNonce, swabNonce, (uint64_t*) d_outputHash); } __host__ @@ -578,85 +513,309 @@ void lbry_sha256_setBlock_112(uint32_t *pdata, uint32_t *ptarget) CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &ptarget[6], sizeof(uint64_t), 0, cudaMemcpyHostToDevice)); } -__host__ -void lbry_sha256_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash, bool swabNonce, cudaStream_t stream) -{ - const int threadsperblock = 256; +// ------------------------------------------------------------------------------------------ - dim3 grid(threads/threadsperblock); - dim3 block(threadsperblock); - lbry_sha256_gpu_hash_112 <<>> (threads, startNonce, swabNonce, (uint64_t*) d_outputHash); - cudaGetLastError(); -} +static __constant__ uint32_t c_IV[5] = { + 0x67452301u, 0xEFCDAB89u, 0x98BADCFEu, 0x10325476u, 0xC3D2E1F0u +}; -__host__ -void lbry_sha256_hash_32(int thr_id, uint32_t threads, uint32_t *d_Hash, cudaStream_t stream) -{ - const int threadsperblock = 256; +/* + * Round functions for RIPEMD-128 and RIPEMD-160. + */ +#if 1 +#define F1(x, y, z) ((x) ^ (y) ^ (z)) +#define F2(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) +#define F3(x, y, z) (((x) | ~(y)) ^ (z)) +#define F4(x, y, z) ((((x) ^ (y)) & (z)) ^ (y)) +#define F5(x, y, z) ((x) ^ ((y) | ~(z))) +#else +#define F1(x, y, z) xor3b(x,y,z) +#define F2(x, y, z) xandx(x,y,z) +#define F3(x, y, z) xornot64(x,y,z) +#define F4(x, y, z) xandx(z,x,y) +#define F5(x, y, z) xornt64(x,y,z) +#endif - dim3 grid(threads/threadsperblock); - dim3 block(threadsperblock); +/* + * Round constants for RIPEMD-160. + */ +#define K11 0x00000000u +#define K12 0x5A827999u +#define K13 0x6ED9EBA1u +#define K14 0x8F1BBCDCu +#define K15 0xA953FD4Eu + +#define K21 0x50A28BE6u +#define K22 0x5C4DD124u +#define K23 0x6D703EF3u +#define K24 0x7A6D76E9u +#define K25 0x00000000u + +#define RR(a, b, c, d, e, f, s, r, k) { \ + a = SPH_T32(ROTL32(SPH_T32(a + f(b, c, d) + r + k), s) + e); \ + c = ROTL32(c, 10); \ +} - lbry_sha256_gpu_hash_32 <<>> (threads, (uint64_t*) d_Hash); +#define ROUND1(a, b, c, d, e, f, s, r, k) \ + RR(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k) + +#define ROUND2(a, b, c, d, e, f, s, r, k) \ + RR(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k) + +#define RIPEMD160_ROUND_BODY(in, h) { \ + uint32_t A1, B1, C1, D1, E1; \ + uint32_t A2, B2, C2, D2, E2; \ + uint32_t tmp; \ +\ + A1 = A2 = h[0]; \ + B1 = B2 = h[1]; \ + C1 = C2 = h[2]; \ + D1 = D2 = h[3]; \ + E1 = E2 = h[4]; \ +\ + ROUND1(A, B, C, D, E, F1, 11, in[ 0], 1); \ + ROUND1(E, A, B, C, D, F1, 14, in[ 1], 1); \ + ROUND1(D, E, A, B, C, F1, 15, in[ 2], 1); \ + ROUND1(C, D, E, A, B, F1, 12, in[ 3], 1); \ + ROUND1(B, C, D, E, A, F1, 5, in[ 4], 1); \ + ROUND1(A, B, C, D, E, F1, 8, in[ 5], 1); \ + ROUND1(E, A, B, C, D, F1, 7, in[ 6], 1); \ + ROUND1(D, E, A, B, C, F1, 9, in[ 7], 1); \ + ROUND1(C, D, E, A, B, F1, 11, in[ 8], 1); \ + ROUND1(B, C, D, E, A, F1, 13, in[ 9], 1); \ + ROUND1(A, B, C, D, E, F1, 14, in[10], 1); \ + ROUND1(E, A, B, C, D, F1, 15, in[11], 1); \ + ROUND1(D, E, A, B, C, F1, 6, in[12], 1); \ + ROUND1(C, D, E, A, B, F1, 7, in[13], 1); \ + ROUND1(B, C, D, E, A, F1, 9, in[14], 1); \ + ROUND1(A, B, C, D, E, F1, 8, in[15], 1); \ +\ + ROUND1(E, A, B, C, D, F2, 7, in[ 7], 2); \ + ROUND1(D, E, A, B, C, F2, 6, in[ 4], 2); \ + ROUND1(C, D, E, A, B, F2, 8, in[13], 2); \ + ROUND1(B, C, D, E, A, F2, 13, in[ 1], 2); \ + ROUND1(A, B, C, D, E, F2, 11, in[10], 2); \ + ROUND1(E, A, B, C, D, F2, 9, in[ 6], 2); \ + ROUND1(D, E, A, B, C, F2, 7, in[15], 2); \ + ROUND1(C, D, E, A, B, F2, 15, in[ 3], 2); \ + ROUND1(B, C, D, E, A, F2, 7, in[12], 2); \ + ROUND1(A, B, C, D, E, F2, 12, in[ 0], 2); \ + ROUND1(E, A, B, C, D, F2, 15, in[ 9], 2); \ + ROUND1(D, E, A, B, C, F2, 9, in[ 5], 2); \ + ROUND1(C, D, E, A, B, F2, 11, in[ 2], 2); \ + ROUND1(B, C, D, E, A, F2, 7, in[14], 2); \ + ROUND1(A, B, C, D, E, F2, 13, in[11], 2); \ + ROUND1(E, A, B, C, D, F2, 12, in[ 8], 2); \ +\ + ROUND1(D, E, A, B, C, F3, 11, in[ 3], 3); \ + ROUND1(C, D, E, A, B, F3, 13, in[10], 3); \ + ROUND1(B, C, D, E, A, F3, 6, in[14], 3); \ + ROUND1(A, B, C, D, E, F3, 7, in[ 4], 3); \ + ROUND1(E, A, B, C, D, F3, 14, in[ 9], 3); \ + ROUND1(D, E, A, B, C, F3, 9, in[15], 3); \ + ROUND1(C, D, E, A, B, F3, 13, in[ 8], 3); \ + ROUND1(B, C, D, E, A, F3, 15, in[ 1], 3); \ + ROUND1(A, B, C, D, E, F3, 14, in[ 2], 3); \ + ROUND1(E, A, B, C, D, F3, 8, in[ 7], 3); \ + ROUND1(D, E, A, B, C, F3, 13, in[ 0], 3); \ + ROUND1(C, D, E, A, B, F3, 6, in[ 6], 3); \ + ROUND1(B, C, D, E, A, F3, 5, in[13], 3); \ + ROUND1(A, B, C, D, E, F3, 12, in[11], 3); \ + ROUND1(E, A, B, C, D, F3, 7, in[ 5], 3); \ + ROUND1(D, E, A, B, C, F3, 5, in[12], 3); \ +\ + ROUND1(C, D, E, A, B, F4, 11, in[ 1], 4); \ + ROUND1(B, C, D, E, A, F4, 12, in[ 9], 4); \ + ROUND1(A, B, C, D, E, F4, 14, in[11], 4); \ + ROUND1(E, A, B, C, D, F4, 15, in[10], 4); \ + ROUND1(D, E, A, B, C, F4, 14, in[ 0], 4); \ + ROUND1(C, D, E, A, B, F4, 15, in[ 8], 4); \ + ROUND1(B, C, D, E, A, F4, 9, in[12], 4); \ + ROUND1(A, B, C, D, E, F4, 8, in[ 4], 4); \ + ROUND1(E, A, B, C, D, F4, 9, in[13], 4); \ + ROUND1(D, E, A, B, C, F4, 14, in[ 3], 4); \ + ROUND1(C, D, E, A, B, F4, 5, in[ 7], 4); \ + ROUND1(B, C, D, E, A, F4, 6, in[15], 4); \ + ROUND1(A, B, C, D, E, F4, 8, in[14], 4); \ + ROUND1(E, A, B, C, D, F4, 6, in[ 5], 4); \ + ROUND1(D, E, A, B, C, F4, 5, in[ 6], 4); \ + ROUND1(C, D, E, A, B, F4, 12, in[ 2], 4); \ +\ + ROUND1(B, C, D, E, A, F5, 9, in[ 4], 5); \ + ROUND1(A, B, C, D, E, F5, 15, in[ 0], 5); \ + ROUND1(E, A, B, C, D, F5, 5, in[ 5], 5); \ + ROUND1(D, E, A, B, C, F5, 11, in[ 9], 5); \ + ROUND1(C, D, E, A, B, F5, 6, in[ 7], 5); \ + ROUND1(B, C, D, E, A, F5, 8, in[12], 5); \ + ROUND1(A, B, C, D, E, F5, 13, in[ 2], 5); \ + ROUND1(E, A, B, C, D, F5, 12, in[10], 5); \ + ROUND1(D, E, A, B, C, F5, 5, in[14], 5); \ + ROUND1(C, D, E, A, B, F5, 12, in[ 1], 5); \ + ROUND1(B, C, D, E, A, F5, 13, in[ 3], 5); \ + ROUND1(A, B, C, D, E, F5, 14, in[ 8], 5); \ + ROUND1(E, A, B, C, D, F5, 11, in[11], 5); \ + ROUND1(D, E, A, B, C, F5, 8, in[ 6], 5); \ + ROUND1(C, D, E, A, B, F5, 5, in[15], 5); \ + ROUND1(B, C, D, E, A, F5, 6, in[13], 5); \ +\ + ROUND2(A, B, C, D, E, F5, 8, in[ 5], 1); \ + ROUND2(E, A, B, C, D, F5, 9, in[14], 1); \ + ROUND2(D, E, A, B, C, F5, 9, in[ 7], 1); \ + ROUND2(C, D, E, A, B, F5, 11, in[ 0], 1); \ + ROUND2(B, C, D, E, A, F5, 13, in[ 9], 1); \ + ROUND2(A, B, C, D, E, F5, 15, in[ 2], 1); \ + ROUND2(E, A, B, C, D, F5, 15, in[11], 1); \ + ROUND2(D, E, A, B, C, F5, 5, in[ 4], 1); \ + ROUND2(C, D, E, A, B, F5, 7, in[13], 1); \ + ROUND2(B, C, D, E, A, F5, 7, in[ 6], 1); \ + ROUND2(A, B, C, D, E, F5, 8, in[15], 1); \ + ROUND2(E, A, B, C, D, F5, 11, in[ 8], 1); \ + ROUND2(D, E, A, B, C, F5, 14, in[ 1], 1); \ + ROUND2(C, D, E, A, B, F5, 14, in[10], 1); \ + ROUND2(B, C, D, E, A, F5, 12, in[ 3], 1); \ + ROUND2(A, B, C, D, E, F5, 6, in[12], 1); \ +\ + ROUND2(E, A, B, C, D, F4, 9, in[ 6], 2); \ + ROUND2(D, E, A, B, C, F4, 13, in[11], 2); \ + ROUND2(C, D, E, A, B, F4, 15, in[ 3], 2); \ + ROUND2(B, C, D, E, A, F4, 7, in[ 7], 2); \ + ROUND2(A, B, C, D, E, F4, 12, in[ 0], 2); \ + ROUND2(E, A, B, C, D, F4, 8, in[13], 2); \ + ROUND2(D, E, A, B, C, F4, 9, in[ 5], 2); \ + ROUND2(C, D, E, A, B, F4, 11, in[10], 2); \ + ROUND2(B, C, D, E, A, F4, 7, in[14], 2); \ + ROUND2(A, B, C, D, E, F4, 7, in[15], 2); \ + ROUND2(E, A, B, C, D, F4, 12, in[ 8], 2); \ + ROUND2(D, E, A, B, C, F4, 7, in[12], 2); \ + ROUND2(C, D, E, A, B, F4, 6, in[ 4], 2); \ + ROUND2(B, C, D, E, A, F4, 15, in[ 9], 2); \ + ROUND2(A, B, C, D, E, F4, 13, in[ 1], 2); \ + ROUND2(E, A, B, C, D, F4, 11, in[ 2], 2); \ +\ + ROUND2(D, E, A, B, C, F3, 9, in[15], 3); \ + ROUND2(C, D, E, A, B, F3, 7, in[ 5], 3); \ + ROUND2(B, C, D, E, A, F3, 15, in[ 1], 3); \ + ROUND2(A, B, C, D, E, F3, 11, in[ 3], 3); \ + ROUND2(E, A, B, C, D, F3, 8, in[ 7], 3); \ + ROUND2(D, E, A, B, C, F3, 6, in[14], 3); \ + ROUND2(C, D, E, A, B, F3, 6, in[ 6], 3); \ + ROUND2(B, C, D, E, A, F3, 14, in[ 9], 3); \ + ROUND2(A, B, C, D, E, F3, 12, in[11], 3); \ + ROUND2(E, A, B, C, D, F3, 13, in[ 8], 3); \ + ROUND2(D, E, A, B, C, F3, 5, in[12], 3); \ + ROUND2(C, D, E, A, B, F3, 14, in[ 2], 3); \ + ROUND2(B, C, D, E, A, F3, 13, in[10], 3); \ + ROUND2(A, B, C, D, E, F3, 13, in[ 0], 3); \ + ROUND2(E, A, B, C, D, F3, 7, in[ 4], 3); \ + ROUND2(D, E, A, B, C, F3, 5, in[13], 3); \ +\ + ROUND2(C, D, E, A, B, F2, 15, in[ 8], 4); \ + ROUND2(B, C, D, E, A, F2, 5, in[ 6], 4); \ + ROUND2(A, B, C, D, E, F2, 8, in[ 4], 4); \ + ROUND2(E, A, B, C, D, F2, 11, in[ 1], 4); \ + ROUND2(D, E, A, B, C, F2, 14, in[ 3], 4); \ + ROUND2(C, D, E, A, B, F2, 14, in[11], 4); \ + ROUND2(B, C, D, E, A, F2, 6, in[15], 4); \ + ROUND2(A, B, C, D, E, F2, 14, in[ 0], 4); \ + ROUND2(E, A, B, C, D, F2, 6, in[ 5], 4); \ + ROUND2(D, E, A, B, C, F2, 9, in[12], 4); \ + ROUND2(C, D, E, A, B, F2, 12, in[ 2], 4); \ + ROUND2(B, C, D, E, A, F2, 9, in[13], 4); \ + ROUND2(A, B, C, D, E, F2, 12, in[ 9], 4); \ + ROUND2(E, A, B, C, D, F2, 5, in[ 7], 4); \ + ROUND2(D, E, A, B, C, F2, 15, in[10], 4); \ + ROUND2(C, D, E, A, B, F2, 8, in[14], 4); \ +\ + ROUND2(B, C, D, E, A, F1, 8, in[12], 5); \ + ROUND2(A, B, C, D, E, F1, 5, in[15], 5); \ + ROUND2(E, A, B, C, D, F1, 12, in[10], 5); \ + ROUND2(D, E, A, B, C, F1, 9, in[ 4], 5); \ + ROUND2(C, D, E, A, B, F1, 12, in[ 1], 5); \ + ROUND2(B, C, D, E, A, F1, 5, in[ 5], 5); \ + ROUND2(A, B, C, D, E, F1, 14, in[ 8], 5); \ + ROUND2(E, A, B, C, D, F1, 6, in[ 7], 5); \ + ROUND2(D, E, A, B, C, F1, 8, in[ 6], 5); \ + ROUND2(C, D, E, A, B, F1, 13, in[ 2], 5); \ + ROUND2(B, C, D, E, A, F1, 6, in[13], 5); \ + ROUND2(A, B, C, D, E, F1, 5, in[14], 5); \ + ROUND2(E, A, B, C, D, F1, 15, in[ 0], 5); \ + ROUND2(D, E, A, B, C, F1, 13, in[ 3], 5); \ + ROUND2(C, D, E, A, B, F1, 11, in[ 9], 5); \ + ROUND2(B, C, D, E, A, F1, 11, in[11], 5); \ +\ + tmp = (h[1] + C1 + D2); \ + h[1] = (h[2] + D1 + E2); \ + h[2] = (h[3] + E1 + A2); \ + h[3] = (h[4] + A1 + B2); \ + h[4] = (h[0] + B1 + C2); \ + h[0] = tmp; \ } -__host__ -void lbry_sha256d_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash, bool swabNonce, cudaStream_t stream) +__global__ +__launch_bounds__(256,3) +void lbry_sha256d_gpu_hash_final(const uint32_t threads, const uint32_t startNonce, uint64_t *Hash512, uint32_t *resNonces) { - const int threadsperblock = 256; + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t* input = (uint32_t*) (&Hash512[thread * 8U]); - dim3 grid(threads/threadsperblock); - dim3 block(threadsperblock); + uint32_t __align__(8) buf[8]; // align for vectorize + uint32_t dat[16]; + #pragma unroll + for (int i=0; i<8; i++) + dat[i] = (input[i]); + dat[8] = 0x80; - lbry_sha256d_gpu_hash_112 <<>> (threads, startNonce, swabNonce, (uint64_t*) d_outputHash); -} + #pragma unroll + for (int i=9;i<16;i++) dat[i] = 0; -__host__ -void lbry_sha256_hash_20x2(int thr_id, uint32_t threads, uint32_t *d_Hash, cudaStream_t stream) -{ - const int threadsperblock = 256; + dat[14] = 0x100; // size in bits - dim3 grid(threads/threadsperblock); - dim3 block(threadsperblock); + uint32_t h[5]; + #pragma unroll + for (int i=0; i<5; i++) + h[i] = c_IV[i]; - lbry_sha256_gpu_hash_20x2 <<>> (threads, (uint64_t*) d_Hash); -} + RIPEMD160_ROUND_BODY(dat, h); -__host__ -void lbry_sha256d_hash_20x2(int thr_id, uint32_t threads, uint32_t *d_Hash, cudaStream_t stream) -{ - const int threadsperblock = 256; + #pragma unroll + for (int i=0; i<5; i++) + buf[i] = h[i]; - dim3 grid(threads/threadsperblock); - dim3 block(threadsperblock); + // second 32 bytes block hash - lbry_sha256d_gpu_hash_20x2 <<>> (threads, (uint64_t*) d_Hash); -} + #pragma unroll + for (int i=0; i<8; i++) + dat[i] = (input[8+i]); + dat[8] = 0x80; -__global__ -__launch_bounds__(256,3) -void lbry_sha256d_gpu_hash_final(const uint32_t threads, const uint32_t startNonce, uint64_t *Hash512, uint32_t *resNonces) -{ - const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); - if (thread < threads) - { - uint32_t __align__(8) buf[8]; // align for vectorize #pragma unroll - for (int i=0;i<8;i++) buf[i] = c_H256[i]; + for (int i=9;i<16;i++) dat[i] = 0; - uint32_t* input = (uint32_t*) (&Hash512[thread * 8U]); + dat[14] = 0x100; // size in bits - uint32_t __align__(8) dat[16]; #pragma unroll - for (int i=0;i<5;i++) dat[i] = cuda_swab32(input[i]); + for (int i=0; i<5; i++) + h[i] = c_IV[i]; + + RIPEMD160_ROUND_BODY(dat, h); + + // first final sha256 + + #pragma unroll + for (int i=0;i<5;i++) dat[i] = cuda_swab32(buf[i]); #pragma unroll - for (int i=0;i<5;i++) dat[i+5] = cuda_swab32(input[i+8]); + for (int i=0;i<5;i++) dat[i+5] = cuda_swab32(h[i]); dat[10] = 0x80000000; #pragma unroll for (int i=11;i<15;i++) dat[i] = 0; dat[15] = 0x140; + #pragma unroll + for (int i=0;i<8;i++) buf[i] = c_H256[i]; + sha256_round_body(dat, buf, c_K); // second sha256 diff --git a/lbry/lbry.cu b/lbry/lbry.cu index b81cc79..ecf272b 100644 --- a/lbry/lbry.cu +++ b/lbry/lbry.cu @@ -62,7 +62,6 @@ extern "C" void lbry_hash(void* output, const void* input) /* ############################################################################################################################### */ -extern void lbry_ripemd160_init(int thr_id); extern void lbry_sha256_init(int thr_id); extern void lbry_sha256_free(int thr_id); extern void lbry_sha256_setBlock_112(uint32_t *pdata, uint32_t *ptarget); @@ -71,9 +70,6 @@ extern void lbry_sha256d_hash_112(int thr_id, uint32_t threads, uint32_t startNo extern void lbry_sha256_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash, cudaStream_t stream); extern void lbry_sha512_init(int thr_id); extern void lbry_sha512_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash, cudaStream_t stream); -extern void lbry_ripemd160_hash_32x2(int thr_id, uint32_t threads, uint32_t *g_Hash, cudaStream_t stream); -extern void lbry_sha256_hash_20x2(int thr_id, uint32_t threads, uint32_t *g_Hash, cudaStream_t stream); -extern void lbry_sha256d_hash_20x2(int thr_id, uint32_t threads, uint32_t *g_Hash, cudaStream_t stream); extern void lbry_sha256d_hash_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_inputHash, uint32_t *resNonces, cudaStream_t stream); static __inline uint32_t swab32_if(uint32_t val, bool iftrue) { @@ -122,7 +118,6 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, lbry_sha256_init(thr_id); lbry_sha512_init(thr_id); - lbry_ripemd160_init(thr_id); cuda_check_cpu_init(thr_id, throughput); CUDA_LOG_ERROR(); @@ -149,17 +144,10 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, lbry_sha512_hash_32(thr_id, throughput, d_hash[thr_id], 0); - lbry_ripemd160_hash_32x2(thr_id, throughput, d_hash[thr_id], 0); - - #if 0 - lbry_sha256d_hash_20x2(thr_id, throughput, d_hash[thr_id], 0); - uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id]); - #else uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX }; lbry_sha256d_hash_final(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id], resNonces, 0); - uint32_t foundNonce = resNonces[0]; - #endif + uint32_t foundNonce = resNonces[0]; *hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce + throughput; if (foundNonce != UINT32_MAX) @@ -170,11 +158,9 @@ extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { int res = 1; uint32_t secNonce = resNonces[1]; - //uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id], 1); work->nonces[0] = swab32_if(foundNonce, swap); work_set_target_ratio(work, vhash); if (secNonce != UINT32_MAX) { - //if (secNonce) { if (opt_debug) gpulog(LOG_BLUE, thr_id, "found second nonce %08x", swab32(secNonce)); endiandata[LBC_NONCE_OFT32] = swab32_if(secNonce, !swap);