diff --git a/Makefile.am b/Makefile.am index 0050b06..7a8fd47 100644 --- a/Makefile.am +++ b/Makefile.am @@ -50,6 +50,8 @@ ccminer_SOURCES = elist.h miner.h compat.h \ sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \ sph/hamsi.c sph/hamsi_helper.c sph/streebog.c \ sph/shabal.c sph/whirlpool.c sph/sha2big.c sph/haval.c \ + sph/ripemd.c sph/sph_sha2.c \ + lbry/lbry.cu lbry/cuda_ripemd160.cu lbry/cuda_sha256_lbry.cu lbry/cuda_sha512_lbry.cu \ qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \ x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \ x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \ diff --git a/README.txt b/README.txt index 4f2bb58..f435c05 100644 --- a/README.txt +++ b/README.txt @@ -1,5 +1,5 @@ -ccMiner preview 1.8-dev (May 2016) "Pascal and x11evo algo" +ccMiner 1.8 Preview (July 2016) "CUDA 8, lbry and x11evo algos" --------------------------------------------------------------- *************************************************************** @@ -28,6 +28,7 @@ Decred (Blake256 14-rounds - 180 bytes) HeavyCoin & MjollnirCoin FugueCoin GroestlCoin & Myriad-Groestl +Lbry Credits JackpotCoin QuarkCoin family & AnimeCoin TalkCoin @@ -35,13 +36,13 @@ DarkCoin and other X11 coins Chaincoin and Flaxscript (C11) Saffroncoin blake (256 14-rounds) BlakeCoin (256 8-rounds) -Midnight (BMW 256) Qubit (Digibyte, ...) Luffa (Joincoin) Keccak (Maxcoin) Pentablake (Blake 512 x5) 1Coin Triple S Neoscrypt (FeatherCoin) +Revolver (X11evo) Scrypt and Scrypt:N Scrypt-Jane (Chacha) Sibcoin (sib) @@ -82,6 +83,7 @@ its command line interface and options. heavy use to mine Heavycoin jackpot use to mine Jackpotcoin keccak use to mine Maxcoin + lbry use to mine LBRY Credits luffa use to mine Joincoin lyra2 use to mine Vertcoin mjollnir use to mine Mjollnircoin @@ -151,6 +153,7 @@ its command line interface and options. --max-diff=N Only mine if net difficulty is less than specified value --pstate=0 will force the Geforce 9xx to run in P0 P-State --plimit=150W set the gpu power limit, allow multiple values for N cards + --tlimit=85 Set the gpu thermal limit (windows only) --keep-clocks prevent reset clocks and/or power limit on exit --show-diff display submitted block and net difficulty -B, --background run the miner in the background @@ -242,6 +245,7 @@ features. July 2016 v1.8.0 Pascal support with cuda 8 + lbry new multi sha / ripemd algo (LBC) x11evo algo (XRE) Lyra2v2, Neoscrypt and Decred improvements Enhance windows NVAPI clock and power limits diff --git a/algos.h b/algos.h index a946b76..5184268 100644 --- a/algos.h +++ b/algos.h @@ -19,6 +19,7 @@ enum sha_algos { ALGO_HEAVY, /* Heavycoin hash */ ALGO_KECCAK, ALGO_JACKPOT, + ALGO_LBRY, ALGO_LUFFA, ALGO_LYRA2, ALGO_LYRA2v2, @@ -67,6 +68,7 @@ static const char *algo_names[] = { "heavy", "keccak", "jackpot", + "lbry", "luffa", "lyra2", "lyra2v2", diff --git a/ccminer.cpp b/ccminer.cpp index 9716db1..2f4bbef 100644 --- a/ccminer.cpp +++ b/ccminer.cpp @@ -228,6 +228,7 @@ Options:\n\ heavy Heavycoin\n\ jackpot Jackpot\n\ keccak Keccak-256 (Maxcoin)\n\ + lbry LBRY Credits (Sha/Ripemd)\n\ luffa Joincoin\n\ lyra2 LyraBar\n\ lyra2v2 VertCoin\n\ @@ -567,6 +568,7 @@ static void calc_network_diff(struct work *work) // sample for diff 43.281 : 1c05ea29 // todo: endian reversed on longpoll could be zr5 specific... uint32_t nbits = have_longpoll ? 
work->data[18] : swab32(work->data[18]); + if (opt_algo == ALGO_LBRY) nbits = swab32(work->data[26]); if (opt_algo == ALGO_DECRED) nbits = work->data[29]; uint32_t bits = (nbits & 0xffffff); int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28 @@ -837,6 +839,11 @@ static bool submit_upstream_work(CURL *curl, struct work *work) le32enc(&nonce, work->data[19]); be16enc(&nvote, *((uint16_t*)&work->data[20])); break; + case ALGO_LBRY: + check_dups = true; + le32enc(&ntime, work->data[25]); + le32enc(&nonce, work->data[27]); + break; case ALGO_ZR5: check_dups = true; be32enc(&ntime, work->data[17]); @@ -1296,6 +1303,8 @@ bool get_work(struct thr_info *thr, struct work *work) memset(work->data + 19, 0x00, 52); if (opt_algo == ALGO_DECRED) { memset(&work->data[35], 0x00, 52); + } else if (opt_algo == ALGO_LBRY) { + work->data[28] = 0x80000000; } else { work->data[20] = 0x80000000; work->data[31] = 0x00000280; @@ -1441,6 +1450,14 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) work->data[37] = (rand()*4) << 8; // random work data sctx->job.height = work->data[32]; //applog_hex(work->data, 180); + } else if (opt_algo == ALGO_LBRY) { + for (i = 0; i < 8; i++) + work->data[9 + i] = be32dec((uint32_t *)merkle_root + i); + for (i = 0; i < 8; i++) + work->data[17 + i] = ((uint32_t*)sctx->job.claim)[i]; + work->data[25] = le32dec(sctx->job.ntime); + work->data[26] = le32dec(sctx->job.nbits); + work->data[28] = 0x80000000; } else { for (i = 0; i < 8; i++) work->data[9 + i] = be32dec((uint32_t *)merkle_root + i); @@ -1498,6 +1515,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) case ALGO_FRESH: case ALGO_FUGUE256: case ALGO_GROESTL: + case ALGO_LBRY: case ALGO_LYRA2v2: work_set_target(work, sctx->job.diff / (256.0 * opt_difficulty)); break; @@ -1658,6 +1676,7 @@ static void *miner_thread(void *userdata) // &work.data[19] int wcmplen = (opt_algo == ALGO_DECRED) ? 
140 : 76; + if (opt_algo == ALGO_LBRY) wcmplen = 108; int wcmpoft = 0; uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen); @@ -1910,6 +1929,7 @@ static void *miner_thread(void *userdata) minmax = 0x40000000U; break; case ALGO_KECCAK: + case ALGO_LBRY: case ALGO_LUFFA: case ALGO_SKEIN: case ALGO_SKEIN2: @@ -2035,6 +2055,9 @@ static void *miner_thread(void *userdata) case ALGO_JACKPOT: rc = scanhash_jackpot(thr_id, &work, max_nonce, &hashes_done); break; + case ALGO_LBRY: + rc = scanhash_lbry(thr_id, &work, max_nonce, &hashes_done); + break; case ALGO_LUFFA: rc = scanhash_luffa(thr_id, &work, max_nonce, &hashes_done); break; @@ -2130,7 +2153,7 @@ static void *miner_thread(void *userdata) // todo: update all algos to use work->nonces work.nonces[0] = nonceptr[0]; - if (opt_algo != ALGO_DECRED && opt_algo != ALGO_BLAKE2S) { + if (opt_algo != ALGO_DECRED && opt_algo != ALGO_BLAKE2S && opt_algo != ALGO_LBRY) { work.nonces[1] = nonceptr[2]; } diff --git a/ccminer.vcxproj b/ccminer.vcxproj index 4d7b794..b304366 100644 --- a/ccminer.vcxproj +++ b/ccminer.vcxproj @@ -115,7 +115,7 @@ 80 true true - compute_50,sm_50 + compute_61,sm_61;compute_52,sm_52 $(NVTOOLSEXT_PATH)\include;..\..\..\Common\C99 64 @@ -273,15 +273,11 @@ - - true - - - true - + + @@ -428,6 +424,10 @@ 92 + + + + 80 --ptxas-options="-dlcm=cg" %(AdditionalOptions) diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters index 5decd0d..ad3842f 100644 --- a/ccminer.vcxproj.filters +++ b/ccminer.vcxproj.filters @@ -82,6 +82,9 @@ {1613763f-895c-4321-b58b-6f5849868956} + + {3079ea1f-f768-455a-acd6-f517fac535b4} + @@ -150,9 +153,6 @@ Source Files\sph - - Source Files\sph - Source Files\sph @@ -180,7 +180,10 @@ Source Files\sph - + + Source Files\sph + + Source Files\sph @@ -721,6 +724,18 @@ Source Files\CUDA\Algo256 + + Source Files\CUDA\lbry + + + Source Files\CUDA\lbry + + + Source Files\CUDA\lbry + + + Source Files\CUDA\lbry + @@ -737,4 +752,4 @@ Ressources - \ No newline at end of file + diff --git a/lbry/cuda_ripemd160.cu b/lbry/cuda_ripemd160.cu new file mode 100644 index 0000000..1ae7f35 --- /dev/null +++ b/lbry/cuda_ripemd160.cu @@ -0,0 +1,441 @@ +/* + * ripemd-160 kernel implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2014, 2016 djm34, tpruvot + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + */ +#include +#include +#include + +#include + +static __constant__ uint32_t c_IV[5] = { + 0x67452301u, 0xEFCDAB89u, 0x98BADCFEu, 0x10325476u, 0xC3D2E1F0u +}; + +//__host__ +//uint64_t xornot64(uint64_t a, uint64_t b, uint64_t c) { +// return c ^ (a | !b); +//} + +__forceinline__ __device__ +uint64_t xornot64(uint64_t a, uint64_t b, uint64_t c) +{ + uint64_t result; + asm("{ .reg .u64 m,n; // xornot64\n\t" + "not.b64 m,%2; \n\t" + "or.b64 n, %1,m;\n\t" + "xor.b64 %0, n,%3;\n\t" + "}\n\t" + : "=l"(result) : "l"(a), "l"(b), "l"(c)); + return result; +} + +//__host__ +//uint64_t xornt64(uint64_t a, uint64_t b, uint64_t c) { +// return a ^ (b | !c); +//} + +__device__ __forceinline__ +uint64_t xornt64(uint64_t a, uint64_t b, uint64_t c) +{ + uint64_t result; + asm("{ .reg .u64 m,n; // xornt64\n\t" + "not.b64 m,%3; \n\t" + "or.b64 n, %2,m;\n\t" + "xor.b64 %0, %1,n;\n\t" + "}\n\t" + : "=l"(result) : "l"(a), "l"(b), "l"(c)); + return result; +} + +/* + * Round functions for RIPEMD-128 and RIPEMD-160. + */ +#if 1 +#define F1(x, y, z) ((x) ^ (y) ^ (z)) +#define F2(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) +#define F3(x, y, z) (((x) | ~(y)) ^ (z)) +#define F4(x, y, z) ((((x) ^ (y)) & (z)) ^ (y)) +#define F5(x, y, z) ((x) ^ ((y) | ~(z))) +#else +#define F1(x, y, z) xor3(x,y,z) +#define F2(x, y, z) xandx(x,y,z) +#define F3(x, y, z) xornot64(x,y,z) +#define F4(x, y, z) xandx(z,x,y) +#define F5(x, y, z) xornt64(x,y,z) +#endif + +/* + * Round constants for RIPEMD-160. + */ +#define K11 0x00000000u +#define K12 0x5A827999u +#define K13 0x6ED9EBA1u +#define K14 0x8F1BBCDCu +#define K15 0xA953FD4Eu + +#define K21 0x50A28BE6u +#define K22 0x5C4DD124u +#define K23 0x6D703EF3u +#define K24 0x7A6D76E9u +#define K25 0x00000000u + +#define RR(a, b, c, d, e, f, s, r, k) { \ + a = SPH_T32(ROTL32(SPH_T32(a + f(b, c, d) + r + k), s) + e); \ + c = ROTL32(c, 10); \ +} + +#define ROUND1(a, b, c, d, e, f, s, r, k) \ + RR(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k) + +#define ROUND2(a, b, c, d, e, f, s, r, k) \ + RR(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k) + +#define RIPEMD160_ROUND_BODY(in, h) { \ + uint32_t A1, B1, C1, D1, E1; \ + uint32_t A2, B2, C2, D2, E2; \ + uint32_t tmp; \ +\ + A1 = A2 = h[0]; \ + B1 = B2 = h[1]; \ + C1 = C2 = h[2]; \ + D1 = D2 = h[3]; \ + E1 = E2 = h[4]; \ +\ + ROUND1(A, B, C, D, E, F1, 11, in[ 0], 1); \ + ROUND1(E, A, B, C, D, F1, 14, in[ 1], 1); \ + ROUND1(D, E, A, B, C, F1, 15, in[ 2], 1); \ + ROUND1(C, D, E, A, B, F1, 12, in[ 3], 1); \ + ROUND1(B, C, D, E, A, F1, 5, in[ 4], 1); \ + ROUND1(A, B, C, D, E, F1, 8, in[ 5], 1); \ + ROUND1(E, A, B, C, D, F1, 7, in[ 6], 1); \ + ROUND1(D, E, A, B, C, F1, 9, in[ 7], 1); \ + ROUND1(C, D, E, A, B, F1, 11, in[ 8], 1); \ + ROUND1(B, C, D, E, A, F1, 13, in[ 9], 1); \ + ROUND1(A, B, C, D, E, F1, 14, in[10], 1); \ + ROUND1(E, A, B, C, D, F1, 15, in[11], 1); \ + ROUND1(D, E, A, B, C, F1, 6, in[12], 1); \ + ROUND1(C, D, E, A, B, F1, 7, in[13], 1); \ + ROUND1(B, C, D, E, A, F1, 9, in[14], 1); \ + ROUND1(A, B, C, D, E, F1, 8, in[15], 1); \ +\ + ROUND1(E, A, B, C, D, F2, 7, in[ 7], 2); \ + ROUND1(D, E, A, B, C, F2, 6, in[ 4], 2); \ + ROUND1(C, D, E, A, B, F2, 8, in[13], 2); \ + ROUND1(B, C, D, E, A, F2, 13, in[ 1], 2); \ + ROUND1(A, B, C, D, E, F2, 11, in[10], 2); \ + ROUND1(E, A, B, C, D, F2, 9, in[ 6], 2); \ + ROUND1(D, E, A, B, C, F2, 7, in[15], 2); \ + ROUND1(C, D, E, A, B, F2, 15, in[ 3], 2); \ + ROUND1(B, C, D, E, A, F2, 7, in[12], 2); \ + ROUND1(A, 
B, C, D, E, F2, 12, in[ 0], 2); \ + ROUND1(E, A, B, C, D, F2, 15, in[ 9], 2); \ + ROUND1(D, E, A, B, C, F2, 9, in[ 5], 2); \ + ROUND1(C, D, E, A, B, F2, 11, in[ 2], 2); \ + ROUND1(B, C, D, E, A, F2, 7, in[14], 2); \ + ROUND1(A, B, C, D, E, F2, 13, in[11], 2); \ + ROUND1(E, A, B, C, D, F2, 12, in[ 8], 2); \ +\ + ROUND1(D, E, A, B, C, F3, 11, in[ 3], 3); \ + ROUND1(C, D, E, A, B, F3, 13, in[10], 3); \ + ROUND1(B, C, D, E, A, F3, 6, in[14], 3); \ + ROUND1(A, B, C, D, E, F3, 7, in[ 4], 3); \ + ROUND1(E, A, B, C, D, F3, 14, in[ 9], 3); \ + ROUND1(D, E, A, B, C, F3, 9, in[15], 3); \ + ROUND1(C, D, E, A, B, F3, 13, in[ 8], 3); \ + ROUND1(B, C, D, E, A, F3, 15, in[ 1], 3); \ + ROUND1(A, B, C, D, E, F3, 14, in[ 2], 3); \ + ROUND1(E, A, B, C, D, F3, 8, in[ 7], 3); \ + ROUND1(D, E, A, B, C, F3, 13, in[ 0], 3); \ + ROUND1(C, D, E, A, B, F3, 6, in[ 6], 3); \ + ROUND1(B, C, D, E, A, F3, 5, in[13], 3); \ + ROUND1(A, B, C, D, E, F3, 12, in[11], 3); \ + ROUND1(E, A, B, C, D, F3, 7, in[ 5], 3); \ + ROUND1(D, E, A, B, C, F3, 5, in[12], 3); \ +\ + ROUND1(C, D, E, A, B, F4, 11, in[ 1], 4); \ + ROUND1(B, C, D, E, A, F4, 12, in[ 9], 4); \ + ROUND1(A, B, C, D, E, F4, 14, in[11], 4); \ + ROUND1(E, A, B, C, D, F4, 15, in[10], 4); \ + ROUND1(D, E, A, B, C, F4, 14, in[ 0], 4); \ + ROUND1(C, D, E, A, B, F4, 15, in[ 8], 4); \ + ROUND1(B, C, D, E, A, F4, 9, in[12], 4); \ + ROUND1(A, B, C, D, E, F4, 8, in[ 4], 4); \ + ROUND1(E, A, B, C, D, F4, 9, in[13], 4); \ + ROUND1(D, E, A, B, C, F4, 14, in[ 3], 4); \ + ROUND1(C, D, E, A, B, F4, 5, in[ 7], 4); \ + ROUND1(B, C, D, E, A, F4, 6, in[15], 4); \ + ROUND1(A, B, C, D, E, F4, 8, in[14], 4); \ + ROUND1(E, A, B, C, D, F4, 6, in[ 5], 4); \ + ROUND1(D, E, A, B, C, F4, 5, in[ 6], 4); \ + ROUND1(C, D, E, A, B, F4, 12, in[ 2], 4); \ +\ + ROUND1(B, C, D, E, A, F5, 9, in[ 4], 5); \ + ROUND1(A, B, C, D, E, F5, 15, in[ 0], 5); \ + ROUND1(E, A, B, C, D, F5, 5, in[ 5], 5); \ + ROUND1(D, E, A, B, C, F5, 11, in[ 9], 5); \ + ROUND1(C, D, E, A, B, F5, 6, in[ 7], 5); \ + ROUND1(B, C, D, E, A, F5, 8, in[12], 5); \ + ROUND1(A, B, C, D, E, F5, 13, in[ 2], 5); \ + ROUND1(E, A, B, C, D, F5, 12, in[10], 5); \ + ROUND1(D, E, A, B, C, F5, 5, in[14], 5); \ + ROUND1(C, D, E, A, B, F5, 12, in[ 1], 5); \ + ROUND1(B, C, D, E, A, F5, 13, in[ 3], 5); \ + ROUND1(A, B, C, D, E, F5, 14, in[ 8], 5); \ + ROUND1(E, A, B, C, D, F5, 11, in[11], 5); \ + ROUND1(D, E, A, B, C, F5, 8, in[ 6], 5); \ + ROUND1(C, D, E, A, B, F5, 5, in[15], 5); \ + ROUND1(B, C, D, E, A, F5, 6, in[13], 5); \ +\ + ROUND2(A, B, C, D, E, F5, 8, in[ 5], 1); \ + ROUND2(E, A, B, C, D, F5, 9, in[14], 1); \ + ROUND2(D, E, A, B, C, F5, 9, in[ 7], 1); \ + ROUND2(C, D, E, A, B, F5, 11, in[ 0], 1); \ + ROUND2(B, C, D, E, A, F5, 13, in[ 9], 1); \ + ROUND2(A, B, C, D, E, F5, 15, in[ 2], 1); \ + ROUND2(E, A, B, C, D, F5, 15, in[11], 1); \ + ROUND2(D, E, A, B, C, F5, 5, in[ 4], 1); \ + ROUND2(C, D, E, A, B, F5, 7, in[13], 1); \ + ROUND2(B, C, D, E, A, F5, 7, in[ 6], 1); \ + ROUND2(A, B, C, D, E, F5, 8, in[15], 1); \ + ROUND2(E, A, B, C, D, F5, 11, in[ 8], 1); \ + ROUND2(D, E, A, B, C, F5, 14, in[ 1], 1); \ + ROUND2(C, D, E, A, B, F5, 14, in[10], 1); \ + ROUND2(B, C, D, E, A, F5, 12, in[ 3], 1); \ + ROUND2(A, B, C, D, E, F5, 6, in[12], 1); \ +\ + ROUND2(E, A, B, C, D, F4, 9, in[ 6], 2); \ + ROUND2(D, E, A, B, C, F4, 13, in[11], 2); \ + ROUND2(C, D, E, A, B, F4, 15, in[ 3], 2); \ + ROUND2(B, C, D, E, A, F4, 7, in[ 7], 2); \ + ROUND2(A, B, C, D, E, F4, 12, in[ 0], 2); \ + ROUND2(E, A, B, C, D, F4, 8, in[13], 2); \ + ROUND2(D, E, A, B, C, F4, 9, in[ 5], 2); \ + 
ROUND2(C, D, E, A, B, F4, 11, in[10], 2); \ + ROUND2(B, C, D, E, A, F4, 7, in[14], 2); \ + ROUND2(A, B, C, D, E, F4, 7, in[15], 2); \ + ROUND2(E, A, B, C, D, F4, 12, in[ 8], 2); \ + ROUND2(D, E, A, B, C, F4, 7, in[12], 2); \ + ROUND2(C, D, E, A, B, F4, 6, in[ 4], 2); \ + ROUND2(B, C, D, E, A, F4, 15, in[ 9], 2); \ + ROUND2(A, B, C, D, E, F4, 13, in[ 1], 2); \ + ROUND2(E, A, B, C, D, F4, 11, in[ 2], 2); \ +\ + ROUND2(D, E, A, B, C, F3, 9, in[15], 3); \ + ROUND2(C, D, E, A, B, F3, 7, in[ 5], 3); \ + ROUND2(B, C, D, E, A, F3, 15, in[ 1], 3); \ + ROUND2(A, B, C, D, E, F3, 11, in[ 3], 3); \ + ROUND2(E, A, B, C, D, F3, 8, in[ 7], 3); \ + ROUND2(D, E, A, B, C, F3, 6, in[14], 3); \ + ROUND2(C, D, E, A, B, F3, 6, in[ 6], 3); \ + ROUND2(B, C, D, E, A, F3, 14, in[ 9], 3); \ + ROUND2(A, B, C, D, E, F3, 12, in[11], 3); \ + ROUND2(E, A, B, C, D, F3, 13, in[ 8], 3); \ + ROUND2(D, E, A, B, C, F3, 5, in[12], 3); \ + ROUND2(C, D, E, A, B, F3, 14, in[ 2], 3); \ + ROUND2(B, C, D, E, A, F3, 13, in[10], 3); \ + ROUND2(A, B, C, D, E, F3, 13, in[ 0], 3); \ + ROUND2(E, A, B, C, D, F3, 7, in[ 4], 3); \ + ROUND2(D, E, A, B, C, F3, 5, in[13], 3); \ +\ + ROUND2(C, D, E, A, B, F2, 15, in[ 8], 4); \ + ROUND2(B, C, D, E, A, F2, 5, in[ 6], 4); \ + ROUND2(A, B, C, D, E, F2, 8, in[ 4], 4); \ + ROUND2(E, A, B, C, D, F2, 11, in[ 1], 4); \ + ROUND2(D, E, A, B, C, F2, 14, in[ 3], 4); \ + ROUND2(C, D, E, A, B, F2, 14, in[11], 4); \ + ROUND2(B, C, D, E, A, F2, 6, in[15], 4); \ + ROUND2(A, B, C, D, E, F2, 14, in[ 0], 4); \ + ROUND2(E, A, B, C, D, F2, 6, in[ 5], 4); \ + ROUND2(D, E, A, B, C, F2, 9, in[12], 4); \ + ROUND2(C, D, E, A, B, F2, 12, in[ 2], 4); \ + ROUND2(B, C, D, E, A, F2, 9, in[13], 4); \ + ROUND2(A, B, C, D, E, F2, 12, in[ 9], 4); \ + ROUND2(E, A, B, C, D, F2, 5, in[ 7], 4); \ + ROUND2(D, E, A, B, C, F2, 15, in[10], 4); \ + ROUND2(C, D, E, A, B, F2, 8, in[14], 4); \ +\ + ROUND2(B, C, D, E, A, F1, 8, in[12], 5); \ + ROUND2(A, B, C, D, E, F1, 5, in[15], 5); \ + ROUND2(E, A, B, C, D, F1, 12, in[10], 5); \ + ROUND2(D, E, A, B, C, F1, 9, in[ 4], 5); \ + ROUND2(C, D, E, A, B, F1, 12, in[ 1], 5); \ + ROUND2(B, C, D, E, A, F1, 5, in[ 5], 5); \ + ROUND2(A, B, C, D, E, F1, 14, in[ 8], 5); \ + ROUND2(E, A, B, C, D, F1, 6, in[ 7], 5); \ + ROUND2(D, E, A, B, C, F1, 8, in[ 6], 5); \ + ROUND2(C, D, E, A, B, F1, 13, in[ 2], 5); \ + ROUND2(B, C, D, E, A, F1, 6, in[13], 5); \ + ROUND2(A, B, C, D, E, F1, 5, in[14], 5); \ + ROUND2(E, A, B, C, D, F1, 15, in[ 0], 5); \ + ROUND2(D, E, A, B, C, F1, 13, in[ 3], 5); \ + ROUND2(C, D, E, A, B, F1, 11, in[ 9], 5); \ + ROUND2(B, C, D, E, A, F1, 11, in[11], 5); \ +\ + tmp = (h[1] + C1 + D2); \ + h[1] = (h[2] + D1 + E2); \ + h[2] = (h[3] + E1 + A2); \ + h[3] = (h[4] + A1 + B2); \ + h[4] = (h[0] + B1 + C2); \ + h[0] = tmp; \ +} + +#if 0 +__global__ +void lbry_ripemd160_gpu_hash_32(const uint32_t threads, uint64_t *g_hash, const uint32_t byteOffset) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t *hash = (uint32_t*) (&g_hash[thread * 8U + byteOffset/8]); + + uint32_t in[16]; + for (int i=0; i<8; i++) + in[i] = (hash[i]); + in[8] = 0x80; + + #pragma unroll + for (int i=9;i<16;i++) in[i] = 0; + + in[14] = 0x100; // size in bits + + uint32_t h[5]; + #pragma unroll + for (int i=0; i<5; i++) + h[i] = c_IV[i]; + + RIPEMD160_ROUND_BODY(in, h); + + #pragma unroll + for (int i=0; i<5; i++) + hash[i] = h[i]; + +#ifdef PAD_ZEROS + // 20 bytes hash on 32 or 64 bytes output space + hash[5] = 0; + hash[6] = 0; + hash[7] = 0; +#endif + } +} + +__host__ 
+void lbry_ripemd160_hash_32(int thr_id, uint32_t threads, uint32_t *g_Hash, uint32_t byteOffset, cudaStream_t stream) +{ + const uint32_t threadsperblock = 128; + + dim3 grid(threads/threadsperblock); + dim3 block(threadsperblock); + + lbry_ripemd160_gpu_hash_32 <<>> (threads, (uint64_t*) g_Hash, byteOffset); +} +#endif + +__global__ +//__launch_bounds__(256,6) +void lbry_ripemd160_gpu_hash_32x2(const uint32_t threads, uint64_t *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t *hash = (uint32_t*) (&g_hash[thread * 8U]); + + uint32_t in[16]; + for (int i=0; i<8; i++) + in[i] = (hash[i]); + in[8] = 0x80; + + #pragma unroll + for (int i=9;i<16;i++) in[i] = 0; + + in[14] = 0x100; // size in bits + + uint32_t h[5]; + #pragma unroll + for (int i=0; i<5; i++) + h[i] = c_IV[i]; + + RIPEMD160_ROUND_BODY(in, h); + + #pragma unroll + for (int i=0; i<5; i++) + hash[i] = h[i]; + +#ifdef PAD_ZEROS + // 20 bytes hash on 32 output space + hash[5] = 0; + hash[6] = 0; + hash[7] = 0; +#endif + // second 32 bytes block hash + hash += 8; + + #pragma unroll + for (int i=0; i<8; i++) + in[i] = (hash[i]); + in[8] = 0x80; + + #pragma unroll + for (int i=9;i<16;i++) in[i] = 0; + + in[14] = 0x100; // size in bits + + #pragma unroll + for (int i=0; i<5; i++) + h[i] = c_IV[i]; + + RIPEMD160_ROUND_BODY(in, h); + + #pragma unroll + for (int i=0; i<5; i++) + hash[i] = h[i]; + +#ifdef PAD_ZEROS + // 20 bytes hash on 32 output space + hash[5] = 0; + hash[6] = 0; + hash[7] = 0; +#endif + } +} + +__host__ +void lbry_ripemd160_hash_32x2(int thr_id, uint32_t threads, uint32_t *g_Hash, cudaStream_t stream) +{ + const uint32_t threadsperblock = 128; + + dim3 grid(threads/threadsperblock); + dim3 block(threadsperblock); + + lbry_ripemd160_gpu_hash_32x2 <<>> (threads, (uint64_t*) g_Hash); +} + +void lbry_ripemd160_init(int thr_id) +{ + //cudaMemcpyToSymbol(c_IV, IV, sizeof(IV), 0, cudaMemcpyHostToDevice); +} diff --git a/lbry/cuda_sha256_lbry.cu b/lbry/cuda_sha256_lbry.cu new file mode 100644 index 0000000..db0c41c --- /dev/null +++ b/lbry/cuda_sha256_lbry.cu @@ -0,0 +1,712 @@ +/* + * sha256 CUDA implementation. 
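+ * Kernels for the lbry 112-byte header (single and double sha256), the 32-byte and
+ * 20x2-byte follow-up inputs, and the final on-GPU target check.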
+ */ +#include +#include +#include + +#include +#include + +__constant__ static uint32_t __align__(8) c_midstate112[8]; +__constant__ static uint32_t __align__(8) c_dataEnd112[12]; + +const __constant__ uint32_t __align__(8) c_H256[8] = { + 0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU, + 0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U +}; +__constant__ static uint32_t __align__(8) c_K[64]; + +static __thread uint32_t* d_resNonces; +__constant__ static uint32_t __align__(8) c_target[2]; +__device__ uint64_t d_target[1]; + +// ------------------------------------------------------------------------------------------------ + +static const uint32_t cpu_H256[8] = { + 0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU, + 0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U +}; + +static const uint32_t cpu_K[64] = { + 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5, + 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174, + 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA, + 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967, + 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85, + 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070, + 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3, + 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2 +}; + +#define ROTR ROTR32 + +__host__ +static void sha256_step1_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, + uint32_t e, uint32_t f, uint32_t g, uint32_t &h, + uint32_t in, const uint32_t Kshared) +{ + uint32_t t1,t2; + uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); + uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e); + uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a); + uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c); + + t1 = h + bsg21 + vxandx + Kshared + in; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + +__host__ +static void sha256_step2_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, + uint32_t e, uint32_t f, uint32_t g, uint32_t &h, + uint32_t* in, uint32_t pc, const uint32_t Kshared) +{ + uint32_t t1,t2; + + int pcidx1 = (pc-2) & 0xF; + int pcidx2 = (pc-7) & 0xF; + int pcidx3 = (pc-15) & 0xF; + + uint32_t inx0 = in[pc]; + uint32_t inx1 = in[pcidx1]; + uint32_t inx2 = in[pcidx2]; + uint32_t inx3 = in[pcidx3]; + + uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1); + uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3); + uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g); + uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e); + uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a); + uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c); + + in[pc] = ssg21 + inx2 + ssg20 + inx0; + + t1 = h + bsg21 + vxandx + Kshared + in[pc]; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + +__host__ +static void sha256_round_body_host(uint32_t* in, uint32_t* state, const uint32_t* Kshared) +{ + uint32_t a = state[0]; + uint32_t b = state[1]; + uint32_t c = state[2]; + uint32_t d = state[3]; + uint32_t e = state[4]; + uint32_t f = state[5]; + uint32_t g = state[6]; + uint32_t 
h = state[7]; + + sha256_step1_host(a,b,c,d,e,f,g,h,in[0], Kshared[0]); + sha256_step1_host(h,a,b,c,d,e,f,g,in[1], Kshared[1]); + sha256_step1_host(g,h,a,b,c,d,e,f,in[2], Kshared[2]); + sha256_step1_host(f,g,h,a,b,c,d,e,in[3], Kshared[3]); + sha256_step1_host(e,f,g,h,a,b,c,d,in[4], Kshared[4]); + sha256_step1_host(d,e,f,g,h,a,b,c,in[5], Kshared[5]); + sha256_step1_host(c,d,e,f,g,h,a,b,in[6], Kshared[6]); + sha256_step1_host(b,c,d,e,f,g,h,a,in[7], Kshared[7]); + sha256_step1_host(a,b,c,d,e,f,g,h,in[8], Kshared[8]); + sha256_step1_host(h,a,b,c,d,e,f,g,in[9], Kshared[9]); + sha256_step1_host(g,h,a,b,c,d,e,f,in[10],Kshared[10]); + sha256_step1_host(f,g,h,a,b,c,d,e,in[11],Kshared[11]); + sha256_step1_host(e,f,g,h,a,b,c,d,in[12],Kshared[12]); + sha256_step1_host(d,e,f,g,h,a,b,c,in[13],Kshared[13]); + sha256_step1_host(c,d,e,f,g,h,a,b,in[14],Kshared[14]); + sha256_step1_host(b,c,d,e,f,g,h,a,in[15],Kshared[15]); + + for (int i=0; i<3; i++) + { + sha256_step2_host(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]); + sha256_step2_host(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]); + sha256_step2_host(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]); + sha256_step2_host(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]); + sha256_step2_host(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]); + sha256_step2_host(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]); + sha256_step2_host(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]); + sha256_step2_host(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]); + sha256_step2_host(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]); + sha256_step2_host(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]); + sha256_step2_host(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]); + sha256_step2_host(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]); + sha256_step2_host(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]); + sha256_step2_host(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]); + sha256_step2_host(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]); + sha256_step2_host(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]); + } + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; +} + +__device__ __forceinline__ +uint32_t xor3b(const uint32_t a, const uint32_t b, const uint32_t c) { + uint32_t result; +#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050 + asm ("lop3.b32 %0, %1, %2, %3, 0x96; // xor3b" //0x96 = 0xF0 ^ 0xCC ^ 0xAA + : "=r"(result) : "r"(a), "r"(b),"r"(c)); +#else + result = a^b^c; +#endif + return result; +} + +/* +__device__ __forceinline__ +uint32_t xor3b(const uint32_t a, const uint32_t b, const uint32_t c) { + uint32_t result; + asm("{ .reg .u32 t1; // xor3b \n\t" + "xor.b32 t1, %2, %3;\n\t" + "xor.b32 %0, %1, t1;" + "}" + : "=r"(result) : "r"(a) ,"r"(b),"r"(c)); + return result; +} +#define xor3b(a,b,c) (a ^ b ^ c) +*/ + +__device__ __forceinline__ uint32_t bsg2_0(const uint32_t x) +{ + uint32_t r1 = ROTR32(x,2); + uint32_t r2 = ROTR32(x,13); + uint32_t r3 = ROTR32(x,22); + return xor3b(r1,r2,r3); +} + +__device__ __forceinline__ uint32_t bsg2_1(const uint32_t x) +{ + uint32_t r1 = ROTR32(x,6); + uint32_t r2 = ROTR32(x,11); + uint32_t r3 = ROTR32(x,25); + return xor3b(r1,r2,r3); +} + +__device__ __forceinline__ uint32_t ssg2_0(const uint32_t x) +{ + uint64_t r1 = ROTR32(x,7); + uint64_t r2 = ROTR32(x,18); + uint64_t r3 = shr_t32(x,3); + return xor3b(r1,r2,r3); +} + +__device__ __forceinline__ uint32_t ssg2_1(const uint32_t x) +{ + uint64_t r1 = ROTR32(x,17); + uint64_t r2 = ROTR32(x,19); + uint64_t r3 = shr_t32(x,10); + return xor3b(r1,r2,r3); +} + +__device__ __forceinline__ uint32_t andor32(const uint32_t a, const 
uint32_t b, const uint32_t c) +{ + uint32_t result; + asm("{\n\t" + ".reg .u32 m,n,o;\n\t" + "and.b32 m, %1, %2;\n\t" + " or.b32 n, %1, %2;\n\t" + "and.b32 o, n, %3;\n\t" + " or.b32 %0, m, o ;\n\t" + "}\n\t" : "=r"(result) : "r"(a), "r"(b), "r"(c) + ); + return result; +} + +__device__ +static void sha2_step1(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h, + uint32_t in, const uint32_t Kshared) +{ + uint32_t t1,t2; + uint32_t vxandx = xandx(e, f, g); + uint32_t bsg21 = bsg2_1(e); + uint32_t bsg20 = bsg2_0(a); + uint32_t andorv = andor32(a,b,c); + + t1 = h + bsg21 + vxandx + Kshared + in; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + +__device__ +static void sha2_step2(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h, + uint32_t* in, uint32_t pc, const uint32_t Kshared) +{ + uint32_t t1,t2; + + int pcidx1 = (pc-2) & 0xF; + int pcidx2 = (pc-7) & 0xF; + int pcidx3 = (pc-15) & 0xF; + + uint32_t inx0 = in[pc]; + uint32_t inx1 = in[pcidx1]; + uint32_t inx2 = in[pcidx2]; + uint32_t inx3 = in[pcidx3]; + + uint32_t ssg21 = ssg2_1(inx1); + uint32_t ssg20 = ssg2_0(inx3); + uint32_t vxandx = xandx(e, f, g); + uint32_t bsg21 = bsg2_1(e); + uint32_t bsg20 = bsg2_0(a); + uint32_t andorv = andor32(a,b,c); + + in[pc] = ssg21 + inx2 + ssg20 + inx0; + + t1 = h + bsg21 + vxandx + Kshared + in[pc]; + t2 = bsg20 + andorv; + d = d + t1; + h = t1 + t2; +} + +__device__ +static void sha256_round_body(uint32_t* in, uint32_t* state, uint32_t* const Kshared) +{ + uint32_t a = state[0]; + uint32_t b = state[1]; + uint32_t c = state[2]; + uint32_t d = state[3]; + uint32_t e = state[4]; + uint32_t f = state[5]; + uint32_t g = state[6]; + uint32_t h = state[7]; + + sha2_step1(a,b,c,d,e,f,g,h,in[0], Kshared[0]); + sha2_step1(h,a,b,c,d,e,f,g,in[1], Kshared[1]); + sha2_step1(g,h,a,b,c,d,e,f,in[2], Kshared[2]); + sha2_step1(f,g,h,a,b,c,d,e,in[3], Kshared[3]); + sha2_step1(e,f,g,h,a,b,c,d,in[4], Kshared[4]); + sha2_step1(d,e,f,g,h,a,b,c,in[5], Kshared[5]); + sha2_step1(c,d,e,f,g,h,a,b,in[6], Kshared[6]); + sha2_step1(b,c,d,e,f,g,h,a,in[7], Kshared[7]); + sha2_step1(a,b,c,d,e,f,g,h,in[8], Kshared[8]); + sha2_step1(h,a,b,c,d,e,f,g,in[9], Kshared[9]); + sha2_step1(g,h,a,b,c,d,e,f,in[10],Kshared[10]); + sha2_step1(f,g,h,a,b,c,d,e,in[11],Kshared[11]); + sha2_step1(e,f,g,h,a,b,c,d,in[12],Kshared[12]); + sha2_step1(d,e,f,g,h,a,b,c,in[13],Kshared[13]); + sha2_step1(c,d,e,f,g,h,a,b,in[14],Kshared[14]); + sha2_step1(b,c,d,e,f,g,h,a,in[15],Kshared[15]); + + #pragma unroll + for (int i=0; i<3; i++) + { + sha2_step2(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]); + sha2_step2(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]); + sha2_step2(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]); + sha2_step2(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]); + sha2_step2(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]); + sha2_step2(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]); + sha2_step2(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]); + sha2_step2(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]); + sha2_step2(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]); + sha2_step2(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]); + sha2_step2(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]); + sha2_step2(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]); + sha2_step2(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]); + sha2_step2(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]); + sha2_step2(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]); + sha2_step2(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]); + } + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + 
state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; +} + +__device__ +uint64_t cuda_swab32ll(uint64_t x) { + return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x))); +} + +__global__ +/*__launch_bounds__(256,3)*/ +void lbry_sha256_gpu_hash_112(const uint32_t threads, const uint32_t startNonce, const bool swabNonce, uint64_t *outputHash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + const uint32_t nonce = startNonce + thread; + + uint32_t dat[16]; + #pragma unroll + for (int i=0;i<11;i++) dat[i] = c_dataEnd112[i]; // pre "swabed" + dat[11] = swabNonce ? cuda_swab32(nonce) : nonce; + dat[12] = 0x80000000; + dat[13] = 0; + dat[14] = 0; + dat[15] = 0x380; + + uint32_t __align__(8) buf[8]; + #pragma unroll + for (int i=0;i<8;i++) buf[i] = c_midstate112[i]; + + sha256_round_body(dat, buf, c_K); + + // output + uint2* output = (uint2*) (&outputHash[thread * 8U]); + #pragma unroll + for (int i=0;i<4;i++) { + //output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i])); + output[i] = vectorize(((uint64_t*)buf)[i]); // out without swap, new sha256 after + } + } +} + +__global__ +/*__launch_bounds__(256,3)*/ +void lbry_sha256_gpu_hash_32(uint32_t threads, uint64_t *Hash512) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t __align__(8) buf[8]; // align for vectorize + #pragma unroll + for (int i=0; i<8; i++) buf[i] = c_H256[i]; + + uint32_t* input = (uint32_t*) (&Hash512[thread * 8U]); + + uint32_t dat[16]; + #pragma unroll + //for (int i=0;i<8;i++) dat[i] = cuda_swab32(input[i]); + for (int i=0; i<8; i++) dat[i] = input[i]; + dat[8] = 0x80000000; + #pragma unroll + for (int i=9; i<15; i++) dat[i] = 0; + dat[15] = 0x100; + + sha256_round_body(dat, buf, c_K); + + // output + uint2* output = (uint2*) input; + #pragma unroll + for (int i=0;i<4;i++) { + output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i])); + } +#ifdef PAD_ZEROS + #pragma unroll + for (int i=4; i<8; i++) output[i] = vectorize(0); +#endif + } +} + +__global__ +/*__launch_bounds__(256,3)*/ +void lbry_sha256d_gpu_hash_112(const uint32_t threads, const uint32_t startNonce, const bool swabNonce, uint64_t *outputHash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + extern __shared__ uint32_t s_K[]; + //s_K[thread & 63] = c_K[thread & 63]; + if (threadIdx.x < 64U) s_K[threadIdx.x] = c_K[threadIdx.x]; + if (thread < threads) + { + const uint32_t nonce = startNonce + thread; + + uint32_t dat[16]; + #pragma unroll + for (int i=0; i<11; i++) dat[i] = c_dataEnd112[i]; + dat[11] = swabNonce ? 
cuda_swab32(nonce) : nonce; + dat[12] = 0x80000000; + dat[13] = 0; + dat[14] = 0; + dat[15] = 0x380; + + uint32_t __align__(8) buf[8]; + #pragma unroll + for (int i=0;i<8;i++) buf[i] = c_midstate112[i]; + + sha256_round_body(dat, buf, s_K); + + // second sha256 + + #pragma unroll + for (int i=0; i<8; i++) dat[i] = buf[i]; + dat[8] = 0x80000000; + #pragma unroll + for (int i=9; i<15; i++) dat[i] = 0; + dat[15] = 0x100; + + #pragma unroll + for (int i=0; i<8; i++) buf[i] = c_H256[i]; + + sha256_round_body(dat, buf, s_K); + + // output + uint2* output = (uint2*) (&outputHash[thread * 8U]); + #pragma unroll + for (int i=0;i<4;i++) { + output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i])); + //output[i] = vectorize(((uint64_t*)buf)[i]); + } + } +} + +__global__ +/*__launch_bounds__(256,3)*/ +void lbry_sha256_gpu_hash_20x2(uint32_t threads, uint64_t *Hash512) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t __align__(8) buf[8]; // align for vectorize + #pragma unroll + for (int i=0;i<8;i++) buf[i] = c_H256[i]; + + uint32_t* input = (uint32_t*) (&Hash512[thread * 8U]); + + uint32_t dat[16]; + #pragma unroll + for (int i=0;i<5;i++) dat[i] = cuda_swab32(input[i]); + #pragma unroll + for (int i=0;i<5;i++) dat[i+5] = cuda_swab32(input[i+8]); + dat[10] = 0x80000000; + #pragma unroll + for (int i=11;i<15;i++) dat[i] = 0; + dat[15] = 0x140; + + sha256_round_body(dat, buf, c_K); + + // output + uint2* output = (uint2*) input; + #pragma unroll + for (int i=0;i<4;i++) { + //output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i])); + output[i] = vectorize(((uint64_t*)buf)[i]); + } +#ifdef PAD_ZEROS + #pragma unroll + for (int i=4; i<8; i++) output[i] = vectorize(0); +#endif + } +} + +__global__ +/*__launch_bounds__(256,3)*/ +void lbry_sha256d_gpu_hash_20x2(uint32_t threads, uint64_t *Hash512) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + extern __shared__ uint32_t s_K[]; + if (threadIdx.x < 64U) s_K[threadIdx.x] = c_K[threadIdx.x]; + if (thread < threads) + { + uint32_t __align__(8) buf[8]; // align for vectorize + #pragma unroll + for (int i=0;i<8;i++) buf[i] = c_H256[i]; + + uint32_t* input = (uint32_t*) (&Hash512[thread * 8U]); + + uint32_t dat[16]; + #pragma unroll + for (int i=0; i<5; i++) dat[i] = cuda_swab32(input[i]); + #pragma unroll + for (int i=0; i<5; i++) dat[i+5] = cuda_swab32(input[i+8]); + dat[10] = 0x80000000; + #pragma unroll + for (int i=11;i<15;i++) dat[i] = 0; + dat[15] = 0x140; + + sha256_round_body(dat, buf, s_K); + + // second sha256 + + #pragma unroll + for (int i=0; i<8; i++) dat[i] = buf[i]; + dat[8] = 0x80000000; + #pragma unroll + for (int i=9; i<15; i++) dat[i] = 0; + dat[15] = 0x100; + + #pragma unroll + for (int i=0;i<8;i++) buf[i] = c_H256[i]; + + sha256_round_body(dat, buf, s_K); + + // output + uint2* output = (uint2*) input; + +#ifdef FULL_HASH + #pragma unroll + for (int i=0;i<4;i++) { + output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i])); + //output[i] = vectorize(((uint64_t*)buf)[i]); + } +# ifdef PAD_ZEROS + #pragma unroll + for (int i=4; i<8; i++) output[i] = vectorize(0); +# endif + +#else + //input[6] = cuda_swab32(buf[6]); + //input[7] = cuda_swab32(buf[7]); + output[3] = vectorize(cuda_swab32ll(((uint64_t*)buf)[3])); +#endif + } +} + +__host__ +void lbry_sha256_init(int thr_id) +{ + //cudaMemcpyToSymbol(c_H256, cpu_H256, sizeof(cpu_H256), 0, cudaMemcpyHostToDevice); + cudaMemcpyToSymbol(c_K, cpu_K, sizeof(cpu_K), 0, cudaMemcpyHostToDevice); + 
CUDA_SAFE_CALL(cudaMalloc(&d_resNonces, 4*sizeof(uint32_t))); +} + +__host__ +void lbry_sha256_free(int thr_id) +{ + cudaFree(d_resNonces); +} + +__host__ +void lbry_sha256_setBlock_112(uint32_t *pdata, uint32_t *ptarget) +{ + uint32_t in[16], buf[8], end[11]; + for (int i=0;i<16;i++) in[i] = cuda_swab32(pdata[i]); + for (int i=0; i<8;i++) buf[i] = cpu_H256[i]; + for (int i=0;i<11;i++) end[i] = cuda_swab32(pdata[16+i]); + sha256_round_body_host(in, buf, cpu_K); + + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_midstate112, buf, 32, 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_dataEnd112, end, sizeof(end), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_target, &ptarget[6], sizeof(uint64_t), 0, cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &ptarget[6], sizeof(uint64_t), 0, cudaMemcpyHostToDevice)); +} + +__host__ +void lbry_sha256_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash, bool swabNonce, cudaStream_t stream) +{ + const int threadsperblock = 256; + + dim3 grid(threads/threadsperblock); + dim3 block(threadsperblock); + + lbry_sha256_gpu_hash_112 <<>> (threads, startNonce, swabNonce, (uint64_t*) d_outputHash); + cudaGetLastError(); +} + +__host__ +void lbry_sha256_hash_32(int thr_id, uint32_t threads, uint32_t *d_Hash, cudaStream_t stream) +{ + const int threadsperblock = 256; + + dim3 grid(threads/threadsperblock); + dim3 block(threadsperblock); + + lbry_sha256_gpu_hash_32 <<>> (threads, (uint64_t*) d_Hash); +} + +__host__ +void lbry_sha256d_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash, bool swabNonce, cudaStream_t stream) +{ + const int threadsperblock = 256; + + dim3 grid(threads/threadsperblock); + dim3 block(threadsperblock); + + lbry_sha256d_gpu_hash_112 <<>> (threads, startNonce, swabNonce, (uint64_t*) d_outputHash); +} + +__host__ +void lbry_sha256_hash_20x2(int thr_id, uint32_t threads, uint32_t *d_Hash, cudaStream_t stream) +{ + const int threadsperblock = 256; + + dim3 grid(threads/threadsperblock); + dim3 block(threadsperblock); + + lbry_sha256_gpu_hash_20x2 <<>> (threads, (uint64_t*) d_Hash); +} + +__host__ +void lbry_sha256d_hash_20x2(int thr_id, uint32_t threads, uint32_t *d_Hash, cudaStream_t stream) +{ + const int threadsperblock = 256; + + dim3 grid(threads/threadsperblock); + dim3 block(threadsperblock); + + lbry_sha256d_gpu_hash_20x2 <<>> (threads, (uint64_t*) d_Hash); +} + +__global__ +__launch_bounds__(256,3) +void lbry_sha256d_gpu_hash_final(const uint32_t threads, const uint32_t startNonce, uint64_t *Hash512, uint32_t *resNonces) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t __align__(8) buf[8]; // align for vectorize + #pragma unroll + for (int i=0;i<8;i++) buf[i] = c_H256[i]; + + uint32_t* input = (uint32_t*) (&Hash512[thread * 8U]); + + uint32_t __align__(8) dat[16]; + #pragma unroll + for (int i=0;i<5;i++) dat[i] = cuda_swab32(input[i]); + #pragma unroll + for (int i=0;i<5;i++) dat[i+5] = cuda_swab32(input[i+8]); + dat[10] = 0x80000000; + #pragma unroll + for (int i=11;i<15;i++) dat[i] = 0; + dat[15] = 0x140; + + sha256_round_body(dat, buf, c_K); + + // second sha256 + + #pragma unroll + for (int i=0;i<8;i++) dat[i] = buf[i]; + dat[8] = 0x80000000; + #pragma unroll + for (int i=9;i<15;i++) dat[i] = 0; + dat[15] = 0x100; + + #pragma unroll + for (int i=0;i<8;i++) buf[i] = c_H256[i]; + + sha256_round_body(dat, buf, c_K); + + // valid nonces + uint64_t high = 
cuda_swab32ll(((uint64_t*)buf)[3]); + if (high <= d_target[0]) { + // printf("%08x %08x - %016llx %016llx - %08x %08x\n", buf[7], buf[6], high, d_target[0], c_target[1], c_target[0]); + uint32_t nonce = startNonce + thread; + resNonces[1] = atomicExch(resNonces, nonce); + d_target[0] = high; + } + } +} + +__host__ +void lbry_sha256d_hash_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_inputHash, uint32_t *resNonces, cudaStream_t stream) +{ + const int threadsperblock = 256; + + dim3 grid(threads/threadsperblock); + dim3 block(threadsperblock); + + CUDA_SAFE_CALL(cudaMemset(d_resNonces, 0xFF, 2 * sizeof(uint32_t))); + cudaThreadSynchronize(); + + lbry_sha256d_gpu_hash_final <<>> (threads, startNonce, (uint64_t*) d_inputHash, d_resNonces); + + cudaThreadSynchronize(); + + CUDA_SAFE_CALL(cudaMemcpy(resNonces, d_resNonces, 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); + if (resNonces[0] == resNonces[1]) { + resNonces[1] = UINT32_MAX; + } +} \ No newline at end of file diff --git a/lbry/cuda_sha512_lbry.cu b/lbry/cuda_sha512_lbry.cu new file mode 100644 index 0000000..39f549b --- /dev/null +++ b/lbry/cuda_sha512_lbry.cu @@ -0,0 +1,181 @@ +/** + * sha-512 CUDA implementation. + */ + +#include +#include +#include + +#include + +static __constant__ uint64_t K_512[80]; + +static const uint64_t K512[80] = { + 0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC, + 0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118, + 0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2, + 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694, + 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65, + 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5, + 0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4, + 0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70, + 0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF, + 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B, + 0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30, + 0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8, + 0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8, + 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3, + 0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC, + 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B, + 0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178, + 0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B, + 0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C, + 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817 +}; + +//#undef xor3 +//#define xor3(a,b,c) (a^b^c) + +//#undef + +static __device__ __forceinline__ +uint64_t bsg5_0(const uint64_t x) +{ + uint64_t r1 = ROTR64(x,28); + uint64_t r2 = ROTR64(x,34); + uint64_t r3 = ROTR64(x,39); + return xor3(r1,r2,r3); +} + +static __device__ __forceinline__ +uint64_t bsg5_1(const uint64_t x) +{ + uint64_t r1 = ROTR64(x,14); + uint64_t r2 = ROTR64(x,18); + uint64_t r3 = ROTR64(x,41); + return xor3(r1,r2,r3); +} + +static __device__ __forceinline__ +uint64_t ssg5_0(const uint64_t x) +{ + uint64_t r1 = ROTR64(x,1); + uint64_t r2 = 
ROTR64(x,8); + uint64_t r3 = shr_t64(x,7); + return xor3(r1,r2,r3); +} + +static __device__ __forceinline__ +uint64_t ssg5_1(const uint64_t x) +{ + uint64_t r1 = ROTR64(x,19); + uint64_t r2 = ROTR64(x,61); + uint64_t r3 = shr_t64(x,6); + return xor3(r1,r2,r3); +} + +static __device__ __forceinline__ +uint64_t xandx64(const uint64_t a, const uint64_t b, const uint64_t c) +{ + uint64_t result; + asm("{ .reg .u64 m,n; // xandx64\n\t" + "xor.b64 m, %2,%3;\n\t" + "and.b64 n, m,%1;\n\t" + "xor.b64 %0, n,%3;\n\t" + "}" : "=l"(result) : "l"(a), "l"(b), "l"(c)); + return result; +} + +static __device__ __forceinline__ +void sha512_step2(uint64_t* r, uint64_t* W, uint64_t* K, const int ord, int i) +{ + int u = 8-ord; + uint64_t a = r[(0+u) & 7]; + uint64_t b = r[(1+u) & 7]; + uint64_t c = r[(2+u) & 7]; + uint64_t d = r[(3+u) & 7]; + uint64_t e = r[(4+u) & 7]; + uint64_t f = r[(5+u) & 7]; + uint64_t g = r[(6+u) & 7]; + uint64_t h = r[(7+u) & 7]; + + uint64_t T1 = h + bsg5_1(e) + xandx64(e,f,g) + W[i] + K[i]; + uint64_t T2 = bsg5_0(a) + andor(a,b,c); + r[(3+u)& 7] = d + T1; + r[(7+u)& 7] = T1 + T2; +} + +/**************************************************************************************************/ + +__global__ +void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint64_t *pHash = &g_hash[thread * 8U]; + + uint64_t W[80]; + uint64_t r[8]; + + uint64_t IV512[8] = { + 0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1, + 0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179 + }; + + #pragma unroll + for (int i = 0; i < 8; i++) + r[i] = IV512[i]; + + #pragma unroll + for (int i = 0; i < 4; i++) { + // 32 bytes input + W[i] = cuda_swab64(pHash[i]); + } + + W[4] = 0x8000000000000000; // end tag + + #pragma unroll + for (int i = 5; i < 15; i++) W[i] = 0; + + W[15] = 0x100; // 256 bits + + #pragma unroll + for (int i = 16; i < 80; i++) W[i] = 0; + + #pragma unroll 64 + for (int i = 16; i < 80; i++) + W[i] = ssg5_1(W[i - 2]) + W[i - 7] + ssg5_0(W[i - 15]) + W[i - 16]; + + #pragma unroll 10 + for (int i = 0; i < 10; i++) { + #pragma unroll 8 + for (int ord=0; ord<8; ord++) + sha512_step2(r, W, K_512, ord, 8*i + ord); + } + + #pragma unroll 8 + for (int i = 0; i < 8; i++) + pHash[i] = cuda_swab64(r[i] + IV512[i]); + } +} + +__host__ +void lbry_sha512_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash, cudaStream_t stream) +{ + const int threadsperblock = 256; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + size_t shared_size = 80*8; + lbry_sha512_gpu_hash_32 <<>> (threads, (uint64_t*)d_hash); +} + +/**************************************************************************************************/ + +__host__ +void lbry_sha512_init(int thr_id) +{ + cudaMemcpyToSymbol(K_512, K512, 80*sizeof(uint64_t), 0, cudaMemcpyHostToDevice); +} diff --git a/lbry/lbry.cu b/lbry/lbry.cu new file mode 100644 index 0000000..f5da268 --- /dev/null +++ b/lbry/lbry.cu @@ -0,0 +1,225 @@ +/** + * Lbry CUDA Implementation + * + * by tpruvot@github - July 2016 + * + */ + +#include +#include + +extern "C" { +#include +#include +} + +#include +#include + +#define A 64 +#define debug_cpu 0 + +extern "C" void lbry_hash(void* output, const void* input) +{ + uint32_t _ALIGN(A) hashA[16]; + uint32_t _ALIGN(A) hashB[8]; + uint32_t _ALIGN(A) hashC[8]; + + sph_sha256_context ctx_sha256; + sph_sha512_context ctx_sha512; 
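+	// CPU reference of the LBC chain: sha256d(112-byte header) -> sha512 ->
+	// ripemd160 on each 32-byte half -> sha256d of the two 20-byte digests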
+ sph_ripemd160_context ctx_ripemd; + + sph_sha256_init(&ctx_sha256); + sph_sha256(&ctx_sha256, input, 112); + sph_sha256_close(&ctx_sha256, hashA); + + sph_sha256(&ctx_sha256, hashA, 32); + sph_sha256_close(&ctx_sha256, hashA); + + sph_sha512_init(&ctx_sha512); + sph_sha512(&ctx_sha512, hashA, 32); + sph_sha512_close(&ctx_sha512, hashA); + + sph_ripemd160_init(&ctx_ripemd); + sph_ripemd160(&ctx_ripemd, hashA, 32); // sha512 low + sph_ripemd160_close(&ctx_ripemd, hashB); + if (debug_cpu) applog_hex(hashB, 20); + + sph_ripemd160(&ctx_ripemd, &hashA[8], 32); // sha512 high + sph_ripemd160_close(&ctx_ripemd, hashC); + if (debug_cpu) applog_hex(hashC, 20); + + sph_sha256(&ctx_sha256, hashB, 20); + sph_sha256(&ctx_sha256, hashC, 20); + sph_sha256_close(&ctx_sha256, hashA); + if (debug_cpu) applog_hex(hashA,32); + + sph_sha256(&ctx_sha256, hashA, 32); + sph_sha256_close(&ctx_sha256, hashA); + + memcpy(output, hashA, 32); +} + +/* ############################################################################################################################### */ + +extern void lbry_ripemd160_init(int thr_id); +extern void lbry_sha256_init(int thr_id); +extern void lbry_sha256_free(int thr_id); +extern void lbry_sha256_setBlock_112(uint32_t *pdata, uint32_t *ptarget); +extern void lbry_sha256_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash, bool swabNonce, cudaStream_t stream); +extern void lbry_sha256d_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash, bool swabNonce, cudaStream_t stream); +extern void lbry_sha256_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash, cudaStream_t stream); +extern void lbry_sha512_init(int thr_id); +extern void lbry_sha512_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash, cudaStream_t stream); +extern void lbry_ripemd160_hash_32x2(int thr_id, uint32_t threads, uint32_t *g_Hash, cudaStream_t stream); +extern void lbry_sha256_hash_20x2(int thr_id, uint32_t threads, uint32_t *g_Hash, cudaStream_t stream); +extern void lbry_sha256d_hash_20x2(int thr_id, uint32_t threads, uint32_t *g_Hash, cudaStream_t stream); +extern void lbry_sha256d_hash_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_inputHash, uint32_t *resNonces, cudaStream_t stream); + +static __inline uint32_t swab32_if(uint32_t val, bool iftrue) { + return iftrue ? swab32(val) : val; +} + +static bool init[MAX_GPUS] = { 0 }; + +static uint32_t *d_hash[MAX_GPUS]; + +// nonce position is different +#define LBC_NONCE_OFT32 27 + +extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t _ALIGN(A) vhash[8]; + uint32_t _ALIGN(A) endiandata[28]; + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + + const uint32_t first_nonce = pdata[LBC_NONCE_OFT32]; + const int swap = 0; // to toggle nonce endian + + const int dev_id = device_map[thr_id]; + int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 
22 : 20; + if (device_sm[dev_id] >= 600) intensity = 23; + if (device_sm[dev_id] < 350) intensity = 18; + + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (opt_benchmark) { + ptarget[7] = 0xff; + } + + if (!init[thr_id]){ + cudaSetDevice(dev_id); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage (linux) + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + CUDA_LOG_ERROR(); + } + + CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput)); + + lbry_sha256_init(thr_id); + lbry_sha512_init(thr_id); + lbry_ripemd160_init(thr_id); + cuda_check_cpu_init(thr_id, throughput); + CUDA_LOG_ERROR(); + + init[thr_id] = true; + } + + for (int i=0; i < LBC_NONCE_OFT32; i++) { + be32enc(&endiandata[i], pdata[i]); + } + + lbry_sha256_setBlock_112(endiandata, ptarget); + cuda_check_cpu_setTarget(ptarget); + + do { + + // Hash with CUDA + #if 0 + lbry_sha256_hash_112(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id], swap, 0); + lbry_sha256_hash_32(thr_id, throughput, d_hash[thr_id], 0); + #else + lbry_sha256d_hash_112(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id], swap, 0); + #endif + CUDA_LOG_ERROR(); + + lbry_sha512_hash_32(thr_id, throughput, d_hash[thr_id], 0); + + lbry_ripemd160_hash_32x2(thr_id, throughput, d_hash[thr_id], 0); + + #if 0 + lbry_sha256d_hash_20x2(thr_id, throughput, d_hash[thr_id], 0); + uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id]); + #else + uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX }; + lbry_sha256d_hash_final(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id], resNonces, 0); + uint32_t foundNonce = resNonces[0]; + #endif + + *hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce + throughput; + + if (foundNonce != UINT32_MAX) + { + endiandata[LBC_NONCE_OFT32] = swab32_if(foundNonce, !swap); + lbry_hash(vhash, endiandata); + + if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) { + int res = 1; + uint32_t secNonce = resNonces[1]; + //uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id], 1); + work->nonces[0] = swab32_if(foundNonce, swap); + work_set_target_ratio(work, vhash); + if (secNonce != UINT32_MAX) { + //if (secNonce) { + if (opt_debug) + gpulog(LOG_BLUE, thr_id, "found second nonce %08x", swab32(secNonce)); + endiandata[LBC_NONCE_OFT32] = swab32_if(secNonce, !swap); + lbry_hash(vhash, endiandata); + if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio) { + work_set_target_ratio(work, vhash); + xchg(work->nonces[0], work->nonces[1]); + } + work->nonces[1] = swab32_if(secNonce, swap); + res++; + } + pdata[LBC_NONCE_OFT32] = work->nonces[0]; + return res; + } else { + gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU %08x > %08x!", foundNonce, vhash[7], ptarget[7]); + } + } + + if ((uint64_t) throughput + pdata[LBC_NONCE_OFT32] >= max_nonce) { + pdata[LBC_NONCE_OFT32] = max_nonce; + break; + } + + pdata[LBC_NONCE_OFT32] += throughput; + + } while (!work_restart[thr_id].restart); + + //*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce; + + return 0; +} + +// cleanup +void free_lbry(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + lbry_sha256_free(thr_id); + + cuda_check_cpu_free(thr_id); + init[thr_id] = false; + + cudaDeviceSynchronize(); +} diff --git a/miner.h b/miner.h index 6bbb3a8..fa8335a 
100644 --- a/miner.h +++ b/miner.h @@ -275,6 +275,7 @@ extern int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, extern int scanhash_groestlcoin(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_heavy(int thr_id,struct work *work, uint32_t max_nonce, unsigned long *hashes_done, uint32_t maxvote, int blocklen); extern int scanhash_jackpot(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_lyra2v2(int thr_id,struct work* work, uint32_t max_nonce, unsigned long *hashes_done); @@ -584,6 +585,7 @@ struct stratum_job { unsigned char version[4]; unsigned char nbits[4]; unsigned char ntime[4]; + unsigned char claim[32]; // lbry bool clean; unsigned char nreward[2]; uint32_t height; @@ -797,6 +799,7 @@ void heavycoin_hash(unsigned char* output, const unsigned char* input, int len); void keccak256_hash(void *state, const void *input); unsigned int jackpothash(void *state, const void *input); void groestlhash(void *state, const void *input); +void lbry_hash(void *output, const void *input); void lyra2re_hash(void *state, const void *input); void lyra2v2_hash(void *state, const void *input); void myriadhash(void *state, const void *input); diff --git a/sph/ripemd.c b/sph/ripemd.c new file mode 100644 index 0000000..dd12b1f --- /dev/null +++ b/sph/ripemd.c @@ -0,0 +1,833 @@ +/* $Id: ripemd.c 216 2010-06-08 09:46:57Z tp $ */ +/* + * RIPEMD-160 implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include + +#include "sph_ripemd.h" + +/* + * Round functions for RIPEMD (original). 
+ */ +#define F(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) +#define G(x, y, z) (((x) & (y)) | (((x) | (y)) & (z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) + +static const sph_u32 oIV[5] = { + SPH_C32(0x67452301), SPH_C32(0xEFCDAB89), + SPH_C32(0x98BADCFE), SPH_C32(0x10325476) +}; + +/* + * Round functions for RIPEMD-128 and RIPEMD-160. + */ +#define F1(x, y, z) ((x) ^ (y) ^ (z)) +#define F2(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) +#define F3(x, y, z) (((x) | ~(y)) ^ (z)) +#define F4(x, y, z) ((((x) ^ (y)) & (z)) ^ (y)) +#define F5(x, y, z) ((x) ^ ((y) | ~(z))) + +static const sph_u32 IV[5] = { + SPH_C32(0x67452301), SPH_C32(0xEFCDAB89), SPH_C32(0x98BADCFE), + SPH_C32(0x10325476), SPH_C32(0xC3D2E1F0) +}; + +#define ROTL SPH_ROTL32 + +/* ===================================================================== */ +/* + * RIPEMD (original hash, deprecated). + */ + +#define FF1(A, B, C, D, X, s) do { \ + sph_u32 tmp = SPH_T32((A) + F(B, C, D) + (X)); \ + (A) = ROTL(tmp, (s)); \ + } while (0) + +#define GG1(A, B, C, D, X, s) do { \ + sph_u32 tmp = SPH_T32((A) + G(B, C, D) \ + + (X) + SPH_C32(0x5A827999)); \ + (A) = ROTL(tmp, (s)); \ + } while (0) + +#define HH1(A, B, C, D, X, s) do { \ + sph_u32 tmp = SPH_T32((A) + H(B, C, D) \ + + (X) + SPH_C32(0x6ED9EBA1)); \ + (A) = ROTL(tmp, (s)); \ + } while (0) + +#define FF2(A, B, C, D, X, s) do { \ + sph_u32 tmp = SPH_T32((A) + F(B, C, D) \ + + (X) + SPH_C32(0x50A28BE6)); \ + (A) = ROTL(tmp, (s)); \ + } while (0) + +#define GG2(A, B, C, D, X, s) do { \ + sph_u32 tmp = SPH_T32((A) + G(B, C, D) + (X)); \ + (A) = ROTL(tmp, (s)); \ + } while (0) + +#define HH2(A, B, C, D, X, s) do { \ + sph_u32 tmp = SPH_T32((A) + H(B, C, D) \ + + (X) + SPH_C32(0x5C4DD124)); \ + (A) = ROTL(tmp, (s)); \ + } while (0) + +#define RIPEMD_ROUND_BODY(in, h) do { \ + sph_u32 A1, B1, C1, D1; \ + sph_u32 A2, B2, C2, D2; \ + sph_u32 tmp; \ + \ + A1 = A2 = (h)[0]; \ + B1 = B2 = (h)[1]; \ + C1 = C2 = (h)[2]; \ + D1 = D2 = (h)[3]; \ + \ + FF1(A1, B1, C1, D1, in( 0), 11); \ + FF1(D1, A1, B1, C1, in( 1), 14); \ + FF1(C1, D1, A1, B1, in( 2), 15); \ + FF1(B1, C1, D1, A1, in( 3), 12); \ + FF1(A1, B1, C1, D1, in( 4), 5); \ + FF1(D1, A1, B1, C1, in( 5), 8); \ + FF1(C1, D1, A1, B1, in( 6), 7); \ + FF1(B1, C1, D1, A1, in( 7), 9); \ + FF1(A1, B1, C1, D1, in( 8), 11); \ + FF1(D1, A1, B1, C1, in( 9), 13); \ + FF1(C1, D1, A1, B1, in(10), 14); \ + FF1(B1, C1, D1, A1, in(11), 15); \ + FF1(A1, B1, C1, D1, in(12), 6); \ + FF1(D1, A1, B1, C1, in(13), 7); \ + FF1(C1, D1, A1, B1, in(14), 9); \ + FF1(B1, C1, D1, A1, in(15), 8); \ + \ + GG1(A1, B1, C1, D1, in( 7), 7); \ + GG1(D1, A1, B1, C1, in( 4), 6); \ + GG1(C1, D1, A1, B1, in(13), 8); \ + GG1(B1, C1, D1, A1, in( 1), 13); \ + GG1(A1, B1, C1, D1, in(10), 11); \ + GG1(D1, A1, B1, C1, in( 6), 9); \ + GG1(C1, D1, A1, B1, in(15), 7); \ + GG1(B1, C1, D1, A1, in( 3), 15); \ + GG1(A1, B1, C1, D1, in(12), 7); \ + GG1(D1, A1, B1, C1, in( 0), 12); \ + GG1(C1, D1, A1, B1, in( 9), 15); \ + GG1(B1, C1, D1, A1, in( 5), 9); \ + GG1(A1, B1, C1, D1, in(14), 7); \ + GG1(D1, A1, B1, C1, in( 2), 11); \ + GG1(C1, D1, A1, B1, in(11), 13); \ + GG1(B1, C1, D1, A1, in( 8), 12); \ + \ + HH1(A1, B1, C1, D1, in( 3), 11); \ + HH1(D1, A1, B1, C1, in(10), 13); \ + HH1(C1, D1, A1, B1, in( 2), 14); \ + HH1(B1, C1, D1, A1, in( 4), 7); \ + HH1(A1, B1, C1, D1, in( 9), 14); \ + HH1(D1, A1, B1, C1, in(15), 9); \ + HH1(C1, D1, A1, B1, in( 8), 13); \ + HH1(B1, C1, D1, A1, in( 1), 15); \ + HH1(A1, B1, C1, D1, in(14), 6); \ + HH1(D1, A1, B1, C1, in( 7), 8); \ + HH1(C1, D1, A1, B1, in( 0), 13); \ + HH1(B1, C1, 
D1, A1, in( 6), 6); \ + HH1(A1, B1, C1, D1, in(11), 12); \ + HH1(D1, A1, B1, C1, in(13), 5); \ + HH1(C1, D1, A1, B1, in( 5), 7); \ + HH1(B1, C1, D1, A1, in(12), 5); \ + \ + FF2(A2, B2, C2, D2, in( 0), 11); \ + FF2(D2, A2, B2, C2, in( 1), 14); \ + FF2(C2, D2, A2, B2, in( 2), 15); \ + FF2(B2, C2, D2, A2, in( 3), 12); \ + FF2(A2, B2, C2, D2, in( 4), 5); \ + FF2(D2, A2, B2, C2, in( 5), 8); \ + FF2(C2, D2, A2, B2, in( 6), 7); \ + FF2(B2, C2, D2, A2, in( 7), 9); \ + FF2(A2, B2, C2, D2, in( 8), 11); \ + FF2(D2, A2, B2, C2, in( 9), 13); \ + FF2(C2, D2, A2, B2, in(10), 14); \ + FF2(B2, C2, D2, A2, in(11), 15); \ + FF2(A2, B2, C2, D2, in(12), 6); \ + FF2(D2, A2, B2, C2, in(13), 7); \ + FF2(C2, D2, A2, B2, in(14), 9); \ + FF2(B2, C2, D2, A2, in(15), 8); \ + \ + GG2(A2, B2, C2, D2, in( 7), 7); \ + GG2(D2, A2, B2, C2, in( 4), 6); \ + GG2(C2, D2, A2, B2, in(13), 8); \ + GG2(B2, C2, D2, A2, in( 1), 13); \ + GG2(A2, B2, C2, D2, in(10), 11); \ + GG2(D2, A2, B2, C2, in( 6), 9); \ + GG2(C2, D2, A2, B2, in(15), 7); \ + GG2(B2, C2, D2, A2, in( 3), 15); \ + GG2(A2, B2, C2, D2, in(12), 7); \ + GG2(D2, A2, B2, C2, in( 0), 12); \ + GG2(C2, D2, A2, B2, in( 9), 15); \ + GG2(B2, C2, D2, A2, in( 5), 9); \ + GG2(A2, B2, C2, D2, in(14), 7); \ + GG2(D2, A2, B2, C2, in( 2), 11); \ + GG2(C2, D2, A2, B2, in(11), 13); \ + GG2(B2, C2, D2, A2, in( 8), 12); \ + \ + HH2(A2, B2, C2, D2, in( 3), 11); \ + HH2(D2, A2, B2, C2, in(10), 13); \ + HH2(C2, D2, A2, B2, in( 2), 14); \ + HH2(B2, C2, D2, A2, in( 4), 7); \ + HH2(A2, B2, C2, D2, in( 9), 14); \ + HH2(D2, A2, B2, C2, in(15), 9); \ + HH2(C2, D2, A2, B2, in( 8), 13); \ + HH2(B2, C2, D2, A2, in( 1), 15); \ + HH2(A2, B2, C2, D2, in(14), 6); \ + HH2(D2, A2, B2, C2, in( 7), 8); \ + HH2(C2, D2, A2, B2, in( 0), 13); \ + HH2(B2, C2, D2, A2, in( 6), 6); \ + HH2(A2, B2, C2, D2, in(11), 12); \ + HH2(D2, A2, B2, C2, in(13), 5); \ + HH2(C2, D2, A2, B2, in( 5), 7); \ + HH2(B2, C2, D2, A2, in(12), 5); \ + \ + tmp = SPH_T32((h)[1] + C1 + D2); \ + (h)[1] = SPH_T32((h)[2] + D1 + A2); \ + (h)[2] = SPH_T32((h)[3] + A1 + B2); \ + (h)[3] = SPH_T32((h)[0] + B1 + C2); \ + (h)[0] = tmp; \ + } while (0) + +/* + * One round of RIPEMD. The data must be aligned for 32-bit access. + */ +static void +ripemd_round(const unsigned char *data, sph_u32 r[5]) +{ +#if SPH_LITTLE_FAST + +#define RIPEMD_IN(x) sph_dec32le_aligned(data + (4 * (x))) + +#else + + sph_u32 X_var[16]; + int i; + + for (i = 0; i < 16; i ++) + X_var[i] = sph_dec32le_aligned(data + 4 * i); +#define RIPEMD_IN(x) X_var[x] + +#endif + RIPEMD_ROUND_BODY(RIPEMD_IN, r); +#undef RIPEMD_IN +} + +/* see sph_ripemd.h */ +void +sph_ripemd_init(void *cc) +{ + sph_ripemd_context *sc; + + sc = cc; + memcpy(sc->val, oIV, sizeof sc->val); +#if SPH_64 + sc->count = 0; +#else + sc->count_high = sc->count_low = 0; +#endif +} + +#define RFUN ripemd_round +#define HASH ripemd +#define LE32 1 +#include "md_helper.c" +#undef RFUN +#undef HASH +#undef LE32 + +/* see sph_ripemd.h */ +void +sph_ripemd_close(void *cc, void *dst) +{ + ripemd_close(cc, dst, 4); + sph_ripemd_init(cc); +} + +/* see sph_ripemd.h */ +void +sph_ripemd_comp(const sph_u32 msg[16], sph_u32 val[4]) +{ +#define RIPEMD_IN(x) msg[x] + RIPEMD_ROUND_BODY(RIPEMD_IN, val); +#undef RIPEMD_IN +} + +/* ===================================================================== */ +/* + * RIPEMD-128. + */ + +/* + * Round constants for RIPEMD-128. 
+ */ +#define sK11 SPH_C32(0x00000000) +#define sK12 SPH_C32(0x5A827999) +#define sK13 SPH_C32(0x6ED9EBA1) +#define sK14 SPH_C32(0x8F1BBCDC) + +#define sK21 SPH_C32(0x50A28BE6) +#define sK22 SPH_C32(0x5C4DD124) +#define sK23 SPH_C32(0x6D703EF3) +#define sK24 SPH_C32(0x00000000) + +#define sRR(a, b, c, d, f, s, r, k) do { \ + a = ROTL(SPH_T32(a + f(b, c, d) + r + k), s); \ + } while (0) + +#define sROUND1(a, b, c, d, f, s, r, k) \ + sRR(a ## 1, b ## 1, c ## 1, d ## 1, f, s, r, sK1 ## k) + +#define sROUND2(a, b, c, d, f, s, r, k) \ + sRR(a ## 2, b ## 2, c ## 2, d ## 2, f, s, r, sK2 ## k) + +/* + * This macro defines the body for a RIPEMD-128 compression function + * implementation. The "in" parameter should evaluate, when applied to a + * numerical input parameter from 0 to 15, to an expression which yields + * the corresponding input block. The "h" parameter should evaluate to + * an array or pointer expression designating the array of 4 words which + * contains the input and output of the compression function. + */ + +#define RIPEMD128_ROUND_BODY(in, h) do { \ + sph_u32 A1, B1, C1, D1; \ + sph_u32 A2, B2, C2, D2; \ + sph_u32 tmp; \ + \ + A1 = A2 = (h)[0]; \ + B1 = B2 = (h)[1]; \ + C1 = C2 = (h)[2]; \ + D1 = D2 = (h)[3]; \ + \ + sROUND1(A, B, C, D, F1, 11, in( 0), 1); \ + sROUND1(D, A, B, C, F1, 14, in( 1), 1); \ + sROUND1(C, D, A, B, F1, 15, in( 2), 1); \ + sROUND1(B, C, D, A, F1, 12, in( 3), 1); \ + sROUND1(A, B, C, D, F1, 5, in( 4), 1); \ + sROUND1(D, A, B, C, F1, 8, in( 5), 1); \ + sROUND1(C, D, A, B, F1, 7, in( 6), 1); \ + sROUND1(B, C, D, A, F1, 9, in( 7), 1); \ + sROUND1(A, B, C, D, F1, 11, in( 8), 1); \ + sROUND1(D, A, B, C, F1, 13, in( 9), 1); \ + sROUND1(C, D, A, B, F1, 14, in(10), 1); \ + sROUND1(B, C, D, A, F1, 15, in(11), 1); \ + sROUND1(A, B, C, D, F1, 6, in(12), 1); \ + sROUND1(D, A, B, C, F1, 7, in(13), 1); \ + sROUND1(C, D, A, B, F1, 9, in(14), 1); \ + sROUND1(B, C, D, A, F1, 8, in(15), 1); \ + \ + sROUND1(A, B, C, D, F2, 7, in( 7), 2); \ + sROUND1(D, A, B, C, F2, 6, in( 4), 2); \ + sROUND1(C, D, A, B, F2, 8, in(13), 2); \ + sROUND1(B, C, D, A, F2, 13, in( 1), 2); \ + sROUND1(A, B, C, D, F2, 11, in(10), 2); \ + sROUND1(D, A, B, C, F2, 9, in( 6), 2); \ + sROUND1(C, D, A, B, F2, 7, in(15), 2); \ + sROUND1(B, C, D, A, F2, 15, in( 3), 2); \ + sROUND1(A, B, C, D, F2, 7, in(12), 2); \ + sROUND1(D, A, B, C, F2, 12, in( 0), 2); \ + sROUND1(C, D, A, B, F2, 15, in( 9), 2); \ + sROUND1(B, C, D, A, F2, 9, in( 5), 2); \ + sROUND1(A, B, C, D, F2, 11, in( 2), 2); \ + sROUND1(D, A, B, C, F2, 7, in(14), 2); \ + sROUND1(C, D, A, B, F2, 13, in(11), 2); \ + sROUND1(B, C, D, A, F2, 12, in( 8), 2); \ + \ + sROUND1(A, B, C, D, F3, 11, in( 3), 3); \ + sROUND1(D, A, B, C, F3, 13, in(10), 3); \ + sROUND1(C, D, A, B, F3, 6, in(14), 3); \ + sROUND1(B, C, D, A, F3, 7, in( 4), 3); \ + sROUND1(A, B, C, D, F3, 14, in( 9), 3); \ + sROUND1(D, A, B, C, F3, 9, in(15), 3); \ + sROUND1(C, D, A, B, F3, 13, in( 8), 3); \ + sROUND1(B, C, D, A, F3, 15, in( 1), 3); \ + sROUND1(A, B, C, D, F3, 14, in( 2), 3); \ + sROUND1(D, A, B, C, F3, 8, in( 7), 3); \ + sROUND1(C, D, A, B, F3, 13, in( 0), 3); \ + sROUND1(B, C, D, A, F3, 6, in( 6), 3); \ + sROUND1(A, B, C, D, F3, 5, in(13), 3); \ + sROUND1(D, A, B, C, F3, 12, in(11), 3); \ + sROUND1(C, D, A, B, F3, 7, in( 5), 3); \ + sROUND1(B, C, D, A, F3, 5, in(12), 3); \ + \ + sROUND1(A, B, C, D, F4, 11, in( 1), 4); \ + sROUND1(D, A, B, C, F4, 12, in( 9), 4); \ + sROUND1(C, D, A, B, F4, 14, in(11), 4); \ + sROUND1(B, C, D, A, F4, 15, in(10), 4); \ + sROUND1(A, B, C, D, F4, 14, 
in( 0), 4); \ + sROUND1(D, A, B, C, F4, 15, in( 8), 4); \ + sROUND1(C, D, A, B, F4, 9, in(12), 4); \ + sROUND1(B, C, D, A, F4, 8, in( 4), 4); \ + sROUND1(A, B, C, D, F4, 9, in(13), 4); \ + sROUND1(D, A, B, C, F4, 14, in( 3), 4); \ + sROUND1(C, D, A, B, F4, 5, in( 7), 4); \ + sROUND1(B, C, D, A, F4, 6, in(15), 4); \ + sROUND1(A, B, C, D, F4, 8, in(14), 4); \ + sROUND1(D, A, B, C, F4, 6, in( 5), 4); \ + sROUND1(C, D, A, B, F4, 5, in( 6), 4); \ + sROUND1(B, C, D, A, F4, 12, in( 2), 4); \ + \ + sROUND2(A, B, C, D, F4, 8, in( 5), 1); \ + sROUND2(D, A, B, C, F4, 9, in(14), 1); \ + sROUND2(C, D, A, B, F4, 9, in( 7), 1); \ + sROUND2(B, C, D, A, F4, 11, in( 0), 1); \ + sROUND2(A, B, C, D, F4, 13, in( 9), 1); \ + sROUND2(D, A, B, C, F4, 15, in( 2), 1); \ + sROUND2(C, D, A, B, F4, 15, in(11), 1); \ + sROUND2(B, C, D, A, F4, 5, in( 4), 1); \ + sROUND2(A, B, C, D, F4, 7, in(13), 1); \ + sROUND2(D, A, B, C, F4, 7, in( 6), 1); \ + sROUND2(C, D, A, B, F4, 8, in(15), 1); \ + sROUND2(B, C, D, A, F4, 11, in( 8), 1); \ + sROUND2(A, B, C, D, F4, 14, in( 1), 1); \ + sROUND2(D, A, B, C, F4, 14, in(10), 1); \ + sROUND2(C, D, A, B, F4, 12, in( 3), 1); \ + sROUND2(B, C, D, A, F4, 6, in(12), 1); \ + \ + sROUND2(A, B, C, D, F3, 9, in( 6), 2); \ + sROUND2(D, A, B, C, F3, 13, in(11), 2); \ + sROUND2(C, D, A, B, F3, 15, in( 3), 2); \ + sROUND2(B, C, D, A, F3, 7, in( 7), 2); \ + sROUND2(A, B, C, D, F3, 12, in( 0), 2); \ + sROUND2(D, A, B, C, F3, 8, in(13), 2); \ + sROUND2(C, D, A, B, F3, 9, in( 5), 2); \ + sROUND2(B, C, D, A, F3, 11, in(10), 2); \ + sROUND2(A, B, C, D, F3, 7, in(14), 2); \ + sROUND2(D, A, B, C, F3, 7, in(15), 2); \ + sROUND2(C, D, A, B, F3, 12, in( 8), 2); \ + sROUND2(B, C, D, A, F3, 7, in(12), 2); \ + sROUND2(A, B, C, D, F3, 6, in( 4), 2); \ + sROUND2(D, A, B, C, F3, 15, in( 9), 2); \ + sROUND2(C, D, A, B, F3, 13, in( 1), 2); \ + sROUND2(B, C, D, A, F3, 11, in( 2), 2); \ + \ + sROUND2(A, B, C, D, F2, 9, in(15), 3); \ + sROUND2(D, A, B, C, F2, 7, in( 5), 3); \ + sROUND2(C, D, A, B, F2, 15, in( 1), 3); \ + sROUND2(B, C, D, A, F2, 11, in( 3), 3); \ + sROUND2(A, B, C, D, F2, 8, in( 7), 3); \ + sROUND2(D, A, B, C, F2, 6, in(14), 3); \ + sROUND2(C, D, A, B, F2, 6, in( 6), 3); \ + sROUND2(B, C, D, A, F2, 14, in( 9), 3); \ + sROUND2(A, B, C, D, F2, 12, in(11), 3); \ + sROUND2(D, A, B, C, F2, 13, in( 8), 3); \ + sROUND2(C, D, A, B, F2, 5, in(12), 3); \ + sROUND2(B, C, D, A, F2, 14, in( 2), 3); \ + sROUND2(A, B, C, D, F2, 13, in(10), 3); \ + sROUND2(D, A, B, C, F2, 13, in( 0), 3); \ + sROUND2(C, D, A, B, F2, 7, in( 4), 3); \ + sROUND2(B, C, D, A, F2, 5, in(13), 3); \ + \ + sROUND2(A, B, C, D, F1, 15, in( 8), 4); \ + sROUND2(D, A, B, C, F1, 5, in( 6), 4); \ + sROUND2(C, D, A, B, F1, 8, in( 4), 4); \ + sROUND2(B, C, D, A, F1, 11, in( 1), 4); \ + sROUND2(A, B, C, D, F1, 14, in( 3), 4); \ + sROUND2(D, A, B, C, F1, 14, in(11), 4); \ + sROUND2(C, D, A, B, F1, 6, in(15), 4); \ + sROUND2(B, C, D, A, F1, 14, in( 0), 4); \ + sROUND2(A, B, C, D, F1, 6, in( 5), 4); \ + sROUND2(D, A, B, C, F1, 9, in(12), 4); \ + sROUND2(C, D, A, B, F1, 12, in( 2), 4); \ + sROUND2(B, C, D, A, F1, 9, in(13), 4); \ + sROUND2(A, B, C, D, F1, 12, in( 9), 4); \ + sROUND2(D, A, B, C, F1, 5, in( 7), 4); \ + sROUND2(C, D, A, B, F1, 15, in(10), 4); \ + sROUND2(B, C, D, A, F1, 8, in(14), 4); \ + \ + tmp = SPH_T32((h)[1] + C1 + D2); \ + (h)[1] = SPH_T32((h)[2] + D1 + A2); \ + (h)[2] = SPH_T32((h)[3] + A1 + B2); \ + (h)[3] = SPH_T32((h)[0] + B1 + C2); \ + (h)[0] = tmp; \ + } while (0) + +/* + * One round of RIPEMD-128. 
The data must be aligned for 32-bit access. + */ +static void +ripemd128_round(const unsigned char *data, sph_u32 r[5]) +{ +#if SPH_LITTLE_FAST + +#define RIPEMD128_IN(x) sph_dec32le_aligned(data + (4 * (x))) + +#else + + sph_u32 X_var[16]; + int i; + + for (i = 0; i < 16; i ++) + X_var[i] = sph_dec32le_aligned(data + 4 * i); +#define RIPEMD128_IN(x) X_var[x] + +#endif + RIPEMD128_ROUND_BODY(RIPEMD128_IN, r); +#undef RIPEMD128_IN +} + +/* see sph_ripemd.h */ +void +sph_ripemd128_init(void *cc) +{ + sph_ripemd128_context *sc; + + sc = cc; + memcpy(sc->val, IV, sizeof sc->val); +#if SPH_64 + sc->count = 0; +#else + sc->count_high = sc->count_low = 0; +#endif +} + +#define RFUN ripemd128_round +#define HASH ripemd128 +#define LE32 1 +#include "md_helper.c" +#undef RFUN +#undef HASH +#undef LE32 + +/* see sph_ripemd.h */ +void +sph_ripemd128_close(void *cc, void *dst) +{ + ripemd128_close(cc, dst, 4); + sph_ripemd128_init(cc); +} + +/* see sph_ripemd.h */ +void +sph_ripemd128_comp(const sph_u32 msg[16], sph_u32 val[4]) +{ +#define RIPEMD128_IN(x) msg[x] + RIPEMD128_ROUND_BODY(RIPEMD128_IN, val); +#undef RIPEMD128_IN +} + +/* ===================================================================== */ +/* + * RIPEMD-160. + */ + +/* + * Round constants for RIPEMD-160. + */ +#define K11 SPH_C32(0x00000000) +#define K12 SPH_C32(0x5A827999) +#define K13 SPH_C32(0x6ED9EBA1) +#define K14 SPH_C32(0x8F1BBCDC) +#define K15 SPH_C32(0xA953FD4E) + +#define K21 SPH_C32(0x50A28BE6) +#define K22 SPH_C32(0x5C4DD124) +#define K23 SPH_C32(0x6D703EF3) +#define K24 SPH_C32(0x7A6D76E9) +#define K25 SPH_C32(0x00000000) + +#define RR(a, b, c, d, e, f, s, r, k) do { \ + a = SPH_T32(ROTL(SPH_T32(a + f(b, c, d) + r + k), s) + e); \ + c = ROTL(c, 10); \ + } while (0) + +#define ROUND1(a, b, c, d, e, f, s, r, k) \ + RR(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k) + +#define ROUND2(a, b, c, d, e, f, s, r, k) \ + RR(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k) + +/* + * This macro defines the body for a RIPEMD-160 compression function + * implementation. The "in" parameter should evaluate, when applied to a + * numerical input parameter from 0 to 15, to an expression which yields + * the corresponding input block. The "h" parameter should evaluate to + * an array or pointer expression designating the array of 5 words which + * contains the input and output of the compression function. 
+ */ + +#define RIPEMD160_ROUND_BODY(in, h) do { \ + sph_u32 A1, B1, C1, D1, E1; \ + sph_u32 A2, B2, C2, D2, E2; \ + sph_u32 tmp; \ + \ + A1 = A2 = (h)[0]; \ + B1 = B2 = (h)[1]; \ + C1 = C2 = (h)[2]; \ + D1 = D2 = (h)[3]; \ + E1 = E2 = (h)[4]; \ + \ + ROUND1(A, B, C, D, E, F1, 11, in( 0), 1); \ + ROUND1(E, A, B, C, D, F1, 14, in( 1), 1); \ + ROUND1(D, E, A, B, C, F1, 15, in( 2), 1); \ + ROUND1(C, D, E, A, B, F1, 12, in( 3), 1); \ + ROUND1(B, C, D, E, A, F1, 5, in( 4), 1); \ + ROUND1(A, B, C, D, E, F1, 8, in( 5), 1); \ + ROUND1(E, A, B, C, D, F1, 7, in( 6), 1); \ + ROUND1(D, E, A, B, C, F1, 9, in( 7), 1); \ + ROUND1(C, D, E, A, B, F1, 11, in( 8), 1); \ + ROUND1(B, C, D, E, A, F1, 13, in( 9), 1); \ + ROUND1(A, B, C, D, E, F1, 14, in(10), 1); \ + ROUND1(E, A, B, C, D, F1, 15, in(11), 1); \ + ROUND1(D, E, A, B, C, F1, 6, in(12), 1); \ + ROUND1(C, D, E, A, B, F1, 7, in(13), 1); \ + ROUND1(B, C, D, E, A, F1, 9, in(14), 1); \ + ROUND1(A, B, C, D, E, F1, 8, in(15), 1); \ + \ + ROUND1(E, A, B, C, D, F2, 7, in( 7), 2); \ + ROUND1(D, E, A, B, C, F2, 6, in( 4), 2); \ + ROUND1(C, D, E, A, B, F2, 8, in(13), 2); \ + ROUND1(B, C, D, E, A, F2, 13, in( 1), 2); \ + ROUND1(A, B, C, D, E, F2, 11, in(10), 2); \ + ROUND1(E, A, B, C, D, F2, 9, in( 6), 2); \ + ROUND1(D, E, A, B, C, F2, 7, in(15), 2); \ + ROUND1(C, D, E, A, B, F2, 15, in( 3), 2); \ + ROUND1(B, C, D, E, A, F2, 7, in(12), 2); \ + ROUND1(A, B, C, D, E, F2, 12, in( 0), 2); \ + ROUND1(E, A, B, C, D, F2, 15, in( 9), 2); \ + ROUND1(D, E, A, B, C, F2, 9, in( 5), 2); \ + ROUND1(C, D, E, A, B, F2, 11, in( 2), 2); \ + ROUND1(B, C, D, E, A, F2, 7, in(14), 2); \ + ROUND1(A, B, C, D, E, F2, 13, in(11), 2); \ + ROUND1(E, A, B, C, D, F2, 12, in( 8), 2); \ + \ + ROUND1(D, E, A, B, C, F3, 11, in( 3), 3); \ + ROUND1(C, D, E, A, B, F3, 13, in(10), 3); \ + ROUND1(B, C, D, E, A, F3, 6, in(14), 3); \ + ROUND1(A, B, C, D, E, F3, 7, in( 4), 3); \ + ROUND1(E, A, B, C, D, F3, 14, in( 9), 3); \ + ROUND1(D, E, A, B, C, F3, 9, in(15), 3); \ + ROUND1(C, D, E, A, B, F3, 13, in( 8), 3); \ + ROUND1(B, C, D, E, A, F3, 15, in( 1), 3); \ + ROUND1(A, B, C, D, E, F3, 14, in( 2), 3); \ + ROUND1(E, A, B, C, D, F3, 8, in( 7), 3); \ + ROUND1(D, E, A, B, C, F3, 13, in( 0), 3); \ + ROUND1(C, D, E, A, B, F3, 6, in( 6), 3); \ + ROUND1(B, C, D, E, A, F3, 5, in(13), 3); \ + ROUND1(A, B, C, D, E, F3, 12, in(11), 3); \ + ROUND1(E, A, B, C, D, F3, 7, in( 5), 3); \ + ROUND1(D, E, A, B, C, F3, 5, in(12), 3); \ + \ + ROUND1(C, D, E, A, B, F4, 11, in( 1), 4); \ + ROUND1(B, C, D, E, A, F4, 12, in( 9), 4); \ + ROUND1(A, B, C, D, E, F4, 14, in(11), 4); \ + ROUND1(E, A, B, C, D, F4, 15, in(10), 4); \ + ROUND1(D, E, A, B, C, F4, 14, in( 0), 4); \ + ROUND1(C, D, E, A, B, F4, 15, in( 8), 4); \ + ROUND1(B, C, D, E, A, F4, 9, in(12), 4); \ + ROUND1(A, B, C, D, E, F4, 8, in( 4), 4); \ + ROUND1(E, A, B, C, D, F4, 9, in(13), 4); \ + ROUND1(D, E, A, B, C, F4, 14, in( 3), 4); \ + ROUND1(C, D, E, A, B, F4, 5, in( 7), 4); \ + ROUND1(B, C, D, E, A, F4, 6, in(15), 4); \ + ROUND1(A, B, C, D, E, F4, 8, in(14), 4); \ + ROUND1(E, A, B, C, D, F4, 6, in( 5), 4); \ + ROUND1(D, E, A, B, C, F4, 5, in( 6), 4); \ + ROUND1(C, D, E, A, B, F4, 12, in( 2), 4); \ + \ + ROUND1(B, C, D, E, A, F5, 9, in( 4), 5); \ + ROUND1(A, B, C, D, E, F5, 15, in( 0), 5); \ + ROUND1(E, A, B, C, D, F5, 5, in( 5), 5); \ + ROUND1(D, E, A, B, C, F5, 11, in( 9), 5); \ + ROUND1(C, D, E, A, B, F5, 6, in( 7), 5); \ + ROUND1(B, C, D, E, A, F5, 8, in(12), 5); \ + ROUND1(A, B, C, D, E, F5, 13, in( 2), 5); \ + ROUND1(E, A, B, C, D, F5, 12, in(10), 5); \ + ROUND1(D, 
E, A, B, C, F5, 5, in(14), 5); \ + ROUND1(C, D, E, A, B, F5, 12, in( 1), 5); \ + ROUND1(B, C, D, E, A, F5, 13, in( 3), 5); \ + ROUND1(A, B, C, D, E, F5, 14, in( 8), 5); \ + ROUND1(E, A, B, C, D, F5, 11, in(11), 5); \ + ROUND1(D, E, A, B, C, F5, 8, in( 6), 5); \ + ROUND1(C, D, E, A, B, F5, 5, in(15), 5); \ + ROUND1(B, C, D, E, A, F5, 6, in(13), 5); \ + \ + ROUND2(A, B, C, D, E, F5, 8, in( 5), 1); \ + ROUND2(E, A, B, C, D, F5, 9, in(14), 1); \ + ROUND2(D, E, A, B, C, F5, 9, in( 7), 1); \ + ROUND2(C, D, E, A, B, F5, 11, in( 0), 1); \ + ROUND2(B, C, D, E, A, F5, 13, in( 9), 1); \ + ROUND2(A, B, C, D, E, F5, 15, in( 2), 1); \ + ROUND2(E, A, B, C, D, F5, 15, in(11), 1); \ + ROUND2(D, E, A, B, C, F5, 5, in( 4), 1); \ + ROUND2(C, D, E, A, B, F5, 7, in(13), 1); \ + ROUND2(B, C, D, E, A, F5, 7, in( 6), 1); \ + ROUND2(A, B, C, D, E, F5, 8, in(15), 1); \ + ROUND2(E, A, B, C, D, F5, 11, in( 8), 1); \ + ROUND2(D, E, A, B, C, F5, 14, in( 1), 1); \ + ROUND2(C, D, E, A, B, F5, 14, in(10), 1); \ + ROUND2(B, C, D, E, A, F5, 12, in( 3), 1); \ + ROUND2(A, B, C, D, E, F5, 6, in(12), 1); \ + \ + ROUND2(E, A, B, C, D, F4, 9, in( 6), 2); \ + ROUND2(D, E, A, B, C, F4, 13, in(11), 2); \ + ROUND2(C, D, E, A, B, F4, 15, in( 3), 2); \ + ROUND2(B, C, D, E, A, F4, 7, in( 7), 2); \ + ROUND2(A, B, C, D, E, F4, 12, in( 0), 2); \ + ROUND2(E, A, B, C, D, F4, 8, in(13), 2); \ + ROUND2(D, E, A, B, C, F4, 9, in( 5), 2); \ + ROUND2(C, D, E, A, B, F4, 11, in(10), 2); \ + ROUND2(B, C, D, E, A, F4, 7, in(14), 2); \ + ROUND2(A, B, C, D, E, F4, 7, in(15), 2); \ + ROUND2(E, A, B, C, D, F4, 12, in( 8), 2); \ + ROUND2(D, E, A, B, C, F4, 7, in(12), 2); \ + ROUND2(C, D, E, A, B, F4, 6, in( 4), 2); \ + ROUND2(B, C, D, E, A, F4, 15, in( 9), 2); \ + ROUND2(A, B, C, D, E, F4, 13, in( 1), 2); \ + ROUND2(E, A, B, C, D, F4, 11, in( 2), 2); \ + \ + ROUND2(D, E, A, B, C, F3, 9, in(15), 3); \ + ROUND2(C, D, E, A, B, F3, 7, in( 5), 3); \ + ROUND2(B, C, D, E, A, F3, 15, in( 1), 3); \ + ROUND2(A, B, C, D, E, F3, 11, in( 3), 3); \ + ROUND2(E, A, B, C, D, F3, 8, in( 7), 3); \ + ROUND2(D, E, A, B, C, F3, 6, in(14), 3); \ + ROUND2(C, D, E, A, B, F3, 6, in( 6), 3); \ + ROUND2(B, C, D, E, A, F3, 14, in( 9), 3); \ + ROUND2(A, B, C, D, E, F3, 12, in(11), 3); \ + ROUND2(E, A, B, C, D, F3, 13, in( 8), 3); \ + ROUND2(D, E, A, B, C, F3, 5, in(12), 3); \ + ROUND2(C, D, E, A, B, F3, 14, in( 2), 3); \ + ROUND2(B, C, D, E, A, F3, 13, in(10), 3); \ + ROUND2(A, B, C, D, E, F3, 13, in( 0), 3); \ + ROUND2(E, A, B, C, D, F3, 7, in( 4), 3); \ + ROUND2(D, E, A, B, C, F3, 5, in(13), 3); \ + \ + ROUND2(C, D, E, A, B, F2, 15, in( 8), 4); \ + ROUND2(B, C, D, E, A, F2, 5, in( 6), 4); \ + ROUND2(A, B, C, D, E, F2, 8, in( 4), 4); \ + ROUND2(E, A, B, C, D, F2, 11, in( 1), 4); \ + ROUND2(D, E, A, B, C, F2, 14, in( 3), 4); \ + ROUND2(C, D, E, A, B, F2, 14, in(11), 4); \ + ROUND2(B, C, D, E, A, F2, 6, in(15), 4); \ + ROUND2(A, B, C, D, E, F2, 14, in( 0), 4); \ + ROUND2(E, A, B, C, D, F2, 6, in( 5), 4); \ + ROUND2(D, E, A, B, C, F2, 9, in(12), 4); \ + ROUND2(C, D, E, A, B, F2, 12, in( 2), 4); \ + ROUND2(B, C, D, E, A, F2, 9, in(13), 4); \ + ROUND2(A, B, C, D, E, F2, 12, in( 9), 4); \ + ROUND2(E, A, B, C, D, F2, 5, in( 7), 4); \ + ROUND2(D, E, A, B, C, F2, 15, in(10), 4); \ + ROUND2(C, D, E, A, B, F2, 8, in(14), 4); \ + \ + ROUND2(B, C, D, E, A, F1, 8, in(12), 5); \ + ROUND2(A, B, C, D, E, F1, 5, in(15), 5); \ + ROUND2(E, A, B, C, D, F1, 12, in(10), 5); \ + ROUND2(D, E, A, B, C, F1, 9, in( 4), 5); \ + ROUND2(C, D, E, A, B, F1, 12, in( 1), 5); \ + ROUND2(B, C, D, E, A, F1, 5, in( 5), 5); 
\ + ROUND2(A, B, C, D, E, F1, 14, in( 8), 5); \ + ROUND2(E, A, B, C, D, F1, 6, in( 7), 5); \ + ROUND2(D, E, A, B, C, F1, 8, in( 6), 5); \ + ROUND2(C, D, E, A, B, F1, 13, in( 2), 5); \ + ROUND2(B, C, D, E, A, F1, 6, in(13), 5); \ + ROUND2(A, B, C, D, E, F1, 5, in(14), 5); \ + ROUND2(E, A, B, C, D, F1, 15, in( 0), 5); \ + ROUND2(D, E, A, B, C, F1, 13, in( 3), 5); \ + ROUND2(C, D, E, A, B, F1, 11, in( 9), 5); \ + ROUND2(B, C, D, E, A, F1, 11, in(11), 5); \ + \ + tmp = SPH_T32((h)[1] + C1 + D2); \ + (h)[1] = SPH_T32((h)[2] + D1 + E2); \ + (h)[2] = SPH_T32((h)[3] + E1 + A2); \ + (h)[3] = SPH_T32((h)[4] + A1 + B2); \ + (h)[4] = SPH_T32((h)[0] + B1 + C2); \ + (h)[0] = tmp; \ + } while (0) + +/* + * One round of RIPEMD-160. The data must be aligned for 32-bit access. + */ +static void +ripemd160_round(const unsigned char *data, sph_u32 r[5]) +{ +#if SPH_LITTLE_FAST + +#define RIPEMD160_IN(x) sph_dec32le_aligned(data + (4 * (x))) + +#else + + sph_u32 X_var[16]; + int i; + + for (i = 0; i < 16; i ++) + X_var[i] = sph_dec32le_aligned(data + 4 * i); +#define RIPEMD160_IN(x) X_var[x] + +#endif + RIPEMD160_ROUND_BODY(RIPEMD160_IN, r); +#undef RIPEMD160_IN +} + +/* see sph_ripemd.h */ +void +sph_ripemd160_init(void *cc) +{ + sph_ripemd160_context *sc; + + sc = cc; + memcpy(sc->val, IV, sizeof sc->val); +#if SPH_64 + sc->count = 0; +#else + sc->count_high = sc->count_low = 0; +#endif +} + +#define RFUN ripemd160_round +#define HASH ripemd160 +#define LE32 1 +#include "md_helper.c" +#undef RFUN +#undef HASH +#undef LE32 + +/* see sph_ripemd.h */ +void +sph_ripemd160_close(void *cc, void *dst) +{ + ripemd160_close(cc, dst, 5); + sph_ripemd160_init(cc); +} + +/* see sph_ripemd.h */ +void +sph_ripemd160_comp(const sph_u32 msg[16], sph_u32 val[5]) +{ +#define RIPEMD160_IN(x) msg[x] + RIPEMD160_ROUND_BODY(RIPEMD160_IN, val); +#undef RIPEMD160_IN +} diff --git a/sph/sph_ripemd.h b/sph/sph_ripemd.h new file mode 100644 index 0000000..f1f0982 --- /dev/null +++ b/sph/sph_ripemd.h @@ -0,0 +1,273 @@ +/* $Id: sph_ripemd.h 216 2010-06-08 09:46:57Z tp $ */ +/** + * RIPEMD, RIPEMD-128 and RIPEMD-160 interface. + * + * RIPEMD was first described in: Research and Development in Advanced + * Communication Technologies in Europe, "RIPE Integrity Primitives: + * Final Report of RACE Integrity Primitives Evaluation (R1040)", RACE, + * June 1992. + * + * A new, strengthened version, dubbed RIPEMD-160, was published in: H. + * Dobbertin, A. Bosselaers, and B. Preneel, "RIPEMD-160, a strengthened + * version of RIPEMD", Fast Software Encryption - FSE'96, LNCS 1039, + * Springer (1996), pp. 71--82. + * + * This article describes both RIPEMD-160, with a 160-bit output, and a + * reduced version called RIPEMD-128, which has a 128-bit output. RIPEMD-128 + * was meant as a "drop-in" replacement for any hash function with 128-bit + * output, especially the original RIPEMD. + * + * @warning Collisions, and an efficient method to build other collisions, + * have been published for the original RIPEMD, which is thus considered as + * cryptographically broken. It is also very rarely encountered, and there + * seems to exist no free description or implementation of RIPEMD (except + * the sphlib code, of course). As of january 2007, RIPEMD-128 and RIPEMD-160 + * seem as secure as their output length allows. 
+ * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_ripemd.h + * @author Thomas Pornin + */ + +#ifndef SPH_RIPEMD_H__ +#define SPH_RIPEMD_H__ + +#include +#include "sph_types.h" + +/** + * Output size (in bits) for RIPEMD. + */ +#define SPH_SIZE_ripemd 128 + +/** + * Output size (in bits) for RIPEMD-128. + */ +#define SPH_SIZE_ripemd128 128 + +/** + * Output size (in bits) for RIPEMD-160. + */ +#define SPH_SIZE_ripemd160 160 + +/** + * This structure is a context for RIPEMD computations: it contains the + * intermediate values and some data from the last entered block. Once + * a RIPEMD computation has been performed, the context can be reused for + * another computation. + * + * The contents of this structure are private. A running RIPEMD computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + sph_u32 val[4]; +#if SPH_64 + sph_u64 count; +#else + sph_u32 count_high, count_low; +#endif +#endif +} sph_ripemd_context; + +/** + * Initialize a RIPEMD context. This process performs no memory allocation. + * + * @param cc the RIPEMD context (pointer to + * a sph_ripemd_context) + */ +void sph_ripemd_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the RIPEMD context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_ripemd(void *cc, const void *data, size_t len); + +/** + * Terminate the current RIPEMD computation and output the result into the + * provided buffer. The destination buffer must be wide enough to + * accomodate the result (16 bytes). The context is automatically + * reinitialized. + * + * @param cc the RIPEMD context + * @param dst the destination buffer + */ +void sph_ripemd_close(void *cc, void *dst); + +/** + * Apply the RIPEMD compression function on the provided data. The + * msg parameter contains the 16 32-bit input blocks, + * as numerical values (hence after the little-endian decoding). The + * val parameter contains the 5 32-bit input blocks for + * the compression function; the output is written in place in this + * array. 
+ * + * @param msg the message block (16 values) + * @param val the function 128-bit input and output + */ +void sph_ripemd_comp(const sph_u32 msg[16], sph_u32 val[4]); + +/* ===================================================================== */ + +/** + * This structure is a context for RIPEMD-128 computations: it contains the + * intermediate values and some data from the last entered block. Once + * a RIPEMD-128 computation has been performed, the context can be reused for + * another computation. + * + * The contents of this structure are private. A running RIPEMD-128 computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + sph_u32 val[4]; +#if SPH_64 + sph_u64 count; +#else + sph_u32 count_high, count_low; +#endif +#endif +} sph_ripemd128_context; + +/** + * Initialize a RIPEMD-128 context. This process performs no memory allocation. + * + * @param cc the RIPEMD-128 context (pointer to + * a sph_ripemd128_context) + */ +void sph_ripemd128_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). + * + * @param cc the RIPEMD-128 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_ripemd128(void *cc, const void *data, size_t len); + +/** + * Terminate the current RIPEMD-128 computation and output the result into the + * provided buffer. The destination buffer must be wide enough to + * accomodate the result (16 bytes). The context is automatically + * reinitialized. + * + * @param cc the RIPEMD-128 context + * @param dst the destination buffer + */ +void sph_ripemd128_close(void *cc, void *dst); + +/** + * Apply the RIPEMD-128 compression function on the provided data. The + * msg parameter contains the 16 32-bit input blocks, + * as numerical values (hence after the little-endian decoding). The + * val parameter contains the 5 32-bit input blocks for + * the compression function; the output is written in place in this + * array. + * + * @param msg the message block (16 values) + * @param val the function 128-bit input and output + */ +void sph_ripemd128_comp(const sph_u32 msg[16], sph_u32 val[4]); + +/* ===================================================================== */ + +/** + * This structure is a context for RIPEMD-160 computations: it contains the + * intermediate values and some data from the last entered block. Once + * a RIPEMD-160 computation has been performed, the context can be reused for + * another computation. + * + * The contents of this structure are private. A running RIPEMD-160 computation + * can be cloned by copying the context (e.g. with a simple + * memcpy()). + */ +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + sph_u32 val[5]; +#if SPH_64 + sph_u64 count; +#else + sph_u32 count_high, count_low; +#endif +#endif +} sph_ripemd160_context; + +/** + * Initialize a RIPEMD-160 context. This process performs no memory allocation. + * + * @param cc the RIPEMD-160 context (pointer to + * a sph_ripemd160_context) + */ +void sph_ripemd160_init(void *cc); + +/** + * Process some data bytes. It is acceptable that len is zero + * (in which case this function does nothing). 
+ * + * @param cc the RIPEMD-160 context + * @param data the input data + * @param len the input data length (in bytes) + */ +void sph_ripemd160(void *cc, const void *data, size_t len); + +/** + * Terminate the current RIPEMD-160 computation and output the result into the + * provided buffer. The destination buffer must be wide enough to + * accomodate the result (20 bytes). The context is automatically + * reinitialized. + * + * @param cc the RIPEMD-160 context + * @param dst the destination buffer + */ +void sph_ripemd160_close(void *cc, void *dst); + +/** + * Apply the RIPEMD-160 compression function on the provided data. The + * msg parameter contains the 16 32-bit input blocks, + * as numerical values (hence after the little-endian decoding). The + * val parameter contains the 5 32-bit input blocks for + * the compression function; the output is written in place in this + * array. + * + * @param msg the message block (16 values) + * @param val the function 160-bit input and output + */ +void sph_ripemd160_comp(const sph_u32 msg[16], sph_u32 val[5]); + +#endif diff --git a/sph/sph_sha2.c b/sph/sph_sha2.c new file mode 100644 index 0000000..aab2c55 --- /dev/null +++ b/sph/sph_sha2.c @@ -0,0 +1,691 @@ +/* $Id: sha2.c 227 2010-06-16 17:28:38Z tp $ */ +/* + * SHA-224 / SHA-256 implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include + +#include "sph_sha2.h" + +#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SHA2 +#define SPH_SMALL_FOOTPRINT_SHA2 1 +#endif + +#define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z)) +#define MAJ(X, Y, Z) (((Y) & (Z)) | (((Y) | (Z)) & (X))) + +#define ROTR SPH_ROTR32 + +#define BSG2_0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define BSG2_1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) +#define SSG2_0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SPH_T32((x) >> 3)) +#define SSG2_1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SPH_T32((x) >> 10)) + +static const sph_u32 H224[8] = { + SPH_C32(0xC1059ED8), SPH_C32(0x367CD507), SPH_C32(0x3070DD17), + SPH_C32(0xF70E5939), SPH_C32(0xFFC00B31), SPH_C32(0x68581511), + SPH_C32(0x64F98FA7), SPH_C32(0xBEFA4FA4) +}; + +static const sph_u32 H256[8] = { + SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85), SPH_C32(0x3C6EF372), + SPH_C32(0xA54FF53A), SPH_C32(0x510E527F), SPH_C32(0x9B05688C), + SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19) +}; + +/* + * The SHA2_ROUND_BODY defines the body for a SHA-224 / SHA-256 + * compression function implementation. The "in" parameter should + * evaluate, when applied to a numerical input parameter from 0 to 15, + * to an expression which yields the corresponding input block. The "r" + * parameter should evaluate to an array or pointer expression + * designating the array of 8 words which contains the input and output + * of the compression function. + */ + +#if SPH_SMALL_FOOTPRINT_SHA2 + +static const sph_u32 K[64] = { + SPH_C32(0x428A2F98), SPH_C32(0x71374491), + SPH_C32(0xB5C0FBCF), SPH_C32(0xE9B5DBA5), + SPH_C32(0x3956C25B), SPH_C32(0x59F111F1), + SPH_C32(0x923F82A4), SPH_C32(0xAB1C5ED5), + SPH_C32(0xD807AA98), SPH_C32(0x12835B01), + SPH_C32(0x243185BE), SPH_C32(0x550C7DC3), + SPH_C32(0x72BE5D74), SPH_C32(0x80DEB1FE), + SPH_C32(0x9BDC06A7), SPH_C32(0xC19BF174), + SPH_C32(0xE49B69C1), SPH_C32(0xEFBE4786), + SPH_C32(0x0FC19DC6), SPH_C32(0x240CA1CC), + SPH_C32(0x2DE92C6F), SPH_C32(0x4A7484AA), + SPH_C32(0x5CB0A9DC), SPH_C32(0x76F988DA), + SPH_C32(0x983E5152), SPH_C32(0xA831C66D), + SPH_C32(0xB00327C8), SPH_C32(0xBF597FC7), + SPH_C32(0xC6E00BF3), SPH_C32(0xD5A79147), + SPH_C32(0x06CA6351), SPH_C32(0x14292967), + SPH_C32(0x27B70A85), SPH_C32(0x2E1B2138), + SPH_C32(0x4D2C6DFC), SPH_C32(0x53380D13), + SPH_C32(0x650A7354), SPH_C32(0x766A0ABB), + SPH_C32(0x81C2C92E), SPH_C32(0x92722C85), + SPH_C32(0xA2BFE8A1), SPH_C32(0xA81A664B), + SPH_C32(0xC24B8B70), SPH_C32(0xC76C51A3), + SPH_C32(0xD192E819), SPH_C32(0xD6990624), + SPH_C32(0xF40E3585), SPH_C32(0x106AA070), + SPH_C32(0x19A4C116), SPH_C32(0x1E376C08), + SPH_C32(0x2748774C), SPH_C32(0x34B0BCB5), + SPH_C32(0x391C0CB3), SPH_C32(0x4ED8AA4A), + SPH_C32(0x5B9CCA4F), SPH_C32(0x682E6FF3), + SPH_C32(0x748F82EE), SPH_C32(0x78A5636F), + SPH_C32(0x84C87814), SPH_C32(0x8CC70208), + SPH_C32(0x90BEFFFA), SPH_C32(0xA4506CEB), + SPH_C32(0xBEF9A3F7), SPH_C32(0xC67178F2) +}; + +#define SHA2_MEXP1(in, pc) do { \ + W[pc] = in(pc); \ + } while (0) + +#define SHA2_MEXP2(in, pc) do { \ + W[(pc) & 0x0F] = SPH_T32(SSG2_1(W[((pc) - 2) & 0x0F]) \ + + W[((pc) - 7) & 0x0F] \ + + SSG2_0(W[((pc) - 15) & 0x0F]) + W[(pc) & 0x0F]); \ + } while (0) + +#define SHA2_STEPn(n, a, b, c, d, e, f, g, h, in, pc) do { \ + sph_u32 t1, t2; \ + SHA2_MEXP ## n(in, pc); \ + t1 = SPH_T32(h + BSG2_1(e) + CH(e, f, g) \ + + K[pcount + (pc)] + W[(pc) & 0x0F]); \ + t2 = SPH_T32(BSG2_0(a) + MAJ(a, b, c)); \ + d = SPH_T32(d + t1); \ + h 
= SPH_T32(t1 + t2); \ + } while (0) + +#define SHA2_STEP1(a, b, c, d, e, f, g, h, in, pc) \ + SHA2_STEPn(1, a, b, c, d, e, f, g, h, in, pc) +#define SHA2_STEP2(a, b, c, d, e, f, g, h, in, pc) \ + SHA2_STEPn(2, a, b, c, d, e, f, g, h, in, pc) + +#define SHA2_ROUND_BODY(in, r) do { \ + sph_u32 A, B, C, D, E, F, G, H; \ + sph_u32 W[16]; \ + unsigned pcount; \ + \ + A = (r)[0]; \ + B = (r)[1]; \ + C = (r)[2]; \ + D = (r)[3]; \ + E = (r)[4]; \ + F = (r)[5]; \ + G = (r)[6]; \ + H = (r)[7]; \ + pcount = 0; \ + SHA2_STEP1(A, B, C, D, E, F, G, H, in, 0); \ + SHA2_STEP1(H, A, B, C, D, E, F, G, in, 1); \ + SHA2_STEP1(G, H, A, B, C, D, E, F, in, 2); \ + SHA2_STEP1(F, G, H, A, B, C, D, E, in, 3); \ + SHA2_STEP1(E, F, G, H, A, B, C, D, in, 4); \ + SHA2_STEP1(D, E, F, G, H, A, B, C, in, 5); \ + SHA2_STEP1(C, D, E, F, G, H, A, B, in, 6); \ + SHA2_STEP1(B, C, D, E, F, G, H, A, in, 7); \ + SHA2_STEP1(A, B, C, D, E, F, G, H, in, 8); \ + SHA2_STEP1(H, A, B, C, D, E, F, G, in, 9); \ + SHA2_STEP1(G, H, A, B, C, D, E, F, in, 10); \ + SHA2_STEP1(F, G, H, A, B, C, D, E, in, 11); \ + SHA2_STEP1(E, F, G, H, A, B, C, D, in, 12); \ + SHA2_STEP1(D, E, F, G, H, A, B, C, in, 13); \ + SHA2_STEP1(C, D, E, F, G, H, A, B, in, 14); \ + SHA2_STEP1(B, C, D, E, F, G, H, A, in, 15); \ + for (pcount = 16; pcount < 64; pcount += 16) { \ + SHA2_STEP2(A, B, C, D, E, F, G, H, in, 0); \ + SHA2_STEP2(H, A, B, C, D, E, F, G, in, 1); \ + SHA2_STEP2(G, H, A, B, C, D, E, F, in, 2); \ + SHA2_STEP2(F, G, H, A, B, C, D, E, in, 3); \ + SHA2_STEP2(E, F, G, H, A, B, C, D, in, 4); \ + SHA2_STEP2(D, E, F, G, H, A, B, C, in, 5); \ + SHA2_STEP2(C, D, E, F, G, H, A, B, in, 6); \ + SHA2_STEP2(B, C, D, E, F, G, H, A, in, 7); \ + SHA2_STEP2(A, B, C, D, E, F, G, H, in, 8); \ + SHA2_STEP2(H, A, B, C, D, E, F, G, in, 9); \ + SHA2_STEP2(G, H, A, B, C, D, E, F, in, 10); \ + SHA2_STEP2(F, G, H, A, B, C, D, E, in, 11); \ + SHA2_STEP2(E, F, G, H, A, B, C, D, in, 12); \ + SHA2_STEP2(D, E, F, G, H, A, B, C, in, 13); \ + SHA2_STEP2(C, D, E, F, G, H, A, B, in, 14); \ + SHA2_STEP2(B, C, D, E, F, G, H, A, in, 15); \ + } \ + (r)[0] = SPH_T32((r)[0] + A); \ + (r)[1] = SPH_T32((r)[1] + B); \ + (r)[2] = SPH_T32((r)[2] + C); \ + (r)[3] = SPH_T32((r)[3] + D); \ + (r)[4] = SPH_T32((r)[4] + E); \ + (r)[5] = SPH_T32((r)[5] + F); \ + (r)[6] = SPH_T32((r)[6] + G); \ + (r)[7] = SPH_T32((r)[7] + H); \ + } while (0) + +#else + +#define SHA2_ROUND_BODY(in, r) do { \ + sph_u32 A, B, C, D, E, F, G, H, T1, T2; \ + sph_u32 W00, W01, W02, W03, W04, W05, W06, W07; \ + sph_u32 W08, W09, W10, W11, W12, W13, W14, W15; \ + int i; \ + \ + A = (r)[0]; \ + B = (r)[1]; \ + C = (r)[2]; \ + D = (r)[3]; \ + E = (r)[4]; \ + F = (r)[5]; \ + G = (r)[6]; \ + H = (r)[7]; \ + W00 = in(0); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0x428A2F98) + W00); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W01 = in(1); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0x71374491) + W01); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W02 = in(2); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0xB5C0FBCF) + W02); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W03 = in(3); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0xE9B5DBA5) + W03); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W04 = in(4); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + 
SPH_C32(0x3956C25B) + W04); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W05 = in(5); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0x59F111F1) + W05); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W06 = in(6); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0x923F82A4) + W06); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W07 = in(7); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0xAB1C5ED5) + W07); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + W08 = in(8); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0xD807AA98) + W08); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W09 = in(9); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0x12835B01) + W09); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W10 = in(10); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0x243185BE) + W10); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W11 = in(11); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0x550C7DC3) + W11); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W12 = in(12); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0x72BE5D74) + W12); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W13 = in(13); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0x80DEB1FE) + W13); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W14 = in(14); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0x9BDC06A7) + W14); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W15 = in(15); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0xC19BF174) + W15); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0xE49B69C1) + W00); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0xEFBE4786) + W01); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0x0FC19DC6) + W02); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0x240CA1CC) + W03); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0x2DE92C6F) + W04); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0x4A7484AA) + W05); \ + T2 = SPH_T32(BSG2_0(D) 
+ MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0x5CB0A9DC) + W06); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0x76F988DA) + W07); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0x983E5152) + W08); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0xA831C66D) + W09); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0xB00327C8) + W10); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0xBF597FC7) + W11); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0xC6E00BF3) + W12); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0xD5A79147) + W13); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0x06CA6351) + W14); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0x14292967) + W15); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0x27B70A85) + W00); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0x2E1B2138) + W01); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0x4D2C6DFC) + W02); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0x53380D13) + W03); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0x650A7354) + W04); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + 
W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0x766A0ABB) + W05); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0x81C2C92E) + W06); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0x92722C85) + W07); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0xA2BFE8A1) + W08); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0xA81A664B) + W09); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0xC24B8B70) + W10); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0xC76C51A3) + W11); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0xD192E819) + W12); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0xD6990624) + W13); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0xF40E3585) + W14); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0x106AA070) + W15); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0x19A4C116) + W00); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0x1E376C08) + W01); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0x2748774C) + W02); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0x34B0BCB5) + W03); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \ + T1 = SPH_T32(D + 
BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0x391C0CB3) + W04); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0x4ED8AA4A) + W05); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0x5B9CCA4F) + W06); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0x682E6FF3) + W07); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \ + T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \ + + SPH_C32(0x748F82EE) + W08); \ + T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \ + D = SPH_T32(D + T1); \ + H = SPH_T32(T1 + T2); \ + W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \ + T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \ + + SPH_C32(0x78A5636F) + W09); \ + T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \ + C = SPH_T32(C + T1); \ + G = SPH_T32(T1 + T2); \ + W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \ + T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \ + + SPH_C32(0x84C87814) + W10); \ + T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \ + B = SPH_T32(B + T1); \ + F = SPH_T32(T1 + T2); \ + W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \ + T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \ + + SPH_C32(0x8CC70208) + W11); \ + T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \ + A = SPH_T32(A + T1); \ + E = SPH_T32(T1 + T2); \ + W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \ + T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \ + + SPH_C32(0x90BEFFFA) + W12); \ + T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \ + H = SPH_T32(H + T1); \ + D = SPH_T32(T1 + T2); \ + W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \ + T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \ + + SPH_C32(0xA4506CEB) + W13); \ + T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \ + G = SPH_T32(G + T1); \ + C = SPH_T32(T1 + T2); \ + W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \ + T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \ + + SPH_C32(0xBEF9A3F7) + W14); \ + T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \ + F = SPH_T32(F + T1); \ + B = SPH_T32(T1 + T2); \ + W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \ + T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \ + + SPH_C32(0xC67178F2) + W15); \ + T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \ + E = SPH_T32(E + T1); \ + A = SPH_T32(T1 + T2); \ + (r)[0] = SPH_T32((r)[0] + A); \ + (r)[1] = SPH_T32((r)[1] + B); \ + (r)[2] = SPH_T32((r)[2] + C); \ + (r)[3] = SPH_T32((r)[3] + D); \ + (r)[4] = SPH_T32((r)[4] + E); \ + (r)[5] = SPH_T32((r)[5] + F); \ + (r)[6] = SPH_T32((r)[6] + G); \ + (r)[7] = SPH_T32((r)[7] + H); \ + } while (0) + +#endif + +/* + * One round of SHA-224 / SHA-256. The data must be aligned for 32-bit access. 
+ */
+static void
+sha2_round(const unsigned char *data, sph_u32 r[8])
+{
+#define SHA2_IN(x) sph_dec32be_aligned(data + (4 * (x)))
+	SHA2_ROUND_BODY(SHA2_IN, r);
+#undef SHA2_IN
+}
+
+/* see sph_sha2.h */
+void
+sph_sha224_init(void *cc)
+{
+	sph_sha224_context *sc;
+
+	sc = cc;
+	memcpy(sc->val, H224, sizeof H224);
+#if SPH_64
+	sc->count = 0;
+#else
+	sc->count_high = sc->count_low = 0;
+#endif
+}
+
+/* see sph_sha2.h */
+void
+sph_sha256_init(void *cc)
+{
+	sph_sha256_context *sc;
+
+	sc = cc;
+	memcpy(sc->val, H256, sizeof H256);
+#if SPH_64
+	sc->count = 0;
+#else
+	sc->count_high = sc->count_low = 0;
+#endif
+}
+
+#define RFUN sha2_round
+#define HASH sha224
+#define BE32 1
+#include "md_helper.c"
+
+/* see sph_sha2.h */
+void
+sph_sha224_close(void *cc, void *dst)
+{
+	sha224_close(cc, dst, 7);
+	sph_sha224_init(cc);
+}
+
+/* see sph_sha2.h */
+void
+sph_sha224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	sha224_addbits_and_close(cc, ub, n, dst, 7);
+	sph_sha224_init(cc);
+}
+
+/* see sph_sha2.h */
+void
+sph_sha256_close(void *cc, void *dst)
+{
+	sha224_close(cc, dst, 8);
+	sph_sha256_init(cc);
+}
+
+/* see sph_sha2.h */
+void
+sph_sha256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+	sha224_addbits_and_close(cc, ub, n, dst, 8);
+	sph_sha256_init(cc);
+}
+
+/* see sph_sha2.h */
+void
+sph_sha224_comp(const sph_u32 msg[16], sph_u32 val[8])
+{
+#define SHA2_IN(x) msg[x]
+	SHA2_ROUND_BODY(SHA2_IN, val);
+#undef SHA2_IN
+}
diff --git a/util.cpp b/util.cpp
index e7b305a..40352c8 100644
--- a/util.cpp
+++ b/util.cpp
@@ -1421,28 +1421,39 @@ static uint32_t getblocheight(struct stratum_ctx *sctx)
 
 static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
 {
-	const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *stime, *nreward;
+	const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *stime;
+	const char *claim = NULL, *nreward = NULL;
 	size_t coinb1_size, coinb2_size;
 	bool clean, ret = false;
-	int merkle_count, i;
+	int merkle_count, i, p=0;
 	json_t *merkle_arr;
 	uchar **merkle = NULL;
 //	uchar(*merkle_tree)[32] = { 0 };
 	int ntime;
-
-	job_id = json_string_value(json_array_get(params, 0));
-	prevhash = json_string_value(json_array_get(params, 1));
-	coinb1 = json_string_value(json_array_get(params, 2));
-	coinb2 = json_string_value(json_array_get(params, 3));
-	merkle_arr = json_array_get(params, 4);
+	char algo[64] = { 0 };
+	get_currentalgo(algo, sizeof(algo));
+	bool has_claim = !strcasecmp(algo, "lbry");
+
+	job_id = json_string_value(json_array_get(params, p++));
+	prevhash = json_string_value(json_array_get(params, p++));
+	if (has_claim) {
+		claim = json_string_value(json_array_get(params, p++));
+		if (!claim || strlen(claim) != 64) {
+			applog(LOG_ERR, "Stratum notify: invalid claim parameter");
+			goto out;
+		}
+	}
+	coinb1 = json_string_value(json_array_get(params, p++));
+	coinb2 = json_string_value(json_array_get(params, p++));
+	merkle_arr = json_array_get(params, p++);
 	if (!merkle_arr || !json_is_array(merkle_arr))
 		goto out;
 	merkle_count = (int) json_array_size(merkle_arr);
-	version = json_string_value(json_array_get(params, 5));
-	nbits = json_string_value(json_array_get(params, 6));
-	stime = json_string_value(json_array_get(params, 7));
-	clean = json_is_true(json_array_get(params, 8));
-	nreward = json_string_value(json_array_get(params, 9));
+	version = json_string_value(json_array_get(params, p++));
+	nbits = json_string_value(json_array_get(params, p++));
+	stime = json_string_value(json_array_get(params, p++));
+	clean = json_is_true(json_array_get(params, p)); p++;
+	nreward = json_string_value(json_array_get(params, p++));
 	if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !stime ||
 	    strlen(prevhash) != 64 || strlen(version) != 8 ||
@@ -1494,6 +1505,7 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
 	free(sctx->job.job_id);
 	sctx->job.job_id = strdup(job_id);
 	hex2bin(sctx->job.prevhash, prevhash, 32);
+	if (has_claim) hex2bin(sctx->job.claim, claim, 32);
 
 	sctx->job.height = getblocheight(sctx);
 
@@ -2079,7 +2091,8 @@ void do_gpu_tests(void)
 	//scanhash_scrypt_jane(0, &work, NULL, 1, &done, &tv, &tv);
 	memset(work.data, 0, sizeof(work.data));
-	scanhash_decred(0, &work, 1, &done);
+	work.data[0] = 0;
+	scanhash_lbry(0, &work, 1, &done);
 
 	free(work_restart);
 	work_restart = NULL;
@@ -2142,6 +2155,10 @@ void print_hash_tests(void)
 	keccak256_hash(&hash[0], &buf[0]);
 	printpfx("keccak", hash);
 
+	memset(buf, 0, 128);
+	lbry_hash(&hash[0], &buf[0]);
+	printpfx("lbry", hash);
+
 	luffa_hash(&hash[0], &buf[0]);
 	printpfx("luffa", hash);
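
Editorial note (illustrative, not part of the patch): the sph SHA-256 code and the new sph/ripemd.c are the CPU-side primitives behind lbry_hash(), which print_hash_tests() now exercises and which is used to verify candidate nonces found by the CUDA kernels. The sketch below shows how such a chained hash can be expressed with the sph streaming API; it assumes the usual LBRY proof-of-work chain (SHA-256d of the 112-byte header, SHA-512 of that digest, RIPEMD-160 of each 32-byte half, then SHA-256d of the two concatenated 20-byte digests) and that sph_sha512* and sph_ripemd160* are declared in sph_sha2.h and sph_ripemd.h. The function name lbry_hash_ref is hypothetical; the authoritative implementation is lbry/lbry.cu and may differ in details.

#include <string.h>
#include "sph/sph_sha2.h"    /* sph_sha256_*, sph_sha512_* */
#include "sph/sph_ripemd.h"  /* sph_ripemd160_* */

/* Illustrative CPU reference of the LBRY hash chain (assumption, see note above). */
static void lbry_hash_ref(void *output, const void *input /* 112-byte block header */)
{
	unsigned char hashA[64], hashB[20], hashC[20];
	sph_sha256_context sha256;
	sph_sha512_context sha512;
	sph_ripemd160_context ripemd;

	/* SHA-256d over the 112-byte header */
	sph_sha256_init(&sha256);
	sph_sha256(&sha256, input, 112);
	sph_sha256_close(&sha256, hashA);
	sph_sha256_init(&sha256);
	sph_sha256(&sha256, hashA, 32);
	sph_sha256_close(&sha256, hashA);

	/* SHA-512 of the 32-byte result (64-byte digest) */
	sph_sha512_init(&sha512);
	sph_sha512(&sha512, hashA, 32);
	sph_sha512_close(&sha512, hashA);

	/* RIPEMD-160 of each 32-byte half of the SHA-512 output */
	sph_ripemd160_init(&ripemd);
	sph_ripemd160(&ripemd, hashA, 32);
	sph_ripemd160_close(&ripemd, hashB);
	sph_ripemd160_init(&ripemd);
	sph_ripemd160(&ripemd, hashA + 32, 32);
	sph_ripemd160_close(&ripemd, hashC);

	/* SHA-256d over the two concatenated 20-byte digests */
	sph_sha256_init(&sha256);
	sph_sha256(&sha256, hashB, 20);
	sph_sha256(&sha256, hashC, 20);
	sph_sha256_close(&sha256, hashA);
	sph_sha256_init(&sha256);
	sph_sha256(&sha256, hashA, 32);
	sph_sha256_close(&sha256, hashA);

	memcpy(output, hashA, 32);  /* 32-byte proof-of-work hash */
}

A chain like this is why the patch pulls in both sph_sha2.c and ripemd.c on the host side even though mining itself runs in the lbry/cuda_*.cu kernels.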