diff --git a/Makefile.am b/Makefile.am
index 0050b06..7a8fd47 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -50,6 +50,8 @@ ccminer_SOURCES = elist.h miner.h compat.h \
sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
sph/hamsi.c sph/hamsi_helper.c sph/streebog.c \
sph/shabal.c sph/whirlpool.c sph/sha2big.c sph/haval.c \
+ sph/ripemd.c sph/sph_sha2.c \
+ lbry/lbry.cu lbry/cuda_ripemd160.cu lbry/cuda_sha256_lbry.cu lbry/cuda_sha512_lbry.cu \
qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \
x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \
diff --git a/README.txt b/README.txt
index 4f2bb58..f435c05 100644
--- a/README.txt
+++ b/README.txt
@@ -1,5 +1,5 @@
-ccMiner preview 1.8-dev (May 2016) "Pascal and x11evo algo"
+ccMiner 1.8 Preview (July 2016) "CUDA 8, lbry and x11evo algos"
---------------------------------------------------------------
***************************************************************
@@ -28,6 +28,7 @@ Decred (Blake256 14-rounds - 180 bytes)
HeavyCoin & MjollnirCoin
FugueCoin
GroestlCoin & Myriad-Groestl
+Lbry Credits
JackpotCoin
QuarkCoin family & AnimeCoin
TalkCoin
@@ -35,13 +36,13 @@ DarkCoin and other X11 coins
Chaincoin and Flaxscript (C11)
Saffroncoin blake (256 14-rounds)
BlakeCoin (256 8-rounds)
-Midnight (BMW 256)
Qubit (Digibyte, ...)
Luffa (Joincoin)
Keccak (Maxcoin)
Pentablake (Blake 512 x5)
1Coin Triple S
Neoscrypt (FeatherCoin)
+Revolver (X11evo)
Scrypt and Scrypt:N
Scrypt-Jane (Chacha)
Sibcoin (sib)
@@ -82,6 +83,7 @@ its command line interface and options.
heavy use to mine Heavycoin
jackpot use to mine Jackpotcoin
keccak use to mine Maxcoin
+ lbry use to mine LBRY Credits
luffa use to mine Joincoin
lyra2 use to mine Vertcoin
mjollnir use to mine Mjollnircoin
@@ -151,6 +153,7 @@ its command line interface and options.
--max-diff=N Only mine if net difficulty is less than specified value
--pstate=0 will force the Geforce 9xx to run in P0 P-State
--plimit=150W set the gpu power limit, allow multiple values for N cards
+ --tlimit=85 set the gpu thermal limit (windows only)
--keep-clocks prevent reset clocks and/or power limit on exit
--show-diff display submitted block and net difficulty
-B, --background run the miner in the background
@@ -242,6 +245,7 @@ features.
July 2016 v1.8.0
Pascal support with cuda 8
+ lbry: new multi sha256/sha512/ripemd160 algo (LBC)
x11evo algo (XRE)
Lyra2v2, Neoscrypt and Decred improvements
Enhance windows NVAPI clock and power limits
diff --git a/algos.h b/algos.h
index a946b76..5184268 100644
--- a/algos.h
+++ b/algos.h
@@ -19,6 +19,7 @@ enum sha_algos {
ALGO_HEAVY, /* Heavycoin hash */
ALGO_KECCAK,
ALGO_JACKPOT,
+ ALGO_LBRY,
ALGO_LUFFA,
ALGO_LYRA2,
ALGO_LYRA2v2,
@@ -67,6 +68,7 @@ static const char *algo_names[] = {
"heavy",
"keccak",
"jackpot",
+ "lbry",
"luffa",
"lyra2",
"lyra2v2",
diff --git a/ccminer.cpp b/ccminer.cpp
index 9716db1..2f4bbef 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -228,6 +228,7 @@ Options:\n\
heavy Heavycoin\n\
jackpot Jackpot\n\
keccak Keccak-256 (Maxcoin)\n\
+ lbry LBRY Credits (Sha/Ripemd)\n\
luffa Joincoin\n\
lyra2 LyraBar\n\
lyra2v2 VertCoin\n\
@@ -567,6 +568,7 @@ static void calc_network_diff(struct work *work)
// sample for diff 43.281 : 1c05ea29
// todo: endian reversed on longpoll could be zr5 specific...
uint32_t nbits = have_longpoll ? work->data[18] : swab32(work->data[18]);
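+	// lbry 112-byte header layout (32-bit words): version (0), prev hash (1-8),
+	// merkle root (9-16), claim trie root (17-24), ntime (25), nbits (26), nonce (27)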
+ if (opt_algo == ALGO_LBRY) nbits = swab32(work->data[26]);
if (opt_algo == ALGO_DECRED) nbits = work->data[29];
uint32_t bits = (nbits & 0xffffff);
int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
@@ -837,6 +839,11 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
le32enc(&nonce, work->data[19]);
be16enc(&nvote, *((uint16_t*)&work->data[20]));
break;
+ case ALGO_LBRY:
+ check_dups = true;
+ le32enc(&ntime, work->data[25]);
+ le32enc(&nonce, work->data[27]);
+ break;
case ALGO_ZR5:
check_dups = true;
be32enc(&ntime, work->data[17]);
@@ -1296,6 +1303,8 @@ bool get_work(struct thr_info *thr, struct work *work)
memset(work->data + 19, 0x00, 52);
if (opt_algo == ALGO_DECRED) {
memset(&work->data[35], 0x00, 52);
+ } else if (opt_algo == ALGO_LBRY) {
+ work->data[28] = 0x80000000;
} else {
work->data[20] = 0x80000000;
work->data[31] = 0x00000280;
@@ -1441,6 +1450,14 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
work->data[37] = (rand()*4) << 8; // random work data
sctx->job.height = work->data[32];
//applog_hex(work->data, 180);
+ } else if (opt_algo == ALGO_LBRY) {
+ for (i = 0; i < 8; i++)
+ work->data[9 + i] = be32dec((uint32_t *)merkle_root + i);
+ for (i = 0; i < 8; i++)
+ work->data[17 + i] = ((uint32_t*)sctx->job.claim)[i];
+ work->data[25] = le32dec(sctx->job.ntime);
+ work->data[26] = le32dec(sctx->job.nbits);
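+		// word 28 gets the 0x80 sha256 padding bit: the 112-byte header ends at word 27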
+ work->data[28] = 0x80000000;
} else {
for (i = 0; i < 8; i++)
work->data[9 + i] = be32dec((uint32_t *)merkle_root + i);
@@ -1498,6 +1515,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
case ALGO_FRESH:
case ALGO_FUGUE256:
case ALGO_GROESTL:
+ case ALGO_LBRY:
case ALGO_LYRA2v2:
work_set_target(work, sctx->job.diff / (256.0 * opt_difficulty));
break;
@@ -1658,6 +1676,7 @@ static void *miner_thread(void *userdata)
// &work.data[19]
int wcmplen = (opt_algo == ALGO_DECRED) ? 140 : 76;
+ if (opt_algo == ALGO_LBRY) wcmplen = 108;
int wcmpoft = 0;
uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
@@ -1910,6 +1929,7 @@ static void *miner_thread(void *userdata)
minmax = 0x40000000U;
break;
case ALGO_KECCAK:
+ case ALGO_LBRY:
case ALGO_LUFFA:
case ALGO_SKEIN:
case ALGO_SKEIN2:
@@ -2035,6 +2055,9 @@ static void *miner_thread(void *userdata)
case ALGO_JACKPOT:
rc = scanhash_jackpot(thr_id, &work, max_nonce, &hashes_done);
break;
+ case ALGO_LBRY:
+ rc = scanhash_lbry(thr_id, &work, max_nonce, &hashes_done);
+ break;
case ALGO_LUFFA:
rc = scanhash_luffa(thr_id, &work, max_nonce, &hashes_done);
break;
@@ -2130,7 +2153,7 @@ static void *miner_thread(void *userdata)
// todo: update all algos to use work->nonces
work.nonces[0] = nonceptr[0];
- if (opt_algo != ALGO_DECRED && opt_algo != ALGO_BLAKE2S) {
+ if (opt_algo != ALGO_DECRED && opt_algo != ALGO_BLAKE2S && opt_algo != ALGO_LBRY) {
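+		// (lbry fills work.nonces[1] itself in scanhash_lbry, from the kernel's result pair)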
work.nonces[1] = nonceptr[2];
}
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index 4d7b794..b304366 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -115,7 +115,7 @@
80
true
true
- compute_50,sm_50
+ compute_61,sm_61;compute_52,sm_52
$(NVTOOLSEXT_PATH)\include;..\..\..\Common\C99
64
@@ -273,15 +273,11 @@
-
- true
-
-
- true
-
+
+
@@ -428,6 +424,10 @@
92
+    <CudaCompile Include="lbry\cuda_ripemd160.cu" />
+    <CudaCompile Include="lbry\cuda_sha256_lbry.cu" />
+    <CudaCompile Include="lbry\cuda_sha512_lbry.cu" />
+    <CudaCompile Include="lbry\lbry.cu" />
80
--ptxas-options="-dlcm=cg" %(AdditionalOptions)
diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
index 5decd0d..ad3842f 100644
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@@ -82,6 +82,9 @@
{1613763f-895c-4321-b58b-6f5849868956}
+    <Filter Include="Source Files\CUDA\lbry">
+      <UniqueIdentifier>{3079ea1f-f768-455a-acd6-f517fac535b4}</UniqueIdentifier>
+    </Filter>
@@ -150,9 +153,6 @@
Source Files\sph
-
- Source Files\sph
-
Source Files\sph
@@ -180,7 +180,10 @@
Source Files\sph
-
+
+ Source Files\sph
+
+
Source Files\sph
@@ -721,6 +724,18 @@
Source Files\CUDA\Algo256
+    <CudaCompile Include="lbry\lbry.cu">
+      <Filter>Source Files\CUDA\lbry</Filter>
+    </CudaCompile>
+    <CudaCompile Include="lbry\cuda_ripemd160.cu">
+      <Filter>Source Files\CUDA\lbry</Filter>
+    </CudaCompile>
+    <CudaCompile Include="lbry\cuda_sha256_lbry.cu">
+      <Filter>Source Files\CUDA\lbry</Filter>
+    </CudaCompile>
+    <CudaCompile Include="lbry\cuda_sha512_lbry.cu">
+      <Filter>Source Files\CUDA\lbry</Filter>
+    </CudaCompile>
@@ -737,4 +752,4 @@
Ressources
-</Project>
\ No newline at end of file
+</Project>
diff --git a/lbry/cuda_ripemd160.cu b/lbry/cuda_ripemd160.cu
new file mode 100644
index 0000000..1ae7f35
--- /dev/null
+++ b/lbry/cuda_ripemd160.cu
@@ -0,0 +1,441 @@
+/*
+ * ripemd-160 kernel implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2014, 2016 djm34, tpruvot
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ */
+#include <stdio.h>
+#include <stdint.h>
+#include <memory.h>
+
+#include <cuda_helper.h>
+
+static __constant__ uint32_t c_IV[5] = {
+ 0x67452301u, 0xEFCDAB89u, 0x98BADCFEu, 0x10325476u, 0xC3D2E1F0u
+};
+
+//__host__
+//uint64_t xornot64(uint64_t a, uint64_t b, uint64_t c) {
+// return c ^ (a | !b);
+//}
+
+__forceinline__ __device__
+uint64_t xornot64(uint64_t a, uint64_t b, uint64_t c)
+{
+ uint64_t result;
+ asm("{ .reg .u64 m,n; // xornot64\n\t"
+ "not.b64 m,%2; \n\t"
+ "or.b64 n, %1,m;\n\t"
+ "xor.b64 %0, n,%3;\n\t"
+ "}\n\t"
+ : "=l"(result) : "l"(a), "l"(b), "l"(c));
+ return result;
+}
+
+//__host__
+//uint64_t xornt64(uint64_t a, uint64_t b, uint64_t c) {
+// return a ^ (b | !c);
+//}
+
+__device__ __forceinline__
+uint64_t xornt64(uint64_t a, uint64_t b, uint64_t c)
+{
+ uint64_t result;
+ asm("{ .reg .u64 m,n; // xornt64\n\t"
+ "not.b64 m,%3; \n\t"
+ "or.b64 n, %2,m;\n\t"
+ "xor.b64 %0, %1,n;\n\t"
+ "}\n\t"
+ : "=l"(result) : "l"(a), "l"(b), "l"(c));
+ return result;
+}
+
+/*
+ * Round functions for RIPEMD-128 and RIPEMD-160.
+ */
+#if 1
+#define F1(x, y, z) ((x) ^ (y) ^ (z))
+#define F2(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
+#define F3(x, y, z) (((x) | ~(y)) ^ (z))
+#define F4(x, y, z) ((((x) ^ (y)) & (z)) ^ (y))
+#define F5(x, y, z) ((x) ^ ((y) | ~(z)))
+#else
+#define F1(x, y, z) xor3(x,y,z)
+#define F2(x, y, z) xandx(x,y,z)
+#define F3(x, y, z) xornot64(x,y,z)
+#define F4(x, y, z) xandx(z,x,y)
+#define F5(x, y, z) xornt64(x,y,z)
+#endif
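+
+// F2/F4 are bitwise-select (mux) functions and F3/F5 the or-not variants;
+// the PTX helpers above are an equivalent 64-bit formulation kept for reference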
+
+/*
+ * Round constants for RIPEMD-160.
+ */
+#define K11 0x00000000u
+#define K12 0x5A827999u
+#define K13 0x6ED9EBA1u
+#define K14 0x8F1BBCDCu
+#define K15 0xA953FD4Eu
+
+#define K21 0x50A28BE6u
+#define K22 0x5C4DD124u
+#define K23 0x6D703EF3u
+#define K24 0x7A6D76E9u
+#define K25 0x00000000u
+
+#define RR(a, b, c, d, e, f, s, r, k) { \
+ a = SPH_T32(ROTL32(SPH_T32(a + f(b, c, d) + r + k), s) + e); \
+ c = ROTL32(c, 10); \
+}
+
+#define ROUND1(a, b, c, d, e, f, s, r, k) \
+ RR(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k)
+
+#define ROUND2(a, b, c, d, e, f, s, r, k) \
+ RR(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
+
+#define RIPEMD160_ROUND_BODY(in, h) { \
+ uint32_t A1, B1, C1, D1, E1; \
+ uint32_t A2, B2, C2, D2, E2; \
+ uint32_t tmp; \
+\
+ A1 = A2 = h[0]; \
+ B1 = B2 = h[1]; \
+ C1 = C2 = h[2]; \
+ D1 = D2 = h[3]; \
+ E1 = E2 = h[4]; \
+\
+ ROUND1(A, B, C, D, E, F1, 11, in[ 0], 1); \
+ ROUND1(E, A, B, C, D, F1, 14, in[ 1], 1); \
+ ROUND1(D, E, A, B, C, F1, 15, in[ 2], 1); \
+ ROUND1(C, D, E, A, B, F1, 12, in[ 3], 1); \
+ ROUND1(B, C, D, E, A, F1, 5, in[ 4], 1); \
+ ROUND1(A, B, C, D, E, F1, 8, in[ 5], 1); \
+ ROUND1(E, A, B, C, D, F1, 7, in[ 6], 1); \
+ ROUND1(D, E, A, B, C, F1, 9, in[ 7], 1); \
+ ROUND1(C, D, E, A, B, F1, 11, in[ 8], 1); \
+ ROUND1(B, C, D, E, A, F1, 13, in[ 9], 1); \
+ ROUND1(A, B, C, D, E, F1, 14, in[10], 1); \
+ ROUND1(E, A, B, C, D, F1, 15, in[11], 1); \
+ ROUND1(D, E, A, B, C, F1, 6, in[12], 1); \
+ ROUND1(C, D, E, A, B, F1, 7, in[13], 1); \
+ ROUND1(B, C, D, E, A, F1, 9, in[14], 1); \
+ ROUND1(A, B, C, D, E, F1, 8, in[15], 1); \
+\
+ ROUND1(E, A, B, C, D, F2, 7, in[ 7], 2); \
+ ROUND1(D, E, A, B, C, F2, 6, in[ 4], 2); \
+ ROUND1(C, D, E, A, B, F2, 8, in[13], 2); \
+ ROUND1(B, C, D, E, A, F2, 13, in[ 1], 2); \
+ ROUND1(A, B, C, D, E, F2, 11, in[10], 2); \
+ ROUND1(E, A, B, C, D, F2, 9, in[ 6], 2); \
+ ROUND1(D, E, A, B, C, F2, 7, in[15], 2); \
+ ROUND1(C, D, E, A, B, F2, 15, in[ 3], 2); \
+ ROUND1(B, C, D, E, A, F2, 7, in[12], 2); \
+ ROUND1(A, B, C, D, E, F2, 12, in[ 0], 2); \
+ ROUND1(E, A, B, C, D, F2, 15, in[ 9], 2); \
+ ROUND1(D, E, A, B, C, F2, 9, in[ 5], 2); \
+ ROUND1(C, D, E, A, B, F2, 11, in[ 2], 2); \
+ ROUND1(B, C, D, E, A, F2, 7, in[14], 2); \
+ ROUND1(A, B, C, D, E, F2, 13, in[11], 2); \
+ ROUND1(E, A, B, C, D, F2, 12, in[ 8], 2); \
+\
+ ROUND1(D, E, A, B, C, F3, 11, in[ 3], 3); \
+ ROUND1(C, D, E, A, B, F3, 13, in[10], 3); \
+ ROUND1(B, C, D, E, A, F3, 6, in[14], 3); \
+ ROUND1(A, B, C, D, E, F3, 7, in[ 4], 3); \
+ ROUND1(E, A, B, C, D, F3, 14, in[ 9], 3); \
+ ROUND1(D, E, A, B, C, F3, 9, in[15], 3); \
+ ROUND1(C, D, E, A, B, F3, 13, in[ 8], 3); \
+ ROUND1(B, C, D, E, A, F3, 15, in[ 1], 3); \
+ ROUND1(A, B, C, D, E, F3, 14, in[ 2], 3); \
+ ROUND1(E, A, B, C, D, F3, 8, in[ 7], 3); \
+ ROUND1(D, E, A, B, C, F3, 13, in[ 0], 3); \
+ ROUND1(C, D, E, A, B, F3, 6, in[ 6], 3); \
+ ROUND1(B, C, D, E, A, F3, 5, in[13], 3); \
+ ROUND1(A, B, C, D, E, F3, 12, in[11], 3); \
+ ROUND1(E, A, B, C, D, F3, 7, in[ 5], 3); \
+ ROUND1(D, E, A, B, C, F3, 5, in[12], 3); \
+\
+ ROUND1(C, D, E, A, B, F4, 11, in[ 1], 4); \
+ ROUND1(B, C, D, E, A, F4, 12, in[ 9], 4); \
+ ROUND1(A, B, C, D, E, F4, 14, in[11], 4); \
+ ROUND1(E, A, B, C, D, F4, 15, in[10], 4); \
+ ROUND1(D, E, A, B, C, F4, 14, in[ 0], 4); \
+ ROUND1(C, D, E, A, B, F4, 15, in[ 8], 4); \
+ ROUND1(B, C, D, E, A, F4, 9, in[12], 4); \
+ ROUND1(A, B, C, D, E, F4, 8, in[ 4], 4); \
+ ROUND1(E, A, B, C, D, F4, 9, in[13], 4); \
+ ROUND1(D, E, A, B, C, F4, 14, in[ 3], 4); \
+ ROUND1(C, D, E, A, B, F4, 5, in[ 7], 4); \
+ ROUND1(B, C, D, E, A, F4, 6, in[15], 4); \
+ ROUND1(A, B, C, D, E, F4, 8, in[14], 4); \
+ ROUND1(E, A, B, C, D, F4, 6, in[ 5], 4); \
+ ROUND1(D, E, A, B, C, F4, 5, in[ 6], 4); \
+ ROUND1(C, D, E, A, B, F4, 12, in[ 2], 4); \
+\
+ ROUND1(B, C, D, E, A, F5, 9, in[ 4], 5); \
+ ROUND1(A, B, C, D, E, F5, 15, in[ 0], 5); \
+ ROUND1(E, A, B, C, D, F5, 5, in[ 5], 5); \
+ ROUND1(D, E, A, B, C, F5, 11, in[ 9], 5); \
+ ROUND1(C, D, E, A, B, F5, 6, in[ 7], 5); \
+ ROUND1(B, C, D, E, A, F5, 8, in[12], 5); \
+ ROUND1(A, B, C, D, E, F5, 13, in[ 2], 5); \
+ ROUND1(E, A, B, C, D, F5, 12, in[10], 5); \
+ ROUND1(D, E, A, B, C, F5, 5, in[14], 5); \
+ ROUND1(C, D, E, A, B, F5, 12, in[ 1], 5); \
+ ROUND1(B, C, D, E, A, F5, 13, in[ 3], 5); \
+ ROUND1(A, B, C, D, E, F5, 14, in[ 8], 5); \
+ ROUND1(E, A, B, C, D, F5, 11, in[11], 5); \
+ ROUND1(D, E, A, B, C, F5, 8, in[ 6], 5); \
+ ROUND1(C, D, E, A, B, F5, 5, in[15], 5); \
+ ROUND1(B, C, D, E, A, F5, 6, in[13], 5); \
+\
+ ROUND2(A, B, C, D, E, F5, 8, in[ 5], 1); \
+ ROUND2(E, A, B, C, D, F5, 9, in[14], 1); \
+ ROUND2(D, E, A, B, C, F5, 9, in[ 7], 1); \
+ ROUND2(C, D, E, A, B, F5, 11, in[ 0], 1); \
+ ROUND2(B, C, D, E, A, F5, 13, in[ 9], 1); \
+ ROUND2(A, B, C, D, E, F5, 15, in[ 2], 1); \
+ ROUND2(E, A, B, C, D, F5, 15, in[11], 1); \
+ ROUND2(D, E, A, B, C, F5, 5, in[ 4], 1); \
+ ROUND2(C, D, E, A, B, F5, 7, in[13], 1); \
+ ROUND2(B, C, D, E, A, F5, 7, in[ 6], 1); \
+ ROUND2(A, B, C, D, E, F5, 8, in[15], 1); \
+ ROUND2(E, A, B, C, D, F5, 11, in[ 8], 1); \
+ ROUND2(D, E, A, B, C, F5, 14, in[ 1], 1); \
+ ROUND2(C, D, E, A, B, F5, 14, in[10], 1); \
+ ROUND2(B, C, D, E, A, F5, 12, in[ 3], 1); \
+ ROUND2(A, B, C, D, E, F5, 6, in[12], 1); \
+\
+ ROUND2(E, A, B, C, D, F4, 9, in[ 6], 2); \
+ ROUND2(D, E, A, B, C, F4, 13, in[11], 2); \
+ ROUND2(C, D, E, A, B, F4, 15, in[ 3], 2); \
+ ROUND2(B, C, D, E, A, F4, 7, in[ 7], 2); \
+ ROUND2(A, B, C, D, E, F4, 12, in[ 0], 2); \
+ ROUND2(E, A, B, C, D, F4, 8, in[13], 2); \
+ ROUND2(D, E, A, B, C, F4, 9, in[ 5], 2); \
+ ROUND2(C, D, E, A, B, F4, 11, in[10], 2); \
+ ROUND2(B, C, D, E, A, F4, 7, in[14], 2); \
+ ROUND2(A, B, C, D, E, F4, 7, in[15], 2); \
+ ROUND2(E, A, B, C, D, F4, 12, in[ 8], 2); \
+ ROUND2(D, E, A, B, C, F4, 7, in[12], 2); \
+ ROUND2(C, D, E, A, B, F4, 6, in[ 4], 2); \
+ ROUND2(B, C, D, E, A, F4, 15, in[ 9], 2); \
+ ROUND2(A, B, C, D, E, F4, 13, in[ 1], 2); \
+ ROUND2(E, A, B, C, D, F4, 11, in[ 2], 2); \
+\
+ ROUND2(D, E, A, B, C, F3, 9, in[15], 3); \
+ ROUND2(C, D, E, A, B, F3, 7, in[ 5], 3); \
+ ROUND2(B, C, D, E, A, F3, 15, in[ 1], 3); \
+ ROUND2(A, B, C, D, E, F3, 11, in[ 3], 3); \
+ ROUND2(E, A, B, C, D, F3, 8, in[ 7], 3); \
+ ROUND2(D, E, A, B, C, F3, 6, in[14], 3); \
+ ROUND2(C, D, E, A, B, F3, 6, in[ 6], 3); \
+ ROUND2(B, C, D, E, A, F3, 14, in[ 9], 3); \
+ ROUND2(A, B, C, D, E, F3, 12, in[11], 3); \
+ ROUND2(E, A, B, C, D, F3, 13, in[ 8], 3); \
+ ROUND2(D, E, A, B, C, F3, 5, in[12], 3); \
+ ROUND2(C, D, E, A, B, F3, 14, in[ 2], 3); \
+ ROUND2(B, C, D, E, A, F3, 13, in[10], 3); \
+ ROUND2(A, B, C, D, E, F3, 13, in[ 0], 3); \
+ ROUND2(E, A, B, C, D, F3, 7, in[ 4], 3); \
+ ROUND2(D, E, A, B, C, F3, 5, in[13], 3); \
+\
+ ROUND2(C, D, E, A, B, F2, 15, in[ 8], 4); \
+ ROUND2(B, C, D, E, A, F2, 5, in[ 6], 4); \
+ ROUND2(A, B, C, D, E, F2, 8, in[ 4], 4); \
+ ROUND2(E, A, B, C, D, F2, 11, in[ 1], 4); \
+ ROUND2(D, E, A, B, C, F2, 14, in[ 3], 4); \
+ ROUND2(C, D, E, A, B, F2, 14, in[11], 4); \
+ ROUND2(B, C, D, E, A, F2, 6, in[15], 4); \
+ ROUND2(A, B, C, D, E, F2, 14, in[ 0], 4); \
+ ROUND2(E, A, B, C, D, F2, 6, in[ 5], 4); \
+ ROUND2(D, E, A, B, C, F2, 9, in[12], 4); \
+ ROUND2(C, D, E, A, B, F2, 12, in[ 2], 4); \
+ ROUND2(B, C, D, E, A, F2, 9, in[13], 4); \
+ ROUND2(A, B, C, D, E, F2, 12, in[ 9], 4); \
+ ROUND2(E, A, B, C, D, F2, 5, in[ 7], 4); \
+ ROUND2(D, E, A, B, C, F2, 15, in[10], 4); \
+ ROUND2(C, D, E, A, B, F2, 8, in[14], 4); \
+\
+ ROUND2(B, C, D, E, A, F1, 8, in[12], 5); \
+ ROUND2(A, B, C, D, E, F1, 5, in[15], 5); \
+ ROUND2(E, A, B, C, D, F1, 12, in[10], 5); \
+ ROUND2(D, E, A, B, C, F1, 9, in[ 4], 5); \
+ ROUND2(C, D, E, A, B, F1, 12, in[ 1], 5); \
+ ROUND2(B, C, D, E, A, F1, 5, in[ 5], 5); \
+ ROUND2(A, B, C, D, E, F1, 14, in[ 8], 5); \
+ ROUND2(E, A, B, C, D, F1, 6, in[ 7], 5); \
+ ROUND2(D, E, A, B, C, F1, 8, in[ 6], 5); \
+ ROUND2(C, D, E, A, B, F1, 13, in[ 2], 5); \
+ ROUND2(B, C, D, E, A, F1, 6, in[13], 5); \
+ ROUND2(A, B, C, D, E, F1, 5, in[14], 5); \
+ ROUND2(E, A, B, C, D, F1, 15, in[ 0], 5); \
+ ROUND2(D, E, A, B, C, F1, 13, in[ 3], 5); \
+ ROUND2(C, D, E, A, B, F1, 11, in[ 9], 5); \
+ ROUND2(B, C, D, E, A, F1, 11, in[11], 5); \
+\
+ tmp = (h[1] + C1 + D2); \
+ h[1] = (h[2] + D1 + E2); \
+ h[2] = (h[3] + E1 + A2); \
+ h[3] = (h[4] + A1 + B2); \
+ h[4] = (h[0] + B1 + C2); \
+ h[0] = tmp; \
+}
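+
+// the combine above is the RIPEMD-160 feed-forward: both parallel lines fold
+// back into the chaining value with the state words rotated by one position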
+
+#if 0
+__global__
+void lbry_ripemd160_gpu_hash_32(const uint32_t threads, uint64_t *g_hash, const uint32_t byteOffset)
+{
+ const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ if (thread < threads)
+ {
+ uint32_t *hash = (uint32_t*) (&g_hash[thread * 8U + byteOffset/8]);
+
+ uint32_t in[16];
+ for (int i=0; i<8; i++)
+ in[i] = (hash[i]);
+ in[8] = 0x80;
+
+ #pragma unroll
+ for (int i=9;i<16;i++) in[i] = 0;
+
+ in[14] = 0x100; // size in bits
+
+ uint32_t h[5];
+ #pragma unroll
+ for (int i=0; i<5; i++)
+ h[i] = c_IV[i];
+
+ RIPEMD160_ROUND_BODY(in, h);
+
+ #pragma unroll
+ for (int i=0; i<5; i++)
+ hash[i] = h[i];
+
+#ifdef PAD_ZEROS
+ // 20 bytes hash on 32 or 64 bytes output space
+ hash[5] = 0;
+ hash[6] = 0;
+ hash[7] = 0;
+#endif
+ }
+}
+
+__host__
+void lbry_ripemd160_hash_32(int thr_id, uint32_t threads, uint32_t *g_Hash, uint32_t byteOffset, cudaStream_t stream)
+{
+ const uint32_t threadsperblock = 128;
+
+ dim3 grid(threads/threadsperblock);
+ dim3 block(threadsperblock);
+
+ lbry_ripemd160_gpu_hash_32 <<<grid, block, 0, stream>>> (threads, (uint64_t*) g_Hash, byteOffset);
+}
+#endif
+
+__global__
+//__launch_bounds__(256,6)
+void lbry_ripemd160_gpu_hash_32x2(const uint32_t threads, uint64_t *g_hash)
+{
+ const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ if (thread < threads)
+ {
+ uint32_t *hash = (uint32_t*) (&g_hash[thread * 8U]);
+
+ uint32_t in[16];
+ for (int i=0; i<8; i++)
+ in[i] = (hash[i]);
+ in[8] = 0x80;
+
+ #pragma unroll
+ for (int i=9;i<16;i++) in[i] = 0;
+
+ in[14] = 0x100; // size in bits
+
+ uint32_t h[5];
+ #pragma unroll
+ for (int i=0; i<5; i++)
+ h[i] = c_IV[i];
+
+ RIPEMD160_ROUND_BODY(in, h);
+
+ #pragma unroll
+ for (int i=0; i<5; i++)
+ hash[i] = h[i];
+
+#ifdef PAD_ZEROS
+ // 20 bytes hash on 32 output space
+ hash[5] = 0;
+ hash[6] = 0;
+ hash[7] = 0;
+#endif
+ // second 32 bytes block hash
+ hash += 8;
+
+ #pragma unroll
+ for (int i=0; i<8; i++)
+ in[i] = (hash[i]);
+ in[8] = 0x80;
+
+ #pragma unroll
+ for (int i=9;i<16;i++) in[i] = 0;
+
+ in[14] = 0x100; // size in bits
+
+ #pragma unroll
+ for (int i=0; i<5; i++)
+ h[i] = c_IV[i];
+
+ RIPEMD160_ROUND_BODY(in, h);
+
+ #pragma unroll
+ for (int i=0; i<5; i++)
+ hash[i] = h[i];
+
+#ifdef PAD_ZEROS
+ // 20 bytes hash on 32 output space
+ hash[5] = 0;
+ hash[6] = 0;
+ hash[7] = 0;
+#endif
+ }
+}
+
+__host__
+void lbry_ripemd160_hash_32x2(int thr_id, uint32_t threads, uint32_t *g_Hash, cudaStream_t stream)
+{
+ const uint32_t threadsperblock = 128;
+
+ dim3 grid(threads/threadsperblock);
+ dim3 block(threadsperblock);
+
+ lbry_ripemd160_gpu_hash_32x2 <<<grid, block, 0, stream>>> (threads, (uint64_t*) g_Hash);
+}
+
+void lbry_ripemd160_init(int thr_id)
+{
+ //cudaMemcpyToSymbol(c_IV, IV, sizeof(IV), 0, cudaMemcpyHostToDevice);
+}
diff --git a/lbry/cuda_sha256_lbry.cu b/lbry/cuda_sha256_lbry.cu
new file mode 100644
index 0000000..db0c41c
--- /dev/null
+++ b/lbry/cuda_sha256_lbry.cu
@@ -0,0 +1,712 @@
+/*
+ * sha256 CUDA implementation.
+ */
+#include <stdio.h>
+#include <stdint.h>
+#include <memory.h>
+
+#include <cuda_helper.h>
+#include <miner.h>
+
+__constant__ static uint32_t __align__(8) c_midstate112[8];
+__constant__ static uint32_t __align__(8) c_dataEnd112[12];
+
+const __constant__ uint32_t __align__(8) c_H256[8] = {
+ 0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU,
+ 0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U
+};
+__constant__ static uint32_t __align__(8) c_K[64];
+
+static __thread uint32_t* d_resNonces;
+__constant__ static uint32_t __align__(8) c_target[2];
+__device__ uint64_t d_target[1];
+
+// ------------------------------------------------------------------------------------------------
+
+static const uint32_t cpu_H256[8] = {
+ 0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU,
+ 0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U
+};
+
+static const uint32_t cpu_K[64] = {
+ 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
+ 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
+ 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
+ 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
+ 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
+ 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
+ 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
+ 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
+};
+
+#define ROTR ROTR32
+
+__host__
+static void sha256_step1_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d,
+ uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
+ uint32_t in, const uint32_t Kshared)
+{
+ uint32_t t1,t2;
+ uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
+ uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
+ uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a);
+ uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c);
+
+ t1 = h + bsg21 + vxandx + Kshared + in;
+ t2 = bsg20 + andorv;
+ d = d + t1;
+ h = t1 + t2;
+}
+
+__host__
+static void sha256_step2_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d,
+ uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
+ uint32_t* in, uint32_t pc, const uint32_t Kshared)
+{
+ uint32_t t1,t2;
+
+ int pcidx1 = (pc-2) & 0xF;
+ int pcidx2 = (pc-7) & 0xF;
+ int pcidx3 = (pc-15) & 0xF;
+
+ uint32_t inx0 = in[pc];
+ uint32_t inx1 = in[pcidx1];
+ uint32_t inx2 = in[pcidx2];
+ uint32_t inx3 = in[pcidx3];
+
+ uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1);
+ uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3);
+ uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
+ uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
+ uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a);
+ uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c);
+
+ in[pc] = ssg21 + inx2 + ssg20 + inx0;
+
+ t1 = h + bsg21 + vxandx + Kshared + in[pc];
+ t2 = bsg20 + andorv;
+ d = d + t1;
+ h = t1 + t2;
+}
+
+__host__
+static void sha256_round_body_host(uint32_t* in, uint32_t* state, const uint32_t* Kshared)
+{
+ uint32_t a = state[0];
+ uint32_t b = state[1];
+ uint32_t c = state[2];
+ uint32_t d = state[3];
+ uint32_t e = state[4];
+ uint32_t f = state[5];
+ uint32_t g = state[6];
+ uint32_t h = state[7];
+
+ sha256_step1_host(a,b,c,d,e,f,g,h,in[0], Kshared[0]);
+ sha256_step1_host(h,a,b,c,d,e,f,g,in[1], Kshared[1]);
+ sha256_step1_host(g,h,a,b,c,d,e,f,in[2], Kshared[2]);
+ sha256_step1_host(f,g,h,a,b,c,d,e,in[3], Kshared[3]);
+ sha256_step1_host(e,f,g,h,a,b,c,d,in[4], Kshared[4]);
+ sha256_step1_host(d,e,f,g,h,a,b,c,in[5], Kshared[5]);
+ sha256_step1_host(c,d,e,f,g,h,a,b,in[6], Kshared[6]);
+ sha256_step1_host(b,c,d,e,f,g,h,a,in[7], Kshared[7]);
+ sha256_step1_host(a,b,c,d,e,f,g,h,in[8], Kshared[8]);
+ sha256_step1_host(h,a,b,c,d,e,f,g,in[9], Kshared[9]);
+ sha256_step1_host(g,h,a,b,c,d,e,f,in[10],Kshared[10]);
+ sha256_step1_host(f,g,h,a,b,c,d,e,in[11],Kshared[11]);
+ sha256_step1_host(e,f,g,h,a,b,c,d,in[12],Kshared[12]);
+ sha256_step1_host(d,e,f,g,h,a,b,c,in[13],Kshared[13]);
+ sha256_step1_host(c,d,e,f,g,h,a,b,in[14],Kshared[14]);
+ sha256_step1_host(b,c,d,e,f,g,h,a,in[15],Kshared[15]);
+
+ for (int i=0; i<3; i++)
+ {
+ sha256_step2_host(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]);
+ sha256_step2_host(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]);
+ sha256_step2_host(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]);
+ sha256_step2_host(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]);
+ sha256_step2_host(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]);
+ sha256_step2_host(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]);
+ sha256_step2_host(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]);
+ sha256_step2_host(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]);
+ sha256_step2_host(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]);
+ sha256_step2_host(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]);
+ sha256_step2_host(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]);
+ sha256_step2_host(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]);
+ sha256_step2_host(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]);
+ sha256_step2_host(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]);
+ sha256_step2_host(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]);
+ sha256_step2_host(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]);
+ }
+
+ state[0] += a;
+ state[1] += b;
+ state[2] += c;
+ state[3] += d;
+ state[4] += e;
+ state[5] += f;
+ state[6] += g;
+ state[7] += h;
+}
+
+__device__ __forceinline__
+uint32_t xor3b(const uint32_t a, const uint32_t b, const uint32_t c) {
+ uint32_t result;
+#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050
+ asm ("lop3.b32 %0, %1, %2, %3, 0x96; // xor3b" //0x96 = 0xF0 ^ 0xCC ^ 0xAA
+ : "=r"(result) : "r"(a), "r"(b),"r"(c));
+#else
+ result = a^b^c;
+#endif
+ return result;
+}
+
+/*
+__device__ __forceinline__
+uint32_t xor3b(const uint32_t a, const uint32_t b, const uint32_t c) {
+ uint32_t result;
+ asm("{ .reg .u32 t1; // xor3b \n\t"
+ "xor.b32 t1, %2, %3;\n\t"
+ "xor.b32 %0, %1, t1;"
+ "}"
+ : "=r"(result) : "r"(a) ,"r"(b),"r"(c));
+ return result;
+}
+#define xor3b(a,b,c) (a ^ b ^ c)
+*/
+
+__device__ __forceinline__ uint32_t bsg2_0(const uint32_t x)
+{
+ uint32_t r1 = ROTR32(x,2);
+ uint32_t r2 = ROTR32(x,13);
+ uint32_t r3 = ROTR32(x,22);
+ return xor3b(r1,r2,r3);
+}
+
+__device__ __forceinline__ uint32_t bsg2_1(const uint32_t x)
+{
+ uint32_t r1 = ROTR32(x,6);
+ uint32_t r2 = ROTR32(x,11);
+ uint32_t r3 = ROTR32(x,25);
+ return xor3b(r1,r2,r3);
+}
+
+__device__ __forceinline__ uint32_t ssg2_0(const uint32_t x)
+{
+ uint32_t r1 = ROTR32(x,7);
+ uint32_t r2 = ROTR32(x,18);
+ uint32_t r3 = shr_t32(x,3);
+ return xor3b(r1,r2,r3);
+}
+
+__device__ __forceinline__ uint32_t ssg2_1(const uint32_t x)
+{
+ uint32_t r1 = ROTR32(x,17);
+ uint32_t r2 = ROTR32(x,19);
+ uint32_t r3 = shr_t32(x,10);
+ return xor3b(r1,r2,r3);
+}
+
+__device__ __forceinline__ uint32_t andor32(const uint32_t a, const uint32_t b, const uint32_t c)
+{
+ uint32_t result;
+ asm("{\n\t"
+ ".reg .u32 m,n,o;\n\t"
+ "and.b32 m, %1, %2;\n\t"
+ " or.b32 n, %1, %2;\n\t"
+ "and.b32 o, n, %3;\n\t"
+ " or.b32 %0, m, o ;\n\t"
+ "}\n\t" : "=r"(result) : "r"(a), "r"(b), "r"(c)
+ );
+ return result;
+}
+
+__device__
+static void sha2_step1(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
+ uint32_t in, const uint32_t Kshared)
+{
+ uint32_t t1,t2;
+ uint32_t vxandx = xandx(e, f, g);
+ uint32_t bsg21 = bsg2_1(e);
+ uint32_t bsg20 = bsg2_0(a);
+ uint32_t andorv = andor32(a,b,c);
+
+ t1 = h + bsg21 + vxandx + Kshared + in;
+ t2 = bsg20 + andorv;
+ d = d + t1;
+ h = t1 + t2;
+}
+
+__device__
+static void sha2_step2(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
+ uint32_t* in, uint32_t pc, const uint32_t Kshared)
+{
+ uint32_t t1,t2;
+
+ int pcidx1 = (pc-2) & 0xF;
+ int pcidx2 = (pc-7) & 0xF;
+ int pcidx3 = (pc-15) & 0xF;
+
+ uint32_t inx0 = in[pc];
+ uint32_t inx1 = in[pcidx1];
+ uint32_t inx2 = in[pcidx2];
+ uint32_t inx3 = in[pcidx3];
+
+ uint32_t ssg21 = ssg2_1(inx1);
+ uint32_t ssg20 = ssg2_0(inx3);
+ uint32_t vxandx = xandx(e, f, g);
+ uint32_t bsg21 = bsg2_1(e);
+ uint32_t bsg20 = bsg2_0(a);
+ uint32_t andorv = andor32(a,b,c);
+
+ in[pc] = ssg21 + inx2 + ssg20 + inx0;
+
+ t1 = h + bsg21 + vxandx + Kshared + in[pc];
+ t2 = bsg20 + andorv;
+ d = d + t1;
+ h = t1 + t2;
+}
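+
+// the message schedule is recomputed in place: for t >= 16,
+// W[t] = ssg2_1(W[t-2]) + W[t-7] + ssg2_0(W[t-15]) + W[t-16], kept in a rolling
+// 16-word window indexed mod 16 (hence the (pc-2/-7/-15) & 0xF above)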
+
+__device__
+static void sha256_round_body(uint32_t* in, uint32_t* state, uint32_t* const Kshared)
+{
+ uint32_t a = state[0];
+ uint32_t b = state[1];
+ uint32_t c = state[2];
+ uint32_t d = state[3];
+ uint32_t e = state[4];
+ uint32_t f = state[5];
+ uint32_t g = state[6];
+ uint32_t h = state[7];
+
+ sha2_step1(a,b,c,d,e,f,g,h,in[0], Kshared[0]);
+ sha2_step1(h,a,b,c,d,e,f,g,in[1], Kshared[1]);
+ sha2_step1(g,h,a,b,c,d,e,f,in[2], Kshared[2]);
+ sha2_step1(f,g,h,a,b,c,d,e,in[3], Kshared[3]);
+ sha2_step1(e,f,g,h,a,b,c,d,in[4], Kshared[4]);
+ sha2_step1(d,e,f,g,h,a,b,c,in[5], Kshared[5]);
+ sha2_step1(c,d,e,f,g,h,a,b,in[6], Kshared[6]);
+ sha2_step1(b,c,d,e,f,g,h,a,in[7], Kshared[7]);
+ sha2_step1(a,b,c,d,e,f,g,h,in[8], Kshared[8]);
+ sha2_step1(h,a,b,c,d,e,f,g,in[9], Kshared[9]);
+ sha2_step1(g,h,a,b,c,d,e,f,in[10],Kshared[10]);
+ sha2_step1(f,g,h,a,b,c,d,e,in[11],Kshared[11]);
+ sha2_step1(e,f,g,h,a,b,c,d,in[12],Kshared[12]);
+ sha2_step1(d,e,f,g,h,a,b,c,in[13],Kshared[13]);
+ sha2_step1(c,d,e,f,g,h,a,b,in[14],Kshared[14]);
+ sha2_step1(b,c,d,e,f,g,h,a,in[15],Kshared[15]);
+
+ #pragma unroll
+ for (int i=0; i<3; i++)
+ {
+ sha2_step2(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]);
+ sha2_step2(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]);
+ sha2_step2(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]);
+ sha2_step2(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]);
+ sha2_step2(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]);
+ sha2_step2(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]);
+ sha2_step2(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]);
+ sha2_step2(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]);
+ sha2_step2(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]);
+ sha2_step2(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]);
+ sha2_step2(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]);
+ sha2_step2(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]);
+ sha2_step2(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]);
+ sha2_step2(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]);
+ sha2_step2(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]);
+ sha2_step2(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]);
+ }
+
+ state[0] += a;
+ state[1] += b;
+ state[2] += c;
+ state[3] += d;
+ state[4] += e;
+ state[5] += f;
+ state[6] += g;
+ state[7] += h;
+}
+
+__device__
+uint64_t cuda_swab32ll(uint64_t x) {
+ return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x)));
+}
+
+__global__
+/*__launch_bounds__(256,3)*/
+void lbry_sha256_gpu_hash_112(const uint32_t threads, const uint32_t startNonce, const bool swabNonce, uint64_t *outputHash)
+{
+ const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ if (thread < threads)
+ {
+ const uint32_t nonce = startNonce + thread;
+
+ uint32_t dat[16];
+ #pragma unroll
+ for (int i=0;i<11;i++) dat[i] = c_dataEnd112[i]; // pre "swabed"
+ dat[11] = swabNonce ? cuda_swab32(nonce) : nonce;
+ dat[12] = 0x80000000;
+ dat[13] = 0;
+ dat[14] = 0;
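+		// padded message length: 0x380 = 896 bits = 112 bytes (the full lbry header)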
+ dat[15] = 0x380;
+
+ uint32_t __align__(8) buf[8];
+ #pragma unroll
+ for (int i=0;i<8;i++) buf[i] = c_midstate112[i];
+
+ sha256_round_body(dat, buf, c_K);
+
+ // output
+ uint2* output = (uint2*) (&outputHash[thread * 8U]);
+ #pragma unroll
+ for (int i=0;i<4;i++) {
+ //output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
+ output[i] = vectorize(((uint64_t*)buf)[i]); // out without swap, new sha256 after
+ }
+ }
+}
+
+__global__
+/*__launch_bounds__(256,3)*/
+void lbry_sha256_gpu_hash_32(uint32_t threads, uint64_t *Hash512)
+{
+ const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ if (thread < threads)
+ {
+ uint32_t __align__(8) buf[8]; // align for vectorize
+ #pragma unroll
+ for (int i=0; i<8; i++) buf[i] = c_H256[i];
+
+ uint32_t* input = (uint32_t*) (&Hash512[thread * 8U]);
+
+ uint32_t dat[16];
+ #pragma unroll
+ //for (int i=0;i<8;i++) dat[i] = cuda_swab32(input[i]);
+ for (int i=0; i<8; i++) dat[i] = input[i];
+ dat[8] = 0x80000000;
+ #pragma unroll
+ for (int i=9; i<15; i++) dat[i] = 0;
+ dat[15] = 0x100;
+
+ sha256_round_body(dat, buf, c_K);
+
+ // output
+ uint2* output = (uint2*) input;
+ #pragma unroll
+ for (int i=0;i<4;i++) {
+ output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
+ }
+#ifdef PAD_ZEROS
+ #pragma unroll
+ for (int i=4; i<8; i++) output[i] = vectorize(0);
+#endif
+ }
+}
+
+__global__
+/*__launch_bounds__(256,3)*/
+void lbry_sha256d_gpu_hash_112(const uint32_t threads, const uint32_t startNonce, const bool swabNonce, uint64_t *outputHash)
+{
+ const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ extern __shared__ uint32_t s_K[];
+ //s_K[thread & 63] = c_K[thread & 63];
+	if (threadIdx.x < 64U) s_K[threadIdx.x] = c_K[threadIdx.x];
+	__syncthreads(); // make s_K visible to the whole block before use
+ if (thread < threads)
+ {
+ const uint32_t nonce = startNonce + thread;
+
+ uint32_t dat[16];
+ #pragma unroll
+ for (int i=0; i<11; i++) dat[i] = c_dataEnd112[i];
+ dat[11] = swabNonce ? cuda_swab32(nonce) : nonce;
+ dat[12] = 0x80000000;
+ dat[13] = 0;
+ dat[14] = 0;
+ dat[15] = 0x380;
+
+ uint32_t __align__(8) buf[8];
+ #pragma unroll
+ for (int i=0;i<8;i++) buf[i] = c_midstate112[i];
+
+ sha256_round_body(dat, buf, s_K);
+
+ // second sha256
+
+ #pragma unroll
+ for (int i=0; i<8; i++) dat[i] = buf[i];
+ dat[8] = 0x80000000;
+ #pragma unroll
+ for (int i=9; i<15; i++) dat[i] = 0;
+ dat[15] = 0x100;
+
+ #pragma unroll
+ for (int i=0; i<8; i++) buf[i] = c_H256[i];
+
+ sha256_round_body(dat, buf, s_K);
+
+ // output
+ uint2* output = (uint2*) (&outputHash[thread * 8U]);
+ #pragma unroll
+ for (int i=0;i<4;i++) {
+ output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
+ //output[i] = vectorize(((uint64_t*)buf)[i]);
+ }
+ }
+}
+
+__global__
+/*__launch_bounds__(256,3)*/
+void lbry_sha256_gpu_hash_20x2(uint32_t threads, uint64_t *Hash512)
+{
+ const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ if (thread < threads)
+ {
+ uint32_t __align__(8) buf[8]; // align for vectorize
+ #pragma unroll
+ for (int i=0;i<8;i++) buf[i] = c_H256[i];
+
+ uint32_t* input = (uint32_t*) (&Hash512[thread * 8U]);
+
+ uint32_t dat[16];
+ #pragma unroll
+ for (int i=0;i<5;i++) dat[i] = cuda_swab32(input[i]);
+ #pragma unroll
+ for (int i=0;i<5;i++) dat[i+5] = cuda_swab32(input[i+8]);
+ dat[10] = 0x80000000;
+ #pragma unroll
+ for (int i=11;i<15;i++) dat[i] = 0;
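+		// padded message length: 0x140 = 320 bits = two 20-byte ripemd160 digests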
+ dat[15] = 0x140;
+
+ sha256_round_body(dat, buf, c_K);
+
+ // output
+ uint2* output = (uint2*) input;
+ #pragma unroll
+ for (int i=0;i<4;i++) {
+ //output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
+ output[i] = vectorize(((uint64_t*)buf)[i]);
+ }
+#ifdef PAD_ZEROS
+ #pragma unroll
+ for (int i=4; i<8; i++) output[i] = vectorize(0);
+#endif
+ }
+}
+
+__global__
+/*__launch_bounds__(256,3)*/
+void lbry_sha256d_gpu_hash_20x2(uint32_t threads, uint64_t *Hash512)
+{
+ const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ extern __shared__ uint32_t s_K[];
+	if (threadIdx.x < 64U) s_K[threadIdx.x] = c_K[threadIdx.x];
+	__syncthreads(); // make s_K visible to the whole block before use
+ if (thread < threads)
+ {
+ uint32_t __align__(8) buf[8]; // align for vectorize
+ #pragma unroll
+ for (int i=0;i<8;i++) buf[i] = c_H256[i];
+
+ uint32_t* input = (uint32_t*) (&Hash512[thread * 8U]);
+
+ uint32_t dat[16];
+ #pragma unroll
+ for (int i=0; i<5; i++) dat[i] = cuda_swab32(input[i]);
+ #pragma unroll
+ for (int i=0; i<5; i++) dat[i+5] = cuda_swab32(input[i+8]);
+ dat[10] = 0x80000000;
+ #pragma unroll
+ for (int i=11;i<15;i++) dat[i] = 0;
+ dat[15] = 0x140;
+
+ sha256_round_body(dat, buf, s_K);
+
+ // second sha256
+
+ #pragma unroll
+ for (int i=0; i<8; i++) dat[i] = buf[i];
+ dat[8] = 0x80000000;
+ #pragma unroll
+ for (int i=9; i<15; i++) dat[i] = 0;
+ dat[15] = 0x100;
+
+ #pragma unroll
+ for (int i=0;i<8;i++) buf[i] = c_H256[i];
+
+ sha256_round_body(dat, buf, s_K);
+
+ // output
+ uint2* output = (uint2*) input;
+
+#ifdef FULL_HASH
+ #pragma unroll
+ for (int i=0;i<4;i++) {
+ output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
+ //output[i] = vectorize(((uint64_t*)buf)[i]);
+ }
+# ifdef PAD_ZEROS
+ #pragma unroll
+ for (int i=4; i<8; i++) output[i] = vectorize(0);
+# endif
+
+#else
+ //input[6] = cuda_swab32(buf[6]);
+ //input[7] = cuda_swab32(buf[7]);
+ output[3] = vectorize(cuda_swab32ll(((uint64_t*)buf)[3]));
+#endif
+ }
+}
+
+__host__
+void lbry_sha256_init(int thr_id)
+{
+ //cudaMemcpyToSymbol(c_H256, cpu_H256, sizeof(cpu_H256), 0, cudaMemcpyHostToDevice);
+ cudaMemcpyToSymbol(c_K, cpu_K, sizeof(cpu_K), 0, cudaMemcpyHostToDevice);
+ CUDA_SAFE_CALL(cudaMalloc(&d_resNonces, 4*sizeof(uint32_t)));
+}
+
+__host__
+void lbry_sha256_free(int thr_id)
+{
+ cudaFree(d_resNonces);
+}
+
+__host__
+void lbry_sha256_setBlock_112(uint32_t *pdata, uint32_t *ptarget)
+{
+ uint32_t in[16], buf[8], end[11];
+ for (int i=0;i<16;i++) in[i] = cuda_swab32(pdata[i]);
+ for (int i=0; i<8;i++) buf[i] = cpu_H256[i];
+ for (int i=0;i<11;i++) end[i] = cuda_swab32(pdata[16+i]);
+ sha256_round_body_host(in, buf, cpu_K);
+
+ CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_midstate112, buf, 32, 0, cudaMemcpyHostToDevice));
+ CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_dataEnd112, end, sizeof(end), 0, cudaMemcpyHostToDevice));
+ CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_target, &ptarget[6], sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
+ CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &ptarget[6], sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
+}
+
+__host__
+void lbry_sha256_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash, bool swabNonce, cudaStream_t stream)
+{
+ const int threadsperblock = 256;
+
+ dim3 grid(threads/threadsperblock);
+ dim3 block(threadsperblock);
+
+ lbry_sha256_gpu_hash_112 <<<grid, block, 0, stream>>> (threads, startNonce, swabNonce, (uint64_t*) d_outputHash);
+ cudaGetLastError();
+}
+
+__host__
+void lbry_sha256_hash_32(int thr_id, uint32_t threads, uint32_t *d_Hash, cudaStream_t stream)
+{
+ const int threadsperblock = 256;
+
+ dim3 grid(threads/threadsperblock);
+ dim3 block(threadsperblock);
+
+ lbry_sha256_gpu_hash_32 <<<grid, block, 0, stream>>> (threads, (uint64_t*) d_Hash);
+}
+
+__host__
+void lbry_sha256d_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash, bool swabNonce, cudaStream_t stream)
+{
+ const int threadsperblock = 256;
+
+ dim3 grid(threads/threadsperblock);
+ dim3 block(threadsperblock);
+
+ lbry_sha256d_gpu_hash_112 <<<grid, block, 64*4, stream>>> (threads, startNonce, swabNonce, (uint64_t*) d_outputHash); // 64 words of dynamic shared mem for s_K
+}
+
+__host__
+void lbry_sha256_hash_20x2(int thr_id, uint32_t threads, uint32_t *d_Hash, cudaStream_t stream)
+{
+ const int threadsperblock = 256;
+
+ dim3 grid(threads/threadsperblock);
+ dim3 block(threadsperblock);
+
+ lbry_sha256_gpu_hash_20x2 <<<grid, block, 0, stream>>> (threads, (uint64_t*) d_Hash);
+}
+
+__host__
+void lbry_sha256d_hash_20x2(int thr_id, uint32_t threads, uint32_t *d_Hash, cudaStream_t stream)
+{
+ const int threadsperblock = 256;
+
+ dim3 grid(threads/threadsperblock);
+ dim3 block(threadsperblock);
+
+ lbry_sha256d_gpu_hash_20x2 <<<grid, block, 64*4, stream>>> (threads, (uint64_t*) d_Hash); // 64 words of dynamic shared mem for s_K
+}
+
+__global__
+__launch_bounds__(256,3)
+void lbry_sha256d_gpu_hash_final(const uint32_t threads, const uint32_t startNonce, uint64_t *Hash512, uint32_t *resNonces)
+{
+ const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ if (thread < threads)
+ {
+ uint32_t __align__(8) buf[8]; // align for vectorize
+ #pragma unroll
+ for (int i=0;i<8;i++) buf[i] = c_H256[i];
+
+ uint32_t* input = (uint32_t*) (&Hash512[thread * 8U]);
+
+ uint32_t __align__(8) dat[16];
+ #pragma unroll
+ for (int i=0;i<5;i++) dat[i] = cuda_swab32(input[i]);
+ #pragma unroll
+ for (int i=0;i<5;i++) dat[i+5] = cuda_swab32(input[i+8]);
+ dat[10] = 0x80000000;
+ #pragma unroll
+ for (int i=11;i<15;i++) dat[i] = 0;
+ dat[15] = 0x140;
+
+ sha256_round_body(dat, buf, c_K);
+
+ // second sha256
+
+ #pragma unroll
+ for (int i=0;i<8;i++) dat[i] = buf[i];
+ dat[8] = 0x80000000;
+ #pragma unroll
+ for (int i=9;i<15;i++) dat[i] = 0;
+ dat[15] = 0x100;
+
+ #pragma unroll
+ for (int i=0;i<8;i++) buf[i] = c_H256[i];
+
+ sha256_round_body(dat, buf, c_K);
+
+		// valid nonces: keep the newest candidate in resNonces[0], shift the previous
+		// one to resNonces[1], and tighten d_target so later threads only report better hashes
+ uint64_t high = cuda_swab32ll(((uint64_t*)buf)[3]);
+ if (high <= d_target[0]) {
+ // printf("%08x %08x - %016llx %016llx - %08x %08x\n", buf[7], buf[6], high, d_target[0], c_target[1], c_target[0]);
+ uint32_t nonce = startNonce + thread;
+ resNonces[1] = atomicExch(resNonces, nonce);
+ d_target[0] = high;
+ }
+ }
+}
+
+__host__
+void lbry_sha256d_hash_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_inputHash, uint32_t *resNonces, cudaStream_t stream)
+{
+ const int threadsperblock = 256;
+
+ dim3 grid(threads/threadsperblock);
+ dim3 block(threadsperblock);
+
+ CUDA_SAFE_CALL(cudaMemset(d_resNonces, 0xFF, 2 * sizeof(uint32_t)));
+ cudaThreadSynchronize();
+
+ lbry_sha256d_gpu_hash_final <<<grid, block, 0, stream>>> (threads, startNonce, (uint64_t*) d_inputHash, d_resNonces);
+
+ cudaThreadSynchronize();
+
+ CUDA_SAFE_CALL(cudaMemcpy(resNonces, d_resNonces, 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost));
+ if (resNonces[0] == resNonces[1]) {
+ resNonces[1] = UINT32_MAX;
+ }
+}
\ No newline at end of file
diff --git a/lbry/cuda_sha512_lbry.cu b/lbry/cuda_sha512_lbry.cu
new file mode 100644
index 0000000..39f549b
--- /dev/null
+++ b/lbry/cuda_sha512_lbry.cu
@@ -0,0 +1,181 @@
+/**
+ * sha-512 CUDA implementation.
+ */
+
+#include <stdio.h>
+#include <stdint.h>
+#include <memory.h>
+
+#include <cuda_helper.h>
+
+static __constant__ uint64_t K_512[80];
+
+static const uint64_t K512[80] = {
+ 0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
+ 0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
+ 0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
+ 0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694,
+ 0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
+ 0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
+ 0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
+ 0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70,
+ 0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
+ 0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
+ 0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30,
+ 0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8,
+ 0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
+ 0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
+ 0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
+ 0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
+ 0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
+ 0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
+ 0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
+ 0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
+};
+
+//#undef xor3
+//#define xor3(a,b,c) (a^b^c)
+
+//#undef
+
+static __device__ __forceinline__
+uint64_t bsg5_0(const uint64_t x)
+{
+ uint64_t r1 = ROTR64(x,28);
+ uint64_t r2 = ROTR64(x,34);
+ uint64_t r3 = ROTR64(x,39);
+ return xor3(r1,r2,r3);
+}
+
+static __device__ __forceinline__
+uint64_t bsg5_1(const uint64_t x)
+{
+ uint64_t r1 = ROTR64(x,14);
+ uint64_t r2 = ROTR64(x,18);
+ uint64_t r3 = ROTR64(x,41);
+ return xor3(r1,r2,r3);
+}
+
+static __device__ __forceinline__
+uint64_t ssg5_0(const uint64_t x)
+{
+ uint64_t r1 = ROTR64(x,1);
+ uint64_t r2 = ROTR64(x,8);
+ uint64_t r3 = shr_t64(x,7);
+ return xor3(r1,r2,r3);
+}
+
+static __device__ __forceinline__
+uint64_t ssg5_1(const uint64_t x)
+{
+ uint64_t r1 = ROTR64(x,19);
+ uint64_t r2 = ROTR64(x,61);
+ uint64_t r3 = shr_t64(x,6);
+ return xor3(r1,r2,r3);
+}
+
+static __device__ __forceinline__
+uint64_t xandx64(const uint64_t a, const uint64_t b, const uint64_t c)
+{
+ uint64_t result;
+ asm("{ .reg .u64 m,n; // xandx64\n\t"
+ "xor.b64 m, %2,%3;\n\t"
+ "and.b64 n, m,%1;\n\t"
+ "xor.b64 %0, n,%3;\n\t"
+ "}" : "=l"(result) : "l"(a), "l"(b), "l"(c));
+ return result;
+}
+
+static __device__ __forceinline__
+void sha512_step2(uint64_t* r, uint64_t* W, uint64_t* K, const int ord, int i)
+{
+ int u = 8-ord;
+ uint64_t a = r[(0+u) & 7];
+ uint64_t b = r[(1+u) & 7];
+ uint64_t c = r[(2+u) & 7];
+ uint64_t d = r[(3+u) & 7];
+ uint64_t e = r[(4+u) & 7];
+ uint64_t f = r[(5+u) & 7];
+ uint64_t g = r[(6+u) & 7];
+ uint64_t h = r[(7+u) & 7];
+
+ uint64_t T1 = h + bsg5_1(e) + xandx64(e,f,g) + W[i] + K[i];
+ uint64_t T2 = bsg5_0(a) + andor(a,b,c);
+ r[(3+u)& 7] = d + T1;
+ r[(7+u)& 7] = T1 + T2;
+}
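+
+// rather than shuffling eight registers every round, ord rotates the *roles*:
+// eight consecutive calls with ord = 0..7 walk a..h around r[], so one pass
+// covers eight sha512 rounds with no data movement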
+
+/**************************************************************************************************/
+
+__global__
+void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash)
+{
+ const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ if (thread < threads)
+ {
+ uint64_t *pHash = &g_hash[thread * 8U];
+
+ uint64_t W[80];
+ uint64_t r[8];
+
+ uint64_t IV512[8] = {
+ 0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
+ 0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
+ };
+
+ #pragma unroll
+ for (int i = 0; i < 8; i++)
+ r[i] = IV512[i];
+
+ #pragma unroll
+ for (int i = 0; i < 4; i++) {
+ // 32 bytes input
+ W[i] = cuda_swab64(pHash[i]);
+ }
+
+ W[4] = 0x8000000000000000; // end tag
+
+ #pragma unroll
+ for (int i = 5; i < 15; i++) W[i] = 0;
+
+ W[15] = 0x100; // 256 bits
+
+ #pragma unroll
+ for (int i = 16; i < 80; i++) W[i] = 0;
+
+ #pragma unroll 64
+ for (int i = 16; i < 80; i++)
+ W[i] = ssg5_1(W[i - 2]) + W[i - 7] + ssg5_0(W[i - 15]) + W[i - 16];
+
+ #pragma unroll 10
+ for (int i = 0; i < 10; i++) {
+ #pragma unroll 8
+ for (int ord=0; ord<8; ord++)
+ sha512_step2(r, W, K_512, ord, 8*i + ord);
+ }
+
+ #pragma unroll 8
+ for (int i = 0; i < 8; i++)
+ pHash[i] = cuda_swab64(r[i] + IV512[i]);
+ }
+}
+
+__host__
+void lbry_sha512_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash, cudaStream_t stream)
+{
+ const int threadsperblock = 256;
+
+ dim3 grid((threads + threadsperblock-1)/threadsperblock);
+ dim3 block(threadsperblock);
+
+ size_t shared_size = 80*8;
+ lbry_sha512_gpu_hash_32 <<<grid, block, shared_size, stream>>> (threads, (uint64_t*)d_hash);
+}
+
+/**************************************************************************************************/
+
+__host__
+void lbry_sha512_init(int thr_id)
+{
+ cudaMemcpyToSymbol(K_512, K512, 80*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
+}
diff --git a/lbry/lbry.cu b/lbry/lbry.cu
new file mode 100644
index 0000000..f5da268
--- /dev/null
+++ b/lbry/lbry.cu
@@ -0,0 +1,225 @@
+/**
+ * Lbry CUDA Implementation
+ *
+ * by tpruvot@github - July 2016
+ *
+ */
+
+#include <stdio.h>
+#include <memory.h>
+
+extern "C" {
+#include <sph/sph_sha2.h>
+#include <sph/sph_ripemd.h>
+}
+
+#include <miner.h>
+#include <cuda_helper.h>
+
+#define A 64
+#define debug_cpu 0
+
+extern "C" void lbry_hash(void* output, const void* input)
+{
+ uint32_t _ALIGN(A) hashA[16];
+ uint32_t _ALIGN(A) hashB[8];
+ uint32_t _ALIGN(A) hashC[8];
+
+ sph_sha256_context ctx_sha256;
+ sph_sha512_context ctx_sha512;
+ sph_ripemd160_context ctx_ripemd;
+
+ sph_sha256_init(&ctx_sha256);
+ sph_sha256(&ctx_sha256, input, 112);
+ sph_sha256_close(&ctx_sha256, hashA);
+
+ sph_sha256(&ctx_sha256, hashA, 32);
+ sph_sha256_close(&ctx_sha256, hashA);
+
+ sph_sha512_init(&ctx_sha512);
+ sph_sha512(&ctx_sha512, hashA, 32);
+ sph_sha512_close(&ctx_sha512, hashA);
+
+ sph_ripemd160_init(&ctx_ripemd);
+ sph_ripemd160(&ctx_ripemd, hashA, 32); // sha512 low
+ sph_ripemd160_close(&ctx_ripemd, hashB);
+ if (debug_cpu) applog_hex(hashB, 20);
+
+ sph_ripemd160(&ctx_ripemd, &hashA[8], 32); // sha512 high
+ sph_ripemd160_close(&ctx_ripemd, hashC);
+ if (debug_cpu) applog_hex(hashC, 20);
+
+ sph_sha256(&ctx_sha256, hashB, 20);
+ sph_sha256(&ctx_sha256, hashC, 20);
+ sph_sha256_close(&ctx_sha256, hashA);
+ if (debug_cpu) applog_hex(hashA,32);
+
+ sph_sha256(&ctx_sha256, hashA, 32);
+ sph_sha256_close(&ctx_sha256, hashA);
+
+ memcpy(output, hashA, 32);
+}
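+
+// lbry proof-of-work chain, as implemented above:
+//   h1 = sha256d(header[112])
+//   h2 = sha512(h1)
+//   pow = sha256d( ripemd160(h2[0..31]) || ripemd160(h2[32..63]) )
+
+#if 0
+// Minimal self-test sketch (not compiled in): hashes an all-zero 112-byte header
+// with the CPU path and dumps the digest for manual comparison against the GPU
+// kernels. The helper name and the zero test vector are illustrative only.
+static void lbry_hash_selftest(void)
+{
+ uint32_t header[28] = { 0 };
+ uint32_t out[8];
+ lbry_hash(out, header);
+ applog_hex(out, 32);
+}
+#endif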
+
+/* ############################################################################################################################### */
+
+extern void lbry_ripemd160_init(int thr_id);
+extern void lbry_sha256_init(int thr_id);
+extern void lbry_sha256_free(int thr_id);
+extern void lbry_sha256_setBlock_112(uint32_t *pdata, uint32_t *ptarget);
+extern void lbry_sha256_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash, bool swabNonce, cudaStream_t stream);
+extern void lbry_sha256d_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash, bool swabNonce, cudaStream_t stream);
+extern void lbry_sha256_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash, cudaStream_t stream);
+extern void lbry_sha512_init(int thr_id);
+extern void lbry_sha512_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash, cudaStream_t stream);
+extern void lbry_ripemd160_hash_32x2(int thr_id, uint32_t threads, uint32_t *g_Hash, cudaStream_t stream);
+extern void lbry_sha256_hash_20x2(int thr_id, uint32_t threads, uint32_t *g_Hash, cudaStream_t stream);
+extern void lbry_sha256d_hash_20x2(int thr_id, uint32_t threads, uint32_t *g_Hash, cudaStream_t stream);
+extern void lbry_sha256d_hash_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_inputHash, uint32_t *resNonces, cudaStream_t stream);
+
+static __inline uint32_t swab32_if(uint32_t val, bool iftrue) {
+ return iftrue ? swab32(val) : val;
+}
+
+static bool init[MAX_GPUS] = { 0 };
+
+static uint32_t *d_hash[MAX_GPUS];
+
+// nonce position differs: lbry's 112-byte header keeps the nonce in data word 27
+// (byte offset 108), not word 19 as in 80-byte bitcoin-style headers
+#define LBC_NONCE_OFT32 27
+
+extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done)
+{
+ uint32_t _ALIGN(A) vhash[8];
+ uint32_t _ALIGN(A) endiandata[28];
+ uint32_t *pdata = work->data;
+ uint32_t *ptarget = work->target;
+
+ const uint32_t first_nonce = pdata[LBC_NONCE_OFT32];
+ const int swap = 0; // to toggle nonce endian
+
+ const int dev_id = device_map[thr_id];
+ int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 22 : 20;
+ if (device_sm[dev_id] >= 600) intensity = 23;
+ if (device_sm[dev_id] < 350) intensity = 18;
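+	// intensity n scans 2^n nonces per kernel pass (e.g. 22 -> ~4.2M hashes/call)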
+
+ uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
+ if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
+
+ if (opt_benchmark) {
+ ptarget[7] = 0xff;
+ }
+
+ if (!init[thr_id]){
+ cudaSetDevice(dev_id);
+ if (opt_cudaschedule == -1 && gpu_threads == 1) {
+ cudaDeviceReset();
+ // reduce cpu usage (linux)
+ cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+ CUDA_LOG_ERROR();
+ }
+
+ CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
+
+ lbry_sha256_init(thr_id);
+ lbry_sha512_init(thr_id);
+ lbry_ripemd160_init(thr_id);
+ cuda_check_cpu_init(thr_id, throughput);
+ CUDA_LOG_ERROR();
+
+ init[thr_id] = true;
+ }
+
+ for (int i=0; i < LBC_NONCE_OFT32; i++) {
+ be32enc(&endiandata[i], pdata[i]);
+ }
+
+ lbry_sha256_setBlock_112(endiandata, ptarget);
+ cuda_check_cpu_setTarget(ptarget);
+
+ do {
+
+ // Hash with CUDA
+ #if 0
+ lbry_sha256_hash_112(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id], swap, 0);
+ lbry_sha256_hash_32(thr_id, throughput, d_hash[thr_id], 0);
+ #else
+ lbry_sha256d_hash_112(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id], swap, 0);
+ #endif
+ CUDA_LOG_ERROR();
+
+ lbry_sha512_hash_32(thr_id, throughput, d_hash[thr_id], 0);
+
+ lbry_ripemd160_hash_32x2(thr_id, throughput, d_hash[thr_id], 0);
+
+ #if 0
+ lbry_sha256d_hash_20x2(thr_id, throughput, d_hash[thr_id], 0);
+ uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id]);
+ #else
+ uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
+ lbry_sha256d_hash_final(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id], resNonces, 0);
+ uint32_t foundNonce = resNonces[0];
+ #endif
+
+ *hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce + throughput;
+
+ if (foundNonce != UINT32_MAX)
+ {
+ endiandata[LBC_NONCE_OFT32] = swab32_if(foundNonce, !swap);
+ lbry_hash(vhash, endiandata);
+
+ if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
+ int res = 1;
+ uint32_t secNonce = resNonces[1];
+ //uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id], 1);
+ work->nonces[0] = swab32_if(foundNonce, swap);
+ work_set_target_ratio(work, vhash);
+ if (secNonce != UINT32_MAX) {
+ //if (secNonce) {
+ if (opt_debug)
+ gpulog(LOG_BLUE, thr_id, "found second nonce %08x", swab32(secNonce));
+ endiandata[LBC_NONCE_OFT32] = swab32_if(secNonce, !swap);
+ lbry_hash(vhash, endiandata);
+ if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio) {
+ work_set_target_ratio(work, vhash);
+ xchg(work->nonces[0], work->nonces[1]);
+ }
+ work->nonces[1] = swab32_if(secNonce, swap);
+ res++;
+ }
+ pdata[LBC_NONCE_OFT32] = work->nonces[0];
+ return res;
+ } else {
+ gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU %08x > %08x!", foundNonce, vhash[7], ptarget[7]);
+ }
+ }
+
+ if ((uint64_t) throughput + pdata[LBC_NONCE_OFT32] >= max_nonce) {
+ pdata[LBC_NONCE_OFT32] = max_nonce;
+ break;
+ }
+
+ pdata[LBC_NONCE_OFT32] += throughput;
+
+ } while (!work_restart[thr_id].restart);
+
+ //*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce;
+
+ return 0;
+}
+
+// cleanup
+void free_lbry(int thr_id)
+{
+ if (!init[thr_id])
+ return;
+
+ cudaThreadSynchronize();
+
+ cudaFree(d_hash[thr_id]);
+ lbry_sha256_free(thr_id);
+
+ cuda_check_cpu_free(thr_id);
+ init[thr_id] = false;
+
+ cudaDeviceSynchronize();
+}
diff --git a/miner.h b/miner.h
index 6bbb3a8..fa8335a 100644
--- a/miner.h
+++ b/miner.h
@@ -275,6 +275,7 @@ extern int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce,
extern int scanhash_groestlcoin(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_heavy(int thr_id,struct work *work, uint32_t max_nonce, unsigned long *hashes_done, uint32_t maxvote, int blocklen);
extern int scanhash_jackpot(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
+extern int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_lyra2v2(int thr_id,struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@@ -584,6 +585,7 @@ struct stratum_job {
unsigned char version[4];
unsigned char nbits[4];
unsigned char ntime[4];
+ unsigned char claim[32]; // lbry
bool clean;
unsigned char nreward[2];
uint32_t height;
@@ -797,6 +799,7 @@ void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
void keccak256_hash(void *state, const void *input);
unsigned int jackpothash(void *state, const void *input);
void groestlhash(void *state, const void *input);
+void lbry_hash(void *output, const void *input);
void lyra2re_hash(void *state, const void *input);
void lyra2v2_hash(void *state, const void *input);
void myriadhash(void *state, const void *input);
diff --git a/sph/ripemd.c b/sph/ripemd.c
new file mode 100644
index 0000000..dd12b1f
--- /dev/null
+++ b/sph/ripemd.c
@@ -0,0 +1,833 @@
+/* $Id: ripemd.c 216 2010-06-08 09:46:57Z tp $ */
+/*
+ * RIPEMD-160 implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010 Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "sph_ripemd.h"
+
+/*
+ * Round functions for RIPEMD (original).
+ */
+#define F(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
+#define G(x, y, z) (((x) & (y)) | (((x) | (y)) & (z)))
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+
+static const sph_u32 oIV[4] = {
+ SPH_C32(0x67452301), SPH_C32(0xEFCDAB89),
+ SPH_C32(0x98BADCFE), SPH_C32(0x10325476)
+};
+
+/*
+ * Round functions for RIPEMD-128 and RIPEMD-160.
+ */
+#define F1(x, y, z) ((x) ^ (y) ^ (z))
+#define F2(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
+#define F3(x, y, z) (((x) | ~(y)) ^ (z))
+#define F4(x, y, z) ((((x) ^ (y)) & (z)) ^ (y))
+#define F5(x, y, z) ((x) ^ ((y) | ~(z)))
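+
+/*
+ * F2 and F4 are "choice" multiplexers written with one AND and two
+ * XORs each: F2 selects y or z depending on x, F4 selects x or y
+ * depending on z. F1 is plain XOR; F3 and F5 mix OR-NOT with XOR.
+ */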
+
+static const sph_u32 IV[5] = {
+ SPH_C32(0x67452301), SPH_C32(0xEFCDAB89), SPH_C32(0x98BADCFE),
+ SPH_C32(0x10325476), SPH_C32(0xC3D2E1F0)
+};
+
+#define ROTL SPH_ROTL32
+
+/* ===================================================================== */
+/*
+ * RIPEMD (original hash, deprecated).
+ */
+
+#define FF1(A, B, C, D, X, s) do { \
+ sph_u32 tmp = SPH_T32((A) + F(B, C, D) + (X)); \
+ (A) = ROTL(tmp, (s)); \
+ } while (0)
+
+#define GG1(A, B, C, D, X, s) do { \
+ sph_u32 tmp = SPH_T32((A) + G(B, C, D) \
+ + (X) + SPH_C32(0x5A827999)); \
+ (A) = ROTL(tmp, (s)); \
+ } while (0)
+
+#define HH1(A, B, C, D, X, s) do { \
+ sph_u32 tmp = SPH_T32((A) + H(B, C, D) \
+ + (X) + SPH_C32(0x6ED9EBA1)); \
+ (A) = ROTL(tmp, (s)); \
+ } while (0)
+
+#define FF2(A, B, C, D, X, s) do { \
+ sph_u32 tmp = SPH_T32((A) + F(B, C, D) \
+ + (X) + SPH_C32(0x50A28BE6)); \
+ (A) = ROTL(tmp, (s)); \
+ } while (0)
+
+#define GG2(A, B, C, D, X, s) do { \
+ sph_u32 tmp = SPH_T32((A) + G(B, C, D) + (X)); \
+ (A) = ROTL(tmp, (s)); \
+ } while (0)
+
+#define HH2(A, B, C, D, X, s) do { \
+ sph_u32 tmp = SPH_T32((A) + H(B, C, D) \
+ + (X) + SPH_C32(0x5C4DD124)); \
+ (A) = ROTL(tmp, (s)); \
+ } while (0)
+
+#define RIPEMD_ROUND_BODY(in, h) do { \
+ sph_u32 A1, B1, C1, D1; \
+ sph_u32 A2, B2, C2, D2; \
+ sph_u32 tmp; \
+ \
+ A1 = A2 = (h)[0]; \
+ B1 = B2 = (h)[1]; \
+ C1 = C2 = (h)[2]; \
+ D1 = D2 = (h)[3]; \
+ \
+ FF1(A1, B1, C1, D1, in( 0), 11); \
+ FF1(D1, A1, B1, C1, in( 1), 14); \
+ FF1(C1, D1, A1, B1, in( 2), 15); \
+ FF1(B1, C1, D1, A1, in( 3), 12); \
+ FF1(A1, B1, C1, D1, in( 4), 5); \
+ FF1(D1, A1, B1, C1, in( 5), 8); \
+ FF1(C1, D1, A1, B1, in( 6), 7); \
+ FF1(B1, C1, D1, A1, in( 7), 9); \
+ FF1(A1, B1, C1, D1, in( 8), 11); \
+ FF1(D1, A1, B1, C1, in( 9), 13); \
+ FF1(C1, D1, A1, B1, in(10), 14); \
+ FF1(B1, C1, D1, A1, in(11), 15); \
+ FF1(A1, B1, C1, D1, in(12), 6); \
+ FF1(D1, A1, B1, C1, in(13), 7); \
+ FF1(C1, D1, A1, B1, in(14), 9); \
+ FF1(B1, C1, D1, A1, in(15), 8); \
+ \
+ GG1(A1, B1, C1, D1, in( 7), 7); \
+ GG1(D1, A1, B1, C1, in( 4), 6); \
+ GG1(C1, D1, A1, B1, in(13), 8); \
+ GG1(B1, C1, D1, A1, in( 1), 13); \
+ GG1(A1, B1, C1, D1, in(10), 11); \
+ GG1(D1, A1, B1, C1, in( 6), 9); \
+ GG1(C1, D1, A1, B1, in(15), 7); \
+ GG1(B1, C1, D1, A1, in( 3), 15); \
+ GG1(A1, B1, C1, D1, in(12), 7); \
+ GG1(D1, A1, B1, C1, in( 0), 12); \
+ GG1(C1, D1, A1, B1, in( 9), 15); \
+ GG1(B1, C1, D1, A1, in( 5), 9); \
+ GG1(A1, B1, C1, D1, in(14), 7); \
+ GG1(D1, A1, B1, C1, in( 2), 11); \
+ GG1(C1, D1, A1, B1, in(11), 13); \
+ GG1(B1, C1, D1, A1, in( 8), 12); \
+ \
+ HH1(A1, B1, C1, D1, in( 3), 11); \
+ HH1(D1, A1, B1, C1, in(10), 13); \
+ HH1(C1, D1, A1, B1, in( 2), 14); \
+ HH1(B1, C1, D1, A1, in( 4), 7); \
+ HH1(A1, B1, C1, D1, in( 9), 14); \
+ HH1(D1, A1, B1, C1, in(15), 9); \
+ HH1(C1, D1, A1, B1, in( 8), 13); \
+ HH1(B1, C1, D1, A1, in( 1), 15); \
+ HH1(A1, B1, C1, D1, in(14), 6); \
+ HH1(D1, A1, B1, C1, in( 7), 8); \
+ HH1(C1, D1, A1, B1, in( 0), 13); \
+ HH1(B1, C1, D1, A1, in( 6), 6); \
+ HH1(A1, B1, C1, D1, in(11), 12); \
+ HH1(D1, A1, B1, C1, in(13), 5); \
+ HH1(C1, D1, A1, B1, in( 5), 7); \
+ HH1(B1, C1, D1, A1, in(12), 5); \
+ \
+ FF2(A2, B2, C2, D2, in( 0), 11); \
+ FF2(D2, A2, B2, C2, in( 1), 14); \
+ FF2(C2, D2, A2, B2, in( 2), 15); \
+ FF2(B2, C2, D2, A2, in( 3), 12); \
+ FF2(A2, B2, C2, D2, in( 4), 5); \
+ FF2(D2, A2, B2, C2, in( 5), 8); \
+ FF2(C2, D2, A2, B2, in( 6), 7); \
+ FF2(B2, C2, D2, A2, in( 7), 9); \
+ FF2(A2, B2, C2, D2, in( 8), 11); \
+ FF2(D2, A2, B2, C2, in( 9), 13); \
+ FF2(C2, D2, A2, B2, in(10), 14); \
+ FF2(B2, C2, D2, A2, in(11), 15); \
+ FF2(A2, B2, C2, D2, in(12), 6); \
+ FF2(D2, A2, B2, C2, in(13), 7); \
+ FF2(C2, D2, A2, B2, in(14), 9); \
+ FF2(B2, C2, D2, A2, in(15), 8); \
+ \
+ GG2(A2, B2, C2, D2, in( 7), 7); \
+ GG2(D2, A2, B2, C2, in( 4), 6); \
+ GG2(C2, D2, A2, B2, in(13), 8); \
+ GG2(B2, C2, D2, A2, in( 1), 13); \
+ GG2(A2, B2, C2, D2, in(10), 11); \
+ GG2(D2, A2, B2, C2, in( 6), 9); \
+ GG2(C2, D2, A2, B2, in(15), 7); \
+ GG2(B2, C2, D2, A2, in( 3), 15); \
+ GG2(A2, B2, C2, D2, in(12), 7); \
+ GG2(D2, A2, B2, C2, in( 0), 12); \
+ GG2(C2, D2, A2, B2, in( 9), 15); \
+ GG2(B2, C2, D2, A2, in( 5), 9); \
+ GG2(A2, B2, C2, D2, in(14), 7); \
+ GG2(D2, A2, B2, C2, in( 2), 11); \
+ GG2(C2, D2, A2, B2, in(11), 13); \
+ GG2(B2, C2, D2, A2, in( 8), 12); \
+ \
+ HH2(A2, B2, C2, D2, in( 3), 11); \
+ HH2(D2, A2, B2, C2, in(10), 13); \
+ HH2(C2, D2, A2, B2, in( 2), 14); \
+ HH2(B2, C2, D2, A2, in( 4), 7); \
+ HH2(A2, B2, C2, D2, in( 9), 14); \
+ HH2(D2, A2, B2, C2, in(15), 9); \
+ HH2(C2, D2, A2, B2, in( 8), 13); \
+ HH2(B2, C2, D2, A2, in( 1), 15); \
+ HH2(A2, B2, C2, D2, in(14), 6); \
+ HH2(D2, A2, B2, C2, in( 7), 8); \
+ HH2(C2, D2, A2, B2, in( 0), 13); \
+ HH2(B2, C2, D2, A2, in( 6), 6); \
+ HH2(A2, B2, C2, D2, in(11), 12); \
+ HH2(D2, A2, B2, C2, in(13), 5); \
+ HH2(C2, D2, A2, B2, in( 5), 7); \
+ HH2(B2, C2, D2, A2, in(12), 5); \
+ \
+ tmp = SPH_T32((h)[1] + C1 + D2); \
+ (h)[1] = SPH_T32((h)[2] + D1 + A2); \
+ (h)[2] = SPH_T32((h)[3] + A1 + B2); \
+ (h)[3] = SPH_T32((h)[0] + B1 + C2); \
+ (h)[0] = tmp; \
+ } while (0)
+
+/*
+ * One round of RIPEMD. The data must be aligned for 32-bit access.
+ */
+static void
+ripemd_round(const unsigned char *data, sph_u32 r[4])
+{
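+/*
+ * With SPH_LITTLE_FAST, little-endian decoding of the input is cheap
+ * (e.g. on x86), so message words are re-read from the data buffer at
+ * each use; otherwise they are decoded once into a local array.
+ */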
+#if SPH_LITTLE_FAST
+
+#define RIPEMD_IN(x) sph_dec32le_aligned(data + (4 * (x)))
+
+#else
+
+ sph_u32 X_var[16];
+ int i;
+
+ for (i = 0; i < 16; i ++)
+ X_var[i] = sph_dec32le_aligned(data + 4 * i);
+#define RIPEMD_IN(x) X_var[x]
+
+#endif
+ RIPEMD_ROUND_BODY(RIPEMD_IN, r);
+#undef RIPEMD_IN
+}
+
+/* see sph_ripemd.h */
+void
+sph_ripemd_init(void *cc)
+{
+ sph_ripemd_context *sc;
+
+ sc = cc;
+ memcpy(sc->val, oIV, sizeof sc->val);
+#if SPH_64
+ sc->count = 0;
+#else
+ sc->count_high = sc->count_low = 0;
+#endif
+}
+
+#define RFUN ripemd_round
+#define HASH ripemd
+#define LE32 1
+#include "md_helper.c"
+#undef RFUN
+#undef HASH
+#undef LE32
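+
+/*
+ * md_helper.c is a template: given RFUN (the compression function),
+ * HASH (the name prefix) and LE32 (little-endian length encoding in
+ * the MD-strengthening padding), its inclusion expands into the public
+ * streaming function sph_ripemd() and the static ripemd_close() helper
+ * used by sph_ripemd_close() below.
+ */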
+
+/* see sph_ripemd.h */
+void
+sph_ripemd_close(void *cc, void *dst)
+{
+ ripemd_close(cc, dst, 4);
+ sph_ripemd_init(cc);
+}
+
+/* see sph_ripemd.h */
+void
+sph_ripemd_comp(const sph_u32 msg[16], sph_u32 val[4])
+{
+#define RIPEMD_IN(x) msg[x]
+ RIPEMD_ROUND_BODY(RIPEMD_IN, val);
+#undef RIPEMD_IN
+}
+
+/* ===================================================================== */
+/*
+ * RIPEMD-128.
+ */
+
+/*
+ * Round constants for RIPEMD-128.
+ */
+#define sK11 SPH_C32(0x00000000)
+#define sK12 SPH_C32(0x5A827999)
+#define sK13 SPH_C32(0x6ED9EBA1)
+#define sK14 SPH_C32(0x8F1BBCDC)
+
+#define sK21 SPH_C32(0x50A28BE6)
+#define sK22 SPH_C32(0x5C4DD124)
+#define sK23 SPH_C32(0x6D703EF3)
+#define sK24 SPH_C32(0x00000000)
+
+#define sRR(a, b, c, d, f, s, r, k) do { \
+ a = ROTL(SPH_T32(a + f(b, c, d) + r + k), s); \
+ } while (0)
+
+#define sROUND1(a, b, c, d, f, s, r, k) \
+ sRR(a ## 1, b ## 1, c ## 1, d ## 1, f, s, r, sK1 ## k)
+
+#define sROUND2(a, b, c, d, f, s, r, k) \
+ sRR(a ## 2, b ## 2, c ## 2, d ## 2, f, s, r, sK2 ## k)
+
+/*
+ * This macro defines the body for a RIPEMD-128 compression function
+ * implementation. The "in" parameter should evaluate, when applied to a
+ * numerical input parameter from 0 to 15, to an expression which yields
+ * the corresponding input block. The "h" parameter should evaluate to
+ * an array or pointer expression designating the array of 4 words which
+ * contains the input and output of the compression function.
+ */
+
+#define RIPEMD128_ROUND_BODY(in, h) do { \
+ sph_u32 A1, B1, C1, D1; \
+ sph_u32 A2, B2, C2, D2; \
+ sph_u32 tmp; \
+ \
+ A1 = A2 = (h)[0]; \
+ B1 = B2 = (h)[1]; \
+ C1 = C2 = (h)[2]; \
+ D1 = D2 = (h)[3]; \
+ \
+ sROUND1(A, B, C, D, F1, 11, in( 0), 1); \
+ sROUND1(D, A, B, C, F1, 14, in( 1), 1); \
+ sROUND1(C, D, A, B, F1, 15, in( 2), 1); \
+ sROUND1(B, C, D, A, F1, 12, in( 3), 1); \
+ sROUND1(A, B, C, D, F1, 5, in( 4), 1); \
+ sROUND1(D, A, B, C, F1, 8, in( 5), 1); \
+ sROUND1(C, D, A, B, F1, 7, in( 6), 1); \
+ sROUND1(B, C, D, A, F1, 9, in( 7), 1); \
+ sROUND1(A, B, C, D, F1, 11, in( 8), 1); \
+ sROUND1(D, A, B, C, F1, 13, in( 9), 1); \
+ sROUND1(C, D, A, B, F1, 14, in(10), 1); \
+ sROUND1(B, C, D, A, F1, 15, in(11), 1); \
+ sROUND1(A, B, C, D, F1, 6, in(12), 1); \
+ sROUND1(D, A, B, C, F1, 7, in(13), 1); \
+ sROUND1(C, D, A, B, F1, 9, in(14), 1); \
+ sROUND1(B, C, D, A, F1, 8, in(15), 1); \
+ \
+ sROUND1(A, B, C, D, F2, 7, in( 7), 2); \
+ sROUND1(D, A, B, C, F2, 6, in( 4), 2); \
+ sROUND1(C, D, A, B, F2, 8, in(13), 2); \
+ sROUND1(B, C, D, A, F2, 13, in( 1), 2); \
+ sROUND1(A, B, C, D, F2, 11, in(10), 2); \
+ sROUND1(D, A, B, C, F2, 9, in( 6), 2); \
+ sROUND1(C, D, A, B, F2, 7, in(15), 2); \
+ sROUND1(B, C, D, A, F2, 15, in( 3), 2); \
+ sROUND1(A, B, C, D, F2, 7, in(12), 2); \
+ sROUND1(D, A, B, C, F2, 12, in( 0), 2); \
+ sROUND1(C, D, A, B, F2, 15, in( 9), 2); \
+ sROUND1(B, C, D, A, F2, 9, in( 5), 2); \
+ sROUND1(A, B, C, D, F2, 11, in( 2), 2); \
+ sROUND1(D, A, B, C, F2, 7, in(14), 2); \
+ sROUND1(C, D, A, B, F2, 13, in(11), 2); \
+ sROUND1(B, C, D, A, F2, 12, in( 8), 2); \
+ \
+ sROUND1(A, B, C, D, F3, 11, in( 3), 3); \
+ sROUND1(D, A, B, C, F3, 13, in(10), 3); \
+ sROUND1(C, D, A, B, F3, 6, in(14), 3); \
+ sROUND1(B, C, D, A, F3, 7, in( 4), 3); \
+ sROUND1(A, B, C, D, F3, 14, in( 9), 3); \
+ sROUND1(D, A, B, C, F3, 9, in(15), 3); \
+ sROUND1(C, D, A, B, F3, 13, in( 8), 3); \
+ sROUND1(B, C, D, A, F3, 15, in( 1), 3); \
+ sROUND1(A, B, C, D, F3, 14, in( 2), 3); \
+ sROUND1(D, A, B, C, F3, 8, in( 7), 3); \
+ sROUND1(C, D, A, B, F3, 13, in( 0), 3); \
+ sROUND1(B, C, D, A, F3, 6, in( 6), 3); \
+ sROUND1(A, B, C, D, F3, 5, in(13), 3); \
+ sROUND1(D, A, B, C, F3, 12, in(11), 3); \
+ sROUND1(C, D, A, B, F3, 7, in( 5), 3); \
+ sROUND1(B, C, D, A, F3, 5, in(12), 3); \
+ \
+ sROUND1(A, B, C, D, F4, 11, in( 1), 4); \
+ sROUND1(D, A, B, C, F4, 12, in( 9), 4); \
+ sROUND1(C, D, A, B, F4, 14, in(11), 4); \
+ sROUND1(B, C, D, A, F4, 15, in(10), 4); \
+ sROUND1(A, B, C, D, F4, 14, in( 0), 4); \
+ sROUND1(D, A, B, C, F4, 15, in( 8), 4); \
+ sROUND1(C, D, A, B, F4, 9, in(12), 4); \
+ sROUND1(B, C, D, A, F4, 8, in( 4), 4); \
+ sROUND1(A, B, C, D, F4, 9, in(13), 4); \
+ sROUND1(D, A, B, C, F4, 14, in( 3), 4); \
+ sROUND1(C, D, A, B, F4, 5, in( 7), 4); \
+ sROUND1(B, C, D, A, F4, 6, in(15), 4); \
+ sROUND1(A, B, C, D, F4, 8, in(14), 4); \
+ sROUND1(D, A, B, C, F4, 6, in( 5), 4); \
+ sROUND1(C, D, A, B, F4, 5, in( 6), 4); \
+ sROUND1(B, C, D, A, F4, 12, in( 2), 4); \
+ \
+ sROUND2(A, B, C, D, F4, 8, in( 5), 1); \
+ sROUND2(D, A, B, C, F4, 9, in(14), 1); \
+ sROUND2(C, D, A, B, F4, 9, in( 7), 1); \
+ sROUND2(B, C, D, A, F4, 11, in( 0), 1); \
+ sROUND2(A, B, C, D, F4, 13, in( 9), 1); \
+ sROUND2(D, A, B, C, F4, 15, in( 2), 1); \
+ sROUND2(C, D, A, B, F4, 15, in(11), 1); \
+ sROUND2(B, C, D, A, F4, 5, in( 4), 1); \
+ sROUND2(A, B, C, D, F4, 7, in(13), 1); \
+ sROUND2(D, A, B, C, F4, 7, in( 6), 1); \
+ sROUND2(C, D, A, B, F4, 8, in(15), 1); \
+ sROUND2(B, C, D, A, F4, 11, in( 8), 1); \
+ sROUND2(A, B, C, D, F4, 14, in( 1), 1); \
+ sROUND2(D, A, B, C, F4, 14, in(10), 1); \
+ sROUND2(C, D, A, B, F4, 12, in( 3), 1); \
+ sROUND2(B, C, D, A, F4, 6, in(12), 1); \
+ \
+ sROUND2(A, B, C, D, F3, 9, in( 6), 2); \
+ sROUND2(D, A, B, C, F3, 13, in(11), 2); \
+ sROUND2(C, D, A, B, F3, 15, in( 3), 2); \
+ sROUND2(B, C, D, A, F3, 7, in( 7), 2); \
+ sROUND2(A, B, C, D, F3, 12, in( 0), 2); \
+ sROUND2(D, A, B, C, F3, 8, in(13), 2); \
+ sROUND2(C, D, A, B, F3, 9, in( 5), 2); \
+ sROUND2(B, C, D, A, F3, 11, in(10), 2); \
+ sROUND2(A, B, C, D, F3, 7, in(14), 2); \
+ sROUND2(D, A, B, C, F3, 7, in(15), 2); \
+ sROUND2(C, D, A, B, F3, 12, in( 8), 2); \
+ sROUND2(B, C, D, A, F3, 7, in(12), 2); \
+ sROUND2(A, B, C, D, F3, 6, in( 4), 2); \
+ sROUND2(D, A, B, C, F3, 15, in( 9), 2); \
+ sROUND2(C, D, A, B, F3, 13, in( 1), 2); \
+ sROUND2(B, C, D, A, F3, 11, in( 2), 2); \
+ \
+ sROUND2(A, B, C, D, F2, 9, in(15), 3); \
+ sROUND2(D, A, B, C, F2, 7, in( 5), 3); \
+ sROUND2(C, D, A, B, F2, 15, in( 1), 3); \
+ sROUND2(B, C, D, A, F2, 11, in( 3), 3); \
+ sROUND2(A, B, C, D, F2, 8, in( 7), 3); \
+ sROUND2(D, A, B, C, F2, 6, in(14), 3); \
+ sROUND2(C, D, A, B, F2, 6, in( 6), 3); \
+ sROUND2(B, C, D, A, F2, 14, in( 9), 3); \
+ sROUND2(A, B, C, D, F2, 12, in(11), 3); \
+ sROUND2(D, A, B, C, F2, 13, in( 8), 3); \
+ sROUND2(C, D, A, B, F2, 5, in(12), 3); \
+ sROUND2(B, C, D, A, F2, 14, in( 2), 3); \
+ sROUND2(A, B, C, D, F2, 13, in(10), 3); \
+ sROUND2(D, A, B, C, F2, 13, in( 0), 3); \
+ sROUND2(C, D, A, B, F2, 7, in( 4), 3); \
+ sROUND2(B, C, D, A, F2, 5, in(13), 3); \
+ \
+ sROUND2(A, B, C, D, F1, 15, in( 8), 4); \
+ sROUND2(D, A, B, C, F1, 5, in( 6), 4); \
+ sROUND2(C, D, A, B, F1, 8, in( 4), 4); \
+ sROUND2(B, C, D, A, F1, 11, in( 1), 4); \
+ sROUND2(A, B, C, D, F1, 14, in( 3), 4); \
+ sROUND2(D, A, B, C, F1, 14, in(11), 4); \
+ sROUND2(C, D, A, B, F1, 6, in(15), 4); \
+ sROUND2(B, C, D, A, F1, 14, in( 0), 4); \
+ sROUND2(A, B, C, D, F1, 6, in( 5), 4); \
+ sROUND2(D, A, B, C, F1, 9, in(12), 4); \
+ sROUND2(C, D, A, B, F1, 12, in( 2), 4); \
+ sROUND2(B, C, D, A, F1, 9, in(13), 4); \
+ sROUND2(A, B, C, D, F1, 12, in( 9), 4); \
+ sROUND2(D, A, B, C, F1, 5, in( 7), 4); \
+ sROUND2(C, D, A, B, F1, 15, in(10), 4); \
+ sROUND2(B, C, D, A, F1, 8, in(14), 4); \
+ \
+ tmp = SPH_T32((h)[1] + C1 + D2); \
+ (h)[1] = SPH_T32((h)[2] + D1 + A2); \
+ (h)[2] = SPH_T32((h)[3] + A1 + B2); \
+ (h)[3] = SPH_T32((h)[0] + B1 + C2); \
+ (h)[0] = tmp; \
+ } while (0)
+
+/*
+ * One round of RIPEMD-128. The data must be aligned for 32-bit access.
+ */
+static void
+ripemd128_round(const unsigned char *data, sph_u32 r[4])
+{
+#if SPH_LITTLE_FAST
+
+#define RIPEMD128_IN(x) sph_dec32le_aligned(data + (4 * (x)))
+
+#else
+
+ sph_u32 X_var[16];
+ int i;
+
+ for (i = 0; i < 16; i ++)
+ X_var[i] = sph_dec32le_aligned(data + 4 * i);
+#define RIPEMD128_IN(x) X_var[x]
+
+#endif
+ RIPEMD128_ROUND_BODY(RIPEMD128_IN, r);
+#undef RIPEMD128_IN
+}
+
+/* see sph_ripemd.h */
+void
+sph_ripemd128_init(void *cc)
+{
+ sph_ripemd128_context *sc;
+
+ sc = cc;
+ memcpy(sc->val, IV, sizeof sc->val);
+#if SPH_64
+ sc->count = 0;
+#else
+ sc->count_high = sc->count_low = 0;
+#endif
+}
+
+#define RFUN ripemd128_round
+#define HASH ripemd128
+#define LE32 1
+#include "md_helper.c"
+#undef RFUN
+#undef HASH
+#undef LE32
+
+/* see sph_ripemd.h */
+void
+sph_ripemd128_close(void *cc, void *dst)
+{
+ ripemd128_close(cc, dst, 4);
+ sph_ripemd128_init(cc);
+}
+
+/* see sph_ripemd.h */
+void
+sph_ripemd128_comp(const sph_u32 msg[16], sph_u32 val[4])
+{
+#define RIPEMD128_IN(x) msg[x]
+ RIPEMD128_ROUND_BODY(RIPEMD128_IN, val);
+#undef RIPEMD128_IN
+}
+
+/* ===================================================================== */
+/*
+ * RIPEMD-160.
+ */
+
+/*
+ * Round constants for RIPEMD-160.
+ */
+#define K11 SPH_C32(0x00000000)
+#define K12 SPH_C32(0x5A827999)
+#define K13 SPH_C32(0x6ED9EBA1)
+#define K14 SPH_C32(0x8F1BBCDC)
+#define K15 SPH_C32(0xA953FD4E)
+
+#define K21 SPH_C32(0x50A28BE6)
+#define K22 SPH_C32(0x5C4DD124)
+#define K23 SPH_C32(0x6D703EF3)
+#define K24 SPH_C32(0x7A6D76E9)
+#define K25 SPH_C32(0x00000000)
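+
+/*
+ * Nothing-up-my-sleeve constants: the left-line values K12..K15 are
+ * floor(2^30 * sqrt(p)) and the right-line values K21..K24 are
+ * floor(2^30 * cbrt(p)) for p in {2, 3, 5, 7}; the remaining round
+ * constant of each line (K11, K25) is zero.
+ */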
+
+#define RR(a, b, c, d, e, f, s, r, k) do { \
+ a = SPH_T32(ROTL(SPH_T32(a + f(b, c, d) + r + k), s) + e); \
+ c = ROTL(c, 10); \
+ } while (0)
+
+#define ROUND1(a, b, c, d, e, f, s, r, k) \
+ RR(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k)
+
+#define ROUND2(a, b, c, d, e, f, s, r, k) \
+ RR(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
+
+/*
+ * This macro defines the body for a RIPEMD-160 compression function
+ * implementation. The "in" parameter should evaluate, when applied to a
+ * numerical input parameter from 0 to 15, to an expression which yields
+ * the corresponding input block. The "h" parameter should evaluate to
+ * an array or pointer expression designating the array of 5 words which
+ * contains the input and output of the compression function.
+ */
+
+#define RIPEMD160_ROUND_BODY(in, h) do { \
+ sph_u32 A1, B1, C1, D1, E1; \
+ sph_u32 A2, B2, C2, D2, E2; \
+ sph_u32 tmp; \
+ \
+ A1 = A2 = (h)[0]; \
+ B1 = B2 = (h)[1]; \
+ C1 = C2 = (h)[2]; \
+ D1 = D2 = (h)[3]; \
+ E1 = E2 = (h)[4]; \
+ \
+ ROUND1(A, B, C, D, E, F1, 11, in( 0), 1); \
+ ROUND1(E, A, B, C, D, F1, 14, in( 1), 1); \
+ ROUND1(D, E, A, B, C, F1, 15, in( 2), 1); \
+ ROUND1(C, D, E, A, B, F1, 12, in( 3), 1); \
+ ROUND1(B, C, D, E, A, F1, 5, in( 4), 1); \
+ ROUND1(A, B, C, D, E, F1, 8, in( 5), 1); \
+ ROUND1(E, A, B, C, D, F1, 7, in( 6), 1); \
+ ROUND1(D, E, A, B, C, F1, 9, in( 7), 1); \
+ ROUND1(C, D, E, A, B, F1, 11, in( 8), 1); \
+ ROUND1(B, C, D, E, A, F1, 13, in( 9), 1); \
+ ROUND1(A, B, C, D, E, F1, 14, in(10), 1); \
+ ROUND1(E, A, B, C, D, F1, 15, in(11), 1); \
+ ROUND1(D, E, A, B, C, F1, 6, in(12), 1); \
+ ROUND1(C, D, E, A, B, F1, 7, in(13), 1); \
+ ROUND1(B, C, D, E, A, F1, 9, in(14), 1); \
+ ROUND1(A, B, C, D, E, F1, 8, in(15), 1); \
+ \
+ ROUND1(E, A, B, C, D, F2, 7, in( 7), 2); \
+ ROUND1(D, E, A, B, C, F2, 6, in( 4), 2); \
+ ROUND1(C, D, E, A, B, F2, 8, in(13), 2); \
+ ROUND1(B, C, D, E, A, F2, 13, in( 1), 2); \
+ ROUND1(A, B, C, D, E, F2, 11, in(10), 2); \
+ ROUND1(E, A, B, C, D, F2, 9, in( 6), 2); \
+ ROUND1(D, E, A, B, C, F2, 7, in(15), 2); \
+ ROUND1(C, D, E, A, B, F2, 15, in( 3), 2); \
+ ROUND1(B, C, D, E, A, F2, 7, in(12), 2); \
+ ROUND1(A, B, C, D, E, F2, 12, in( 0), 2); \
+ ROUND1(E, A, B, C, D, F2, 15, in( 9), 2); \
+ ROUND1(D, E, A, B, C, F2, 9, in( 5), 2); \
+ ROUND1(C, D, E, A, B, F2, 11, in( 2), 2); \
+ ROUND1(B, C, D, E, A, F2, 7, in(14), 2); \
+ ROUND1(A, B, C, D, E, F2, 13, in(11), 2); \
+ ROUND1(E, A, B, C, D, F2, 12, in( 8), 2); \
+ \
+ ROUND1(D, E, A, B, C, F3, 11, in( 3), 3); \
+ ROUND1(C, D, E, A, B, F3, 13, in(10), 3); \
+ ROUND1(B, C, D, E, A, F3, 6, in(14), 3); \
+ ROUND1(A, B, C, D, E, F3, 7, in( 4), 3); \
+ ROUND1(E, A, B, C, D, F3, 14, in( 9), 3); \
+ ROUND1(D, E, A, B, C, F3, 9, in(15), 3); \
+ ROUND1(C, D, E, A, B, F3, 13, in( 8), 3); \
+ ROUND1(B, C, D, E, A, F3, 15, in( 1), 3); \
+ ROUND1(A, B, C, D, E, F3, 14, in( 2), 3); \
+ ROUND1(E, A, B, C, D, F3, 8, in( 7), 3); \
+ ROUND1(D, E, A, B, C, F3, 13, in( 0), 3); \
+ ROUND1(C, D, E, A, B, F3, 6, in( 6), 3); \
+ ROUND1(B, C, D, E, A, F3, 5, in(13), 3); \
+ ROUND1(A, B, C, D, E, F3, 12, in(11), 3); \
+ ROUND1(E, A, B, C, D, F3, 7, in( 5), 3); \
+ ROUND1(D, E, A, B, C, F3, 5, in(12), 3); \
+ \
+ ROUND1(C, D, E, A, B, F4, 11, in( 1), 4); \
+ ROUND1(B, C, D, E, A, F4, 12, in( 9), 4); \
+ ROUND1(A, B, C, D, E, F4, 14, in(11), 4); \
+ ROUND1(E, A, B, C, D, F4, 15, in(10), 4); \
+ ROUND1(D, E, A, B, C, F4, 14, in( 0), 4); \
+ ROUND1(C, D, E, A, B, F4, 15, in( 8), 4); \
+ ROUND1(B, C, D, E, A, F4, 9, in(12), 4); \
+ ROUND1(A, B, C, D, E, F4, 8, in( 4), 4); \
+ ROUND1(E, A, B, C, D, F4, 9, in(13), 4); \
+ ROUND1(D, E, A, B, C, F4, 14, in( 3), 4); \
+ ROUND1(C, D, E, A, B, F4, 5, in( 7), 4); \
+ ROUND1(B, C, D, E, A, F4, 6, in(15), 4); \
+ ROUND1(A, B, C, D, E, F4, 8, in(14), 4); \
+ ROUND1(E, A, B, C, D, F4, 6, in( 5), 4); \
+ ROUND1(D, E, A, B, C, F4, 5, in( 6), 4); \
+ ROUND1(C, D, E, A, B, F4, 12, in( 2), 4); \
+ \
+ ROUND1(B, C, D, E, A, F5, 9, in( 4), 5); \
+ ROUND1(A, B, C, D, E, F5, 15, in( 0), 5); \
+ ROUND1(E, A, B, C, D, F5, 5, in( 5), 5); \
+ ROUND1(D, E, A, B, C, F5, 11, in( 9), 5); \
+ ROUND1(C, D, E, A, B, F5, 6, in( 7), 5); \
+ ROUND1(B, C, D, E, A, F5, 8, in(12), 5); \
+ ROUND1(A, B, C, D, E, F5, 13, in( 2), 5); \
+ ROUND1(E, A, B, C, D, F5, 12, in(10), 5); \
+ ROUND1(D, E, A, B, C, F5, 5, in(14), 5); \
+ ROUND1(C, D, E, A, B, F5, 12, in( 1), 5); \
+ ROUND1(B, C, D, E, A, F5, 13, in( 3), 5); \
+ ROUND1(A, B, C, D, E, F5, 14, in( 8), 5); \
+ ROUND1(E, A, B, C, D, F5, 11, in(11), 5); \
+ ROUND1(D, E, A, B, C, F5, 8, in( 6), 5); \
+ ROUND1(C, D, E, A, B, F5, 5, in(15), 5); \
+ ROUND1(B, C, D, E, A, F5, 6, in(13), 5); \
+ \
+ ROUND2(A, B, C, D, E, F5, 8, in( 5), 1); \
+ ROUND2(E, A, B, C, D, F5, 9, in(14), 1); \
+ ROUND2(D, E, A, B, C, F5, 9, in( 7), 1); \
+ ROUND2(C, D, E, A, B, F5, 11, in( 0), 1); \
+ ROUND2(B, C, D, E, A, F5, 13, in( 9), 1); \
+ ROUND2(A, B, C, D, E, F5, 15, in( 2), 1); \
+ ROUND2(E, A, B, C, D, F5, 15, in(11), 1); \
+ ROUND2(D, E, A, B, C, F5, 5, in( 4), 1); \
+ ROUND2(C, D, E, A, B, F5, 7, in(13), 1); \
+ ROUND2(B, C, D, E, A, F5, 7, in( 6), 1); \
+ ROUND2(A, B, C, D, E, F5, 8, in(15), 1); \
+ ROUND2(E, A, B, C, D, F5, 11, in( 8), 1); \
+ ROUND2(D, E, A, B, C, F5, 14, in( 1), 1); \
+ ROUND2(C, D, E, A, B, F5, 14, in(10), 1); \
+ ROUND2(B, C, D, E, A, F5, 12, in( 3), 1); \
+ ROUND2(A, B, C, D, E, F5, 6, in(12), 1); \
+ \
+ ROUND2(E, A, B, C, D, F4, 9, in( 6), 2); \
+ ROUND2(D, E, A, B, C, F4, 13, in(11), 2); \
+ ROUND2(C, D, E, A, B, F4, 15, in( 3), 2); \
+ ROUND2(B, C, D, E, A, F4, 7, in( 7), 2); \
+ ROUND2(A, B, C, D, E, F4, 12, in( 0), 2); \
+ ROUND2(E, A, B, C, D, F4, 8, in(13), 2); \
+ ROUND2(D, E, A, B, C, F4, 9, in( 5), 2); \
+ ROUND2(C, D, E, A, B, F4, 11, in(10), 2); \
+ ROUND2(B, C, D, E, A, F4, 7, in(14), 2); \
+ ROUND2(A, B, C, D, E, F4, 7, in(15), 2); \
+ ROUND2(E, A, B, C, D, F4, 12, in( 8), 2); \
+ ROUND2(D, E, A, B, C, F4, 7, in(12), 2); \
+ ROUND2(C, D, E, A, B, F4, 6, in( 4), 2); \
+ ROUND2(B, C, D, E, A, F4, 15, in( 9), 2); \
+ ROUND2(A, B, C, D, E, F4, 13, in( 1), 2); \
+ ROUND2(E, A, B, C, D, F4, 11, in( 2), 2); \
+ \
+ ROUND2(D, E, A, B, C, F3, 9, in(15), 3); \
+ ROUND2(C, D, E, A, B, F3, 7, in( 5), 3); \
+ ROUND2(B, C, D, E, A, F3, 15, in( 1), 3); \
+ ROUND2(A, B, C, D, E, F3, 11, in( 3), 3); \
+ ROUND2(E, A, B, C, D, F3, 8, in( 7), 3); \
+ ROUND2(D, E, A, B, C, F3, 6, in(14), 3); \
+ ROUND2(C, D, E, A, B, F3, 6, in( 6), 3); \
+ ROUND2(B, C, D, E, A, F3, 14, in( 9), 3); \
+ ROUND2(A, B, C, D, E, F3, 12, in(11), 3); \
+ ROUND2(E, A, B, C, D, F3, 13, in( 8), 3); \
+ ROUND2(D, E, A, B, C, F3, 5, in(12), 3); \
+ ROUND2(C, D, E, A, B, F3, 14, in( 2), 3); \
+ ROUND2(B, C, D, E, A, F3, 13, in(10), 3); \
+ ROUND2(A, B, C, D, E, F3, 13, in( 0), 3); \
+ ROUND2(E, A, B, C, D, F3, 7, in( 4), 3); \
+ ROUND2(D, E, A, B, C, F3, 5, in(13), 3); \
+ \
+ ROUND2(C, D, E, A, B, F2, 15, in( 8), 4); \
+ ROUND2(B, C, D, E, A, F2, 5, in( 6), 4); \
+ ROUND2(A, B, C, D, E, F2, 8, in( 4), 4); \
+ ROUND2(E, A, B, C, D, F2, 11, in( 1), 4); \
+ ROUND2(D, E, A, B, C, F2, 14, in( 3), 4); \
+ ROUND2(C, D, E, A, B, F2, 14, in(11), 4); \
+ ROUND2(B, C, D, E, A, F2, 6, in(15), 4); \
+ ROUND2(A, B, C, D, E, F2, 14, in( 0), 4); \
+ ROUND2(E, A, B, C, D, F2, 6, in( 5), 4); \
+ ROUND2(D, E, A, B, C, F2, 9, in(12), 4); \
+ ROUND2(C, D, E, A, B, F2, 12, in( 2), 4); \
+ ROUND2(B, C, D, E, A, F2, 9, in(13), 4); \
+ ROUND2(A, B, C, D, E, F2, 12, in( 9), 4); \
+ ROUND2(E, A, B, C, D, F2, 5, in( 7), 4); \
+ ROUND2(D, E, A, B, C, F2, 15, in(10), 4); \
+ ROUND2(C, D, E, A, B, F2, 8, in(14), 4); \
+ \
+ ROUND2(B, C, D, E, A, F1, 8, in(12), 5); \
+ ROUND2(A, B, C, D, E, F1, 5, in(15), 5); \
+ ROUND2(E, A, B, C, D, F1, 12, in(10), 5); \
+ ROUND2(D, E, A, B, C, F1, 9, in( 4), 5); \
+ ROUND2(C, D, E, A, B, F1, 12, in( 1), 5); \
+ ROUND2(B, C, D, E, A, F1, 5, in( 5), 5); \
+ ROUND2(A, B, C, D, E, F1, 14, in( 8), 5); \
+ ROUND2(E, A, B, C, D, F1, 6, in( 7), 5); \
+ ROUND2(D, E, A, B, C, F1, 8, in( 6), 5); \
+ ROUND2(C, D, E, A, B, F1, 13, in( 2), 5); \
+ ROUND2(B, C, D, E, A, F1, 6, in(13), 5); \
+ ROUND2(A, B, C, D, E, F1, 5, in(14), 5); \
+ ROUND2(E, A, B, C, D, F1, 15, in( 0), 5); \
+ ROUND2(D, E, A, B, C, F1, 13, in( 3), 5); \
+ ROUND2(C, D, E, A, B, F1, 11, in( 9), 5); \
+ ROUND2(B, C, D, E, A, F1, 11, in(11), 5); \
+ \
+ tmp = SPH_T32((h)[1] + C1 + D2); \
+ (h)[1] = SPH_T32((h)[2] + D1 + E2); \
+ (h)[2] = SPH_T32((h)[3] + E1 + A2); \
+ (h)[3] = SPH_T32((h)[4] + A1 + B2); \
+ (h)[4] = SPH_T32((h)[0] + B1 + C2); \
+ (h)[0] = tmp; \
+ } while (0)
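+
+/*
+ * The final five additions merge the two independent computation lines
+ * cross-wise with the incoming chaining value, the feed-forward
+ * prescribed by the RIPEMD-160 design so that neither line alone
+ * determines the output.
+ */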
+
+/*
+ * One round of RIPEMD-160. The data must be aligned for 32-bit access.
+ */
+static void
+ripemd160_round(const unsigned char *data, sph_u32 r[5])
+{
+#if SPH_LITTLE_FAST
+
+#define RIPEMD160_IN(x) sph_dec32le_aligned(data + (4 * (x)))
+
+#else
+
+ sph_u32 X_var[16];
+ int i;
+
+ for (i = 0; i < 16; i ++)
+ X_var[i] = sph_dec32le_aligned(data + 4 * i);
+#define RIPEMD160_IN(x) X_var[x]
+
+#endif
+ RIPEMD160_ROUND_BODY(RIPEMD160_IN, r);
+#undef RIPEMD160_IN
+}
+
+/* see sph_ripemd.h */
+void
+sph_ripemd160_init(void *cc)
+{
+ sph_ripemd160_context *sc;
+
+ sc = cc;
+ memcpy(sc->val, IV, sizeof sc->val);
+#if SPH_64
+ sc->count = 0;
+#else
+ sc->count_high = sc->count_low = 0;
+#endif
+}
+
+#define RFUN ripemd160_round
+#define HASH ripemd160
+#define LE32 1
+#include "md_helper.c"
+#undef RFUN
+#undef HASH
+#undef LE32
+
+/* see sph_ripemd.h */
+void
+sph_ripemd160_close(void *cc, void *dst)
+{
+ ripemd160_close(cc, dst, 5);
+ sph_ripemd160_init(cc);
+}
+
+/* see sph_ripemd.h */
+void
+sph_ripemd160_comp(const sph_u32 msg[16], sph_u32 val[5])
+{
+#define RIPEMD160_IN(x) msg[x]
+ RIPEMD160_ROUND_BODY(RIPEMD160_IN, val);
+#undef RIPEMD160_IN
+}
diff --git a/sph/sph_ripemd.h b/sph/sph_ripemd.h
new file mode 100644
index 0000000..f1f0982
--- /dev/null
+++ b/sph/sph_ripemd.h
@@ -0,0 +1,273 @@
+/* $Id: sph_ripemd.h 216 2010-06-08 09:46:57Z tp $ */
+/**
+ * RIPEMD, RIPEMD-128 and RIPEMD-160 interface.
+ *
+ * RIPEMD was first described in: Research and Development in Advanced
+ * Communication Technologies in Europe, "RIPE Integrity Primitives:
+ * Final Report of RACE Integrity Primitives Evaluation (R1040)", RACE,
+ * June 1992.
+ *
+ * A new, strengthened version, dubbed RIPEMD-160, was published in: H.
+ * Dobbertin, A. Bosselaers, and B. Preneel, "RIPEMD-160, a strengthened
+ * version of RIPEMD", Fast Software Encryption - FSE'96, LNCS 1039,
+ * Springer (1996), pp. 71--82.
+ *
+ * This article describes both RIPEMD-160, with a 160-bit output, and a
+ * reduced version called RIPEMD-128, which has a 128-bit output. RIPEMD-128
+ * was meant as a "drop-in" replacement for any hash function with 128-bit
+ * output, especially the original RIPEMD.
+ *
+ * @warning Collisions, and an efficient method to build other collisions,
+ * have been published for the original RIPEMD, which is thus considered
+ * cryptographically broken. It is also very rarely encountered, and there
+ * seems to exist no free description or implementation of RIPEMD (except
+ * the sphlib code, of course). As of January 2007, RIPEMD-128 and RIPEMD-160
+ * seem as secure as their output length allows.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010 Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @file sph_ripemd.h
+ * @author Thomas Pornin
+ */
+
+#ifndef SPH_RIPEMD_H__
+#define SPH_RIPEMD_H__
+
+#include <stddef.h>
+#include "sph_types.h"
+
+/**
+ * Output size (in bits) for RIPEMD.
+ */
+#define SPH_SIZE_ripemd 128
+
+/**
+ * Output size (in bits) for RIPEMD-128.
+ */
+#define SPH_SIZE_ripemd128 128
+
+/**
+ * Output size (in bits) for RIPEMD-160.
+ */
+#define SPH_SIZE_ripemd160 160
+
+/**
+ * This structure is a context for RIPEMD computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * a RIPEMD computation has been performed, the context can be reused for
+ * another computation.
+ *
+ * The contents of this structure are private. A running RIPEMD computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+ unsigned char buf[64]; /* first field, for alignment */
+ sph_u32 val[4];
+#if SPH_64
+ sph_u64 count;
+#else
+ sph_u32 count_high, count_low;
+#endif
+#endif
+} sph_ripemd_context;
+
+/**
+ * Initialize a RIPEMD context. This process performs no memory allocation.
+ *
+ * @param cc the RIPEMD context (pointer to
+ * a <code>sph_ripemd_context</code>)
+ */
+void sph_ripemd_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc the RIPEMD context
+ * @param data the input data
+ * @param len the input data length (in bytes)
+ */
+void sph_ripemd(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current RIPEMD computation and output the result into the
+ * provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (16 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc the RIPEMD context
+ * @param dst the destination buffer
+ */
+void sph_ripemd_close(void *cc, void *dst);
+
+/**
+ * Apply the RIPEMD compression function on the provided data. The
+ * <code>msg</code> parameter contains the 16 32-bit input blocks,
+ * as numerical values (hence after the little-endian decoding). The
+ * <code>val</code> parameter contains the 4 32-bit input blocks for
+ * the compression function; the output is written in place in this
+ * array.
+ *
+ * @param msg the message block (16 values)
+ * @param val the function 128-bit input and output
+ */
+void sph_ripemd_comp(const sph_u32 msg[16], sph_u32 val[4]);
+
+/* ===================================================================== */
+
+/**
+ * This structure is a context for RIPEMD-128 computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * a RIPEMD-128 computation has been performed, the context can be reused for
+ * another computation.
+ *
+ * The contents of this structure are private. A running RIPEMD-128 computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+ unsigned char buf[64]; /* first field, for alignment */
+ sph_u32 val[4];
+#if SPH_64
+ sph_u64 count;
+#else
+ sph_u32 count_high, count_low;
+#endif
+#endif
+} sph_ripemd128_context;
+
+/**
+ * Initialize a RIPEMD-128 context. This process performs no memory allocation.
+ *
+ * @param cc the RIPEMD-128 context (pointer to
+ * a <code>sph_ripemd128_context</code>)
+ */
+void sph_ripemd128_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc the RIPEMD-128 context
+ * @param data the input data
+ * @param len the input data length (in bytes)
+ */
+void sph_ripemd128(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current RIPEMD-128 computation and output the result into the
+ * provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (16 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc the RIPEMD-128 context
+ * @param dst the destination buffer
+ */
+void sph_ripemd128_close(void *cc, void *dst);
+
+/**
+ * Apply the RIPEMD-128 compression function on the provided data. The
+ * <code>msg</code> parameter contains the 16 32-bit input blocks,
+ * as numerical values (hence after the little-endian decoding). The
+ * <code>val</code> parameter contains the 4 32-bit input blocks for
+ * the compression function; the output is written in place in this
+ * array.
+ *
+ * @param msg the message block (16 values)
+ * @param val the function 128-bit input and output
+ */
+void sph_ripemd128_comp(const sph_u32 msg[16], sph_u32 val[4]);
+
+/* ===================================================================== */
+
+/**
+ * This structure is a context for RIPEMD-160 computations: it contains the
+ * intermediate values and some data from the last entered block. Once
+ * a RIPEMD-160 computation has been performed, the context can be reused for
+ * another computation.
+ *
+ * The contents of this structure are private. A running RIPEMD-160 computation
+ * can be cloned by copying the context (e.g. with a simple
+ * <code>memcpy()</code>).
+ */
+typedef struct {
+#ifndef DOXYGEN_IGNORE
+ unsigned char buf[64]; /* first field, for alignment */
+ sph_u32 val[5];
+#if SPH_64
+ sph_u64 count;
+#else
+ sph_u32 count_high, count_low;
+#endif
+#endif
+} sph_ripemd160_context;
+
+/**
+ * Initialize a RIPEMD-160 context. This process performs no memory allocation.
+ *
+ * @param cc the RIPEMD-160 context (pointer to
+ * a <code>sph_ripemd160_context</code>)
+ */
+void sph_ripemd160_init(void *cc);
+
+/**
+ * Process some data bytes. It is acceptable that <code>len</code> is zero
+ * (in which case this function does nothing).
+ *
+ * @param cc the RIPEMD-160 context
+ * @param data the input data
+ * @param len the input data length (in bytes)
+ */
+void sph_ripemd160(void *cc, const void *data, size_t len);
+
+/**
+ * Terminate the current RIPEMD-160 computation and output the result into the
+ * provided buffer. The destination buffer must be wide enough to
+ * accommodate the result (20 bytes). The context is automatically
+ * reinitialized.
+ *
+ * @param cc the RIPEMD-160 context
+ * @param dst the destination buffer
+ */
+void sph_ripemd160_close(void *cc, void *dst);
+
+/**
+ * Apply the RIPEMD-160 compression function on the provided data. The
+ * <code>msg</code> parameter contains the 16 32-bit input blocks,
+ * as numerical values (hence after the little-endian decoding). The
+ * <code>val</code> parameter contains the 5 32-bit input blocks for
+ * the compression function; the output is written in place in this
+ * array.
+ *
+ * @param msg the message block (16 values)
+ * @param val the function 160-bit input and output
+ */
+void sph_ripemd160_comp(const sph_u32 msg[16], sph_u32 val[5]);
+
+#endif
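
A quick self-test of the streaming API declared above; the published
RIPEMD-160 test vector for "abc" is
8eb208f7e05d987a9b044a8e98c6b087f15a0bfc (standalone sketch, not part of
the patch):

    #include <stdio.h>
    #include "sph_ripemd.h"

    int main(void)
    {
        unsigned char dgst[20];
        sph_ripemd160_context ctx;

        sph_ripemd160_init(&ctx);
        sph_ripemd160(&ctx, "abc", 3);
        sph_ripemd160_close(&ctx, dgst); /* context auto-reinitializes */

        for (int i = 0; i < 20; i++)
            printf("%02x", dgst[i]);
        printf("\n");
        return 0;
    }
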
diff --git a/sph/sph_sha2.c b/sph/sph_sha2.c
new file mode 100644
index 0000000..aab2c55
--- /dev/null
+++ b/sph/sph_sha2.c
@@ -0,0 +1,691 @@
+/* $Id: sha2.c 227 2010-06-16 17:28:38Z tp $ */
+/*
+ * SHA-224 / SHA-256 implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2007-2010 Projet RNRT SAPHIR
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ *
+ * @author Thomas Pornin
+ */
+
+#include <stddef.h>
+#include <string.h>
+
+#include "sph_sha2.h"
+
+#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SHA2
+#define SPH_SMALL_FOOTPRINT_SHA2 1
+#endif
+
+#define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z))
+#define MAJ(X, Y, Z) (((Y) & (Z)) | (((Y) | (Z)) & (X)))
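+
+/*
+ * CH and MAJ are the standard SHA-2 "choice" and "majority" functions,
+ * rewritten from the textbook (X & Y) ^ (~X & Z) and
+ * (X & Y) ^ (X & Z) ^ (Y & Z) forms to save a few operations.
+ */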
+
+#define ROTR SPH_ROTR32
+
+#define BSG2_0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
+#define BSG2_1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
+#define SSG2_0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SPH_T32((x) >> 3))
+#define SSG2_1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SPH_T32((x) >> 10))
+
+static const sph_u32 H224[8] = {
+ SPH_C32(0xC1059ED8), SPH_C32(0x367CD507), SPH_C32(0x3070DD17),
+ SPH_C32(0xF70E5939), SPH_C32(0xFFC00B31), SPH_C32(0x68581511),
+ SPH_C32(0x64F98FA7), SPH_C32(0xBEFA4FA4)
+};
+
+static const sph_u32 H256[8] = {
+ SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85), SPH_C32(0x3C6EF372),
+ SPH_C32(0xA54FF53A), SPH_C32(0x510E527F), SPH_C32(0x9B05688C),
+ SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19)
+};
+
+/*
+ * The SHA2_ROUND_BODY defines the body for a SHA-224 / SHA-256
+ * compression function implementation. The "in" parameter should
+ * evaluate, when applied to a numerical input parameter from 0 to 15,
+ * to an expression which yields the corresponding input block. The "r"
+ * parameter should evaluate to an array or pointer expression
+ * designating the array of 8 words which contains the input and output
+ * of the compression function.
+ */
+
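+/*
+ * Two such bodies follow: a compact variant that loops over the
+ * 64-entry K[] constant table (SPH_SMALL_FOOTPRINT_SHA2), and a fully
+ * unrolled variant with each round constant inlined at its step.
+ */
+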
+#if SPH_SMALL_FOOTPRINT_SHA2
+
+static const sph_u32 K[64] = {
+ SPH_C32(0x428A2F98), SPH_C32(0x71374491),
+ SPH_C32(0xB5C0FBCF), SPH_C32(0xE9B5DBA5),
+ SPH_C32(0x3956C25B), SPH_C32(0x59F111F1),
+ SPH_C32(0x923F82A4), SPH_C32(0xAB1C5ED5),
+ SPH_C32(0xD807AA98), SPH_C32(0x12835B01),
+ SPH_C32(0x243185BE), SPH_C32(0x550C7DC3),
+ SPH_C32(0x72BE5D74), SPH_C32(0x80DEB1FE),
+ SPH_C32(0x9BDC06A7), SPH_C32(0xC19BF174),
+ SPH_C32(0xE49B69C1), SPH_C32(0xEFBE4786),
+ SPH_C32(0x0FC19DC6), SPH_C32(0x240CA1CC),
+ SPH_C32(0x2DE92C6F), SPH_C32(0x4A7484AA),
+ SPH_C32(0x5CB0A9DC), SPH_C32(0x76F988DA),
+ SPH_C32(0x983E5152), SPH_C32(0xA831C66D),
+ SPH_C32(0xB00327C8), SPH_C32(0xBF597FC7),
+ SPH_C32(0xC6E00BF3), SPH_C32(0xD5A79147),
+ SPH_C32(0x06CA6351), SPH_C32(0x14292967),
+ SPH_C32(0x27B70A85), SPH_C32(0x2E1B2138),
+ SPH_C32(0x4D2C6DFC), SPH_C32(0x53380D13),
+ SPH_C32(0x650A7354), SPH_C32(0x766A0ABB),
+ SPH_C32(0x81C2C92E), SPH_C32(0x92722C85),
+ SPH_C32(0xA2BFE8A1), SPH_C32(0xA81A664B),
+ SPH_C32(0xC24B8B70), SPH_C32(0xC76C51A3),
+ SPH_C32(0xD192E819), SPH_C32(0xD6990624),
+ SPH_C32(0xF40E3585), SPH_C32(0x106AA070),
+ SPH_C32(0x19A4C116), SPH_C32(0x1E376C08),
+ SPH_C32(0x2748774C), SPH_C32(0x34B0BCB5),
+ SPH_C32(0x391C0CB3), SPH_C32(0x4ED8AA4A),
+ SPH_C32(0x5B9CCA4F), SPH_C32(0x682E6FF3),
+ SPH_C32(0x748F82EE), SPH_C32(0x78A5636F),
+ SPH_C32(0x84C87814), SPH_C32(0x8CC70208),
+ SPH_C32(0x90BEFFFA), SPH_C32(0xA4506CEB),
+ SPH_C32(0xBEF9A3F7), SPH_C32(0xC67178F2)
+};
+
+#define SHA2_MEXP1(in, pc) do { \
+ W[pc] = in(pc); \
+ } while (0)
+
+#define SHA2_MEXP2(in, pc) do { \
+ W[(pc) & 0x0F] = SPH_T32(SSG2_1(W[((pc) - 2) & 0x0F]) \
+ + W[((pc) - 7) & 0x0F] \
+ + SSG2_0(W[((pc) - 15) & 0x0F]) + W[(pc) & 0x0F]); \
+ } while (0)
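+
+/*
+ * SHA2_MEXP2 computes the SHA-256 message schedule
+ * W[t] = SSG2_1(W[t-2]) + W[t-7] + SSG2_0(W[t-15]) + W[t-16]
+ * in place over a 16-word ring buffer (indices taken modulo 16), so
+ * the compact variant needs only 64 bytes of schedule storage.
+ */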
+
+#define SHA2_STEPn(n, a, b, c, d, e, f, g, h, in, pc) do { \
+ sph_u32 t1, t2; \
+ SHA2_MEXP ## n(in, pc); \
+ t1 = SPH_T32(h + BSG2_1(e) + CH(e, f, g) \
+ + K[pcount + (pc)] + W[(pc) & 0x0F]); \
+ t2 = SPH_T32(BSG2_0(a) + MAJ(a, b, c)); \
+ d = SPH_T32(d + t1); \
+ h = SPH_T32(t1 + t2); \
+ } while (0)
+
+#define SHA2_STEP1(a, b, c, d, e, f, g, h, in, pc) \
+ SHA2_STEPn(1, a, b, c, d, e, f, g, h, in, pc)
+#define SHA2_STEP2(a, b, c, d, e, f, g, h, in, pc) \
+ SHA2_STEPn(2, a, b, c, d, e, f, g, h, in, pc)
+
+#define SHA2_ROUND_BODY(in, r) do { \
+ sph_u32 A, B, C, D, E, F, G, H; \
+ sph_u32 W[16]; \
+ unsigned pcount; \
+ \
+ A = (r)[0]; \
+ B = (r)[1]; \
+ C = (r)[2]; \
+ D = (r)[3]; \
+ E = (r)[4]; \
+ F = (r)[5]; \
+ G = (r)[6]; \
+ H = (r)[7]; \
+ pcount = 0; \
+ SHA2_STEP1(A, B, C, D, E, F, G, H, in, 0); \
+ SHA2_STEP1(H, A, B, C, D, E, F, G, in, 1); \
+ SHA2_STEP1(G, H, A, B, C, D, E, F, in, 2); \
+ SHA2_STEP1(F, G, H, A, B, C, D, E, in, 3); \
+ SHA2_STEP1(E, F, G, H, A, B, C, D, in, 4); \
+ SHA2_STEP1(D, E, F, G, H, A, B, C, in, 5); \
+ SHA2_STEP1(C, D, E, F, G, H, A, B, in, 6); \
+ SHA2_STEP1(B, C, D, E, F, G, H, A, in, 7); \
+ SHA2_STEP1(A, B, C, D, E, F, G, H, in, 8); \
+ SHA2_STEP1(H, A, B, C, D, E, F, G, in, 9); \
+ SHA2_STEP1(G, H, A, B, C, D, E, F, in, 10); \
+ SHA2_STEP1(F, G, H, A, B, C, D, E, in, 11); \
+ SHA2_STEP1(E, F, G, H, A, B, C, D, in, 12); \
+ SHA2_STEP1(D, E, F, G, H, A, B, C, in, 13); \
+ SHA2_STEP1(C, D, E, F, G, H, A, B, in, 14); \
+ SHA2_STEP1(B, C, D, E, F, G, H, A, in, 15); \
+ for (pcount = 16; pcount < 64; pcount += 16) { \
+ SHA2_STEP2(A, B, C, D, E, F, G, H, in, 0); \
+ SHA2_STEP2(H, A, B, C, D, E, F, G, in, 1); \
+ SHA2_STEP2(G, H, A, B, C, D, E, F, in, 2); \
+ SHA2_STEP2(F, G, H, A, B, C, D, E, in, 3); \
+ SHA2_STEP2(E, F, G, H, A, B, C, D, in, 4); \
+ SHA2_STEP2(D, E, F, G, H, A, B, C, in, 5); \
+ SHA2_STEP2(C, D, E, F, G, H, A, B, in, 6); \
+ SHA2_STEP2(B, C, D, E, F, G, H, A, in, 7); \
+ SHA2_STEP2(A, B, C, D, E, F, G, H, in, 8); \
+ SHA2_STEP2(H, A, B, C, D, E, F, G, in, 9); \
+ SHA2_STEP2(G, H, A, B, C, D, E, F, in, 10); \
+ SHA2_STEP2(F, G, H, A, B, C, D, E, in, 11); \
+ SHA2_STEP2(E, F, G, H, A, B, C, D, in, 12); \
+ SHA2_STEP2(D, E, F, G, H, A, B, C, in, 13); \
+ SHA2_STEP2(C, D, E, F, G, H, A, B, in, 14); \
+ SHA2_STEP2(B, C, D, E, F, G, H, A, in, 15); \
+ } \
+ (r)[0] = SPH_T32((r)[0] + A); \
+ (r)[1] = SPH_T32((r)[1] + B); \
+ (r)[2] = SPH_T32((r)[2] + C); \
+ (r)[3] = SPH_T32((r)[3] + D); \
+ (r)[4] = SPH_T32((r)[4] + E); \
+ (r)[5] = SPH_T32((r)[5] + F); \
+ (r)[6] = SPH_T32((r)[6] + G); \
+ (r)[7] = SPH_T32((r)[7] + H); \
+ } while (0)
+
+#else
+
+#define SHA2_ROUND_BODY(in, r) do { \
+ sph_u32 A, B, C, D, E, F, G, H, T1, T2; \
+ sph_u32 W00, W01, W02, W03, W04, W05, W06, W07; \
+ sph_u32 W08, W09, W10, W11, W12, W13, W14, W15; \
+ int i; \
+ \
+ A = (r)[0]; \
+ B = (r)[1]; \
+ C = (r)[2]; \
+ D = (r)[3]; \
+ E = (r)[4]; \
+ F = (r)[5]; \
+ G = (r)[6]; \
+ H = (r)[7]; \
+ W00 = in(0); \
+ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ + SPH_C32(0x428A2F98) + W00); \
+ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
+ D = SPH_T32(D + T1); \
+ H = SPH_T32(T1 + T2); \
+ W01 = in(1); \
+ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ + SPH_C32(0x71374491) + W01); \
+ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
+ C = SPH_T32(C + T1); \
+ G = SPH_T32(T1 + T2); \
+ W02 = in(2); \
+ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ + SPH_C32(0xB5C0FBCF) + W02); \
+ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
+ B = SPH_T32(B + T1); \
+ F = SPH_T32(T1 + T2); \
+ W03 = in(3); \
+ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ + SPH_C32(0xE9B5DBA5) + W03); \
+ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
+ A = SPH_T32(A + T1); \
+ E = SPH_T32(T1 + T2); \
+ W04 = in(4); \
+ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ + SPH_C32(0x3956C25B) + W04); \
+ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
+ H = SPH_T32(H + T1); \
+ D = SPH_T32(T1 + T2); \
+ W05 = in(5); \
+ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ + SPH_C32(0x59F111F1) + W05); \
+ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
+ G = SPH_T32(G + T1); \
+ C = SPH_T32(T1 + T2); \
+ W06 = in(6); \
+ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ + SPH_C32(0x923F82A4) + W06); \
+ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
+ F = SPH_T32(F + T1); \
+ B = SPH_T32(T1 + T2); \
+ W07 = in(7); \
+ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ + SPH_C32(0xAB1C5ED5) + W07); \
+ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
+ E = SPH_T32(E + T1); \
+ A = SPH_T32(T1 + T2); \
+ W08 = in(8); \
+ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ + SPH_C32(0xD807AA98) + W08); \
+ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
+ D = SPH_T32(D + T1); \
+ H = SPH_T32(T1 + T2); \
+ W09 = in(9); \
+ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ + SPH_C32(0x12835B01) + W09); \
+ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
+ C = SPH_T32(C + T1); \
+ G = SPH_T32(T1 + T2); \
+ W10 = in(10); \
+ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ + SPH_C32(0x243185BE) + W10); \
+ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
+ B = SPH_T32(B + T1); \
+ F = SPH_T32(T1 + T2); \
+ W11 = in(11); \
+ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ + SPH_C32(0x550C7DC3) + W11); \
+ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
+ A = SPH_T32(A + T1); \
+ E = SPH_T32(T1 + T2); \
+ W12 = in(12); \
+ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ + SPH_C32(0x72BE5D74) + W12); \
+ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
+ H = SPH_T32(H + T1); \
+ D = SPH_T32(T1 + T2); \
+ W13 = in(13); \
+ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ + SPH_C32(0x80DEB1FE) + W13); \
+ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
+ G = SPH_T32(G + T1); \
+ C = SPH_T32(T1 + T2); \
+ W14 = in(14); \
+ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ + SPH_C32(0x9BDC06A7) + W14); \
+ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
+ F = SPH_T32(F + T1); \
+ B = SPH_T32(T1 + T2); \
+ W15 = in(15); \
+ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ + SPH_C32(0xC19BF174) + W15); \
+ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
+ E = SPH_T32(E + T1); \
+ A = SPH_T32(T1 + T2); \
+ W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
+ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ + SPH_C32(0xE49B69C1) + W00); \
+ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
+ D = SPH_T32(D + T1); \
+ H = SPH_T32(T1 + T2); \
+ W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
+ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ + SPH_C32(0xEFBE4786) + W01); \
+ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
+ C = SPH_T32(C + T1); \
+ G = SPH_T32(T1 + T2); \
+ W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
+ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ + SPH_C32(0x0FC19DC6) + W02); \
+ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
+ B = SPH_T32(B + T1); \
+ F = SPH_T32(T1 + T2); \
+ W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
+ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ + SPH_C32(0x240CA1CC) + W03); \
+ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
+ A = SPH_T32(A + T1); \
+ E = SPH_T32(T1 + T2); \
+ W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
+ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ + SPH_C32(0x2DE92C6F) + W04); \
+ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
+ H = SPH_T32(H + T1); \
+ D = SPH_T32(T1 + T2); \
+ W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
+ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ + SPH_C32(0x4A7484AA) + W05); \
+ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
+ G = SPH_T32(G + T1); \
+ C = SPH_T32(T1 + T2); \
+ W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
+ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ + SPH_C32(0x5CB0A9DC) + W06); \
+ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
+ F = SPH_T32(F + T1); \
+ B = SPH_T32(T1 + T2); \
+ W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
+ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ + SPH_C32(0x76F988DA) + W07); \
+ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
+ E = SPH_T32(E + T1); \
+ A = SPH_T32(T1 + T2); \
+ W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
+ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ + SPH_C32(0x983E5152) + W08); \
+ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
+ D = SPH_T32(D + T1); \
+ H = SPH_T32(T1 + T2); \
+ W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
+ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ + SPH_C32(0xA831C66D) + W09); \
+ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
+ C = SPH_T32(C + T1); \
+ G = SPH_T32(T1 + T2); \
+ W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
+ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ + SPH_C32(0xB00327C8) + W10); \
+ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
+ B = SPH_T32(B + T1); \
+ F = SPH_T32(T1 + T2); \
+ W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
+ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ + SPH_C32(0xBF597FC7) + W11); \
+ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
+ A = SPH_T32(A + T1); \
+ E = SPH_T32(T1 + T2); \
+ W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
+ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ + SPH_C32(0xC6E00BF3) + W12); \
+ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
+ H = SPH_T32(H + T1); \
+ D = SPH_T32(T1 + T2); \
+ W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
+ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ + SPH_C32(0xD5A79147) + W13); \
+ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
+ G = SPH_T32(G + T1); \
+ C = SPH_T32(T1 + T2); \
+ W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
+ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ + SPH_C32(0x06CA6351) + W14); \
+ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
+ F = SPH_T32(F + T1); \
+ B = SPH_T32(T1 + T2); \
+ W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
+ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ + SPH_C32(0x14292967) + W15); \
+ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
+ E = SPH_T32(E + T1); \
+ A = SPH_T32(T1 + T2); \
+ W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
+ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ + SPH_C32(0x27B70A85) + W00); \
+ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
+ D = SPH_T32(D + T1); \
+ H = SPH_T32(T1 + T2); \
+ W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
+ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ + SPH_C32(0x2E1B2138) + W01); \
+ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
+ C = SPH_T32(C + T1); \
+ G = SPH_T32(T1 + T2); \
+ W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
+ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ + SPH_C32(0x4D2C6DFC) + W02); \
+ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
+ B = SPH_T32(B + T1); \
+ F = SPH_T32(T1 + T2); \
+ W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
+ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ + SPH_C32(0x53380D13) + W03); \
+ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
+ A = SPH_T32(A + T1); \
+ E = SPH_T32(T1 + T2); \
+ W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
+ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ + SPH_C32(0x650A7354) + W04); \
+ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
+ H = SPH_T32(H + T1); \
+ D = SPH_T32(T1 + T2); \
+ W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
+ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ + SPH_C32(0x766A0ABB) + W05); \
+ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
+ G = SPH_T32(G + T1); \
+ C = SPH_T32(T1 + T2); \
+ W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
+ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ + SPH_C32(0x81C2C92E) + W06); \
+ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
+ F = SPH_T32(F + T1); \
+ B = SPH_T32(T1 + T2); \
+ W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
+ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ + SPH_C32(0x92722C85) + W07); \
+ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
+ E = SPH_T32(E + T1); \
+ A = SPH_T32(T1 + T2); \
+ W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
+ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ + SPH_C32(0xA2BFE8A1) + W08); \
+ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
+ D = SPH_T32(D + T1); \
+ H = SPH_T32(T1 + T2); \
+ W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
+ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ + SPH_C32(0xA81A664B) + W09); \
+ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
+ C = SPH_T32(C + T1); \
+ G = SPH_T32(T1 + T2); \
+ W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
+ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ + SPH_C32(0xC24B8B70) + W10); \
+ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
+ B = SPH_T32(B + T1); \
+ F = SPH_T32(T1 + T2); \
+ W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
+ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ + SPH_C32(0xC76C51A3) + W11); \
+ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
+ A = SPH_T32(A + T1); \
+ E = SPH_T32(T1 + T2); \
+ W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
+ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ + SPH_C32(0xD192E819) + W12); \
+ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
+ H = SPH_T32(H + T1); \
+ D = SPH_T32(T1 + T2); \
+ W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
+ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ + SPH_C32(0xD6990624) + W13); \
+ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
+ G = SPH_T32(G + T1); \
+ C = SPH_T32(T1 + T2); \
+ W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
+ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ + SPH_C32(0xF40E3585) + W14); \
+ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
+ F = SPH_T32(F + T1); \
+ B = SPH_T32(T1 + T2); \
+ W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
+ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ + SPH_C32(0x106AA070) + W15); \
+ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
+ E = SPH_T32(E + T1); \
+ A = SPH_T32(T1 + T2); \
+ W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
+ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ + SPH_C32(0x19A4C116) + W00); \
+ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
+ D = SPH_T32(D + T1); \
+ H = SPH_T32(T1 + T2); \
+ W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
+ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ + SPH_C32(0x1E376C08) + W01); \
+ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
+ C = SPH_T32(C + T1); \
+ G = SPH_T32(T1 + T2); \
+ W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
+ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ + SPH_C32(0x2748774C) + W02); \
+ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
+ B = SPH_T32(B + T1); \
+ F = SPH_T32(T1 + T2); \
+ W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
+ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ + SPH_C32(0x34B0BCB5) + W03); \
+ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
+ A = SPH_T32(A + T1); \
+ E = SPH_T32(T1 + T2); \
+ W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
+ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ + SPH_C32(0x391C0CB3) + W04); \
+ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
+ H = SPH_T32(H + T1); \
+ D = SPH_T32(T1 + T2); \
+ W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
+ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ + SPH_C32(0x4ED8AA4A) + W05); \
+ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
+ G = SPH_T32(G + T1); \
+ C = SPH_T32(T1 + T2); \
+ W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
+ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ + SPH_C32(0x5B9CCA4F) + W06); \
+ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
+ F = SPH_T32(F + T1); \
+ B = SPH_T32(T1 + T2); \
+ W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
+ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ + SPH_C32(0x682E6FF3) + W07); \
+ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
+ E = SPH_T32(E + T1); \
+ A = SPH_T32(T1 + T2); \
+ W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
+ T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
+ + SPH_C32(0x748F82EE) + W08); \
+ T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
+ D = SPH_T32(D + T1); \
+ H = SPH_T32(T1 + T2); \
+ W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
+ T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
+ + SPH_C32(0x78A5636F) + W09); \
+ T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
+ C = SPH_T32(C + T1); \
+ G = SPH_T32(T1 + T2); \
+ W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
+ T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
+ + SPH_C32(0x84C87814) + W10); \
+ T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
+ B = SPH_T32(B + T1); \
+ F = SPH_T32(T1 + T2); \
+ W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
+ T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
+ + SPH_C32(0x8CC70208) + W11); \
+ T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
+ A = SPH_T32(A + T1); \
+ E = SPH_T32(T1 + T2); \
+ W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
+ T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
+ + SPH_C32(0x90BEFFFA) + W12); \
+ T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
+ H = SPH_T32(H + T1); \
+ D = SPH_T32(T1 + T2); \
+ W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
+ T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
+ + SPH_C32(0xA4506CEB) + W13); \
+ T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
+ G = SPH_T32(G + T1); \
+ C = SPH_T32(T1 + T2); \
+ W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
+ T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
+ + SPH_C32(0xBEF9A3F7) + W14); \
+ T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
+ F = SPH_T32(F + T1); \
+ B = SPH_T32(T1 + T2); \
+ W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
+ T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
+ + SPH_C32(0xC67178F2) + W15); \
+ T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
+ E = SPH_T32(E + T1); \
+ A = SPH_T32(T1 + T2); \
+ (r)[0] = SPH_T32((r)[0] + A); \
+ (r)[1] = SPH_T32((r)[1] + B); \
+ (r)[2] = SPH_T32((r)[2] + C); \
+ (r)[3] = SPH_T32((r)[3] + D); \
+ (r)[4] = SPH_T32((r)[4] + E); \
+ (r)[5] = SPH_T32((r)[5] + F); \
+ (r)[6] = SPH_T32((r)[6] + G); \
+ (r)[7] = SPH_T32((r)[7] + H); \
+ } while (0)
+
+#endif
+
+/*
+ * One complete compression (all 64 rounds) of SHA-224 / SHA-256 over a
+ * 64-byte block. The data must be aligned for 32-bit access.
+ */
+static void
+sha2_round(const unsigned char *data, sph_u32 r[8])
+{
+#define SHA2_IN(x) sph_dec32be_aligned(data + (4 * (x)))
+ SHA2_ROUND_BODY(SHA2_IN, r);
+#undef SHA2_IN
+}
+
+/* see sph_sha2.h */
+void
+sph_sha224_init(void *cc)
+{
+ sph_sha224_context *sc;
+
+ sc = cc;
+ memcpy(sc->val, H224, sizeof H224);
+#if SPH_64
+ sc->count = 0;
+#else
+ sc->count_high = sc->count_low = 0;
+#endif
+}
+
+/* see sph_sha2.h */
+void
+sph_sha256_init(void *cc)
+{
+ sph_sha256_context *sc;
+
+ sc = cc;
+ memcpy(sc->val, H256, sizeof H256);
+#if SPH_64
+ sc->count = 0;
+#else
+ sc->count_high = sc->count_low = 0;
+#endif
+}
+
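+/*
+ * md_helper.c instantiates the generic MD-style padding and close
+ * routines (sha224_close, sha224_addbits_and_close) from these
+ * macros; BE32 selects big-endian output encoding.
+ */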
+#define RFUN sha2_round
+#define HASH sha224
+#define BE32 1
+#include "md_helper.c"
+
+/* see sph_sha2.h */
+void
+sph_sha224_close(void *cc, void *dst)
+{
+ sha224_close(cc, dst, 7);
+ sph_sha224_init(cc);
+}
+
+/* see sph_sha2.h */
+void
+sph_sha224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+ sha224_addbits_and_close(cc, ub, n, dst, 7);
+ sph_sha224_init(cc);
+}
+
+/* see sph_sha2.h */
+void
+sph_sha256_close(void *cc, void *dst)
+{
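+ /* SHA-224 and SHA-256 share the same close helper; only the
+  * number of output words differs (8 here vs 7 for SHA-224) */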
+ sha224_close(cc, dst, 8);
+ sph_sha256_init(cc);
+}
+
+/* see sph_sha2.h */
+void
+sph_sha256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
+{
+ sha224_addbits_and_close(cc, ub, n, dst, 8);
+ sph_sha256_init(cc);
+}
+
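+/* SHA-224 and SHA-256 share a compression function, so the _comp
+ * routine below also serves sph_sha256_comp (aliased in sph_sha2.h) */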
+/* see sph_sha2.h */
+void
+sph_sha224_comp(const sph_u32 msg[16], sph_u32 val[8])
+{
+#define SHA2_IN(x) msg[x]
+ SHA2_ROUND_BODY(SHA2_IN, val);
+#undef SHA2_IN
+}
diff --git a/util.cpp b/util.cpp
index e7b305a..40352c8 100644
--- a/util.cpp
+++ b/util.cpp
@@ -1421,28 +1421,39 @@ static uint32_t getblocheight(struct stratum_ctx *sctx)
static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
{
- const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *stime, *nreward;
+ const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *stime;
+ const char *claim = NULL, *nreward = NULL;
size_t coinb1_size, coinb2_size;
bool clean, ret = false;
- int merkle_count, i;
+ int merkle_count, i, p=0;
json_t *merkle_arr;
uchar **merkle = NULL;
// uchar(*merkle_tree)[32] = { 0 };
int ntime;
-
- job_id = json_string_value(json_array_get(params, 0));
- prevhash = json_string_value(json_array_get(params, 1));
- coinb1 = json_string_value(json_array_get(params, 2));
- coinb2 = json_string_value(json_array_get(params, 3));
- merkle_arr = json_array_get(params, 4);
+ char algo[64] = { 0 };
+ get_currentalgo(algo, sizeof(algo));
+ bool has_claim = !strcasecmp(algo, "lbry");
+
+ job_id = json_string_value(json_array_get(params, p++));
+ prevhash = json_string_value(json_array_get(params, p++));
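+ /* lbry stratum inserts a 64-hex-char claimtrie root between
+  * prevhash and coinb1, shifting all later parameters by one */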
+ if (has_claim) {
+ claim = json_string_value(json_array_get(params, p++));
+ if (!claim || strlen(claim) != 64) {
+ applog(LOG_ERR, "Stratum notify: invalid claim parameter");
+ goto out;
+ }
+ }
+ coinb1 = json_string_value(json_array_get(params, p++));
+ coinb2 = json_string_value(json_array_get(params, p++));
+ merkle_arr = json_array_get(params, p++);
if (!merkle_arr || !json_is_array(merkle_arr))
goto out;
merkle_count = (int) json_array_size(merkle_arr);
- version = json_string_value(json_array_get(params, 5));
- nbits = json_string_value(json_array_get(params, 6));
- stime = json_string_value(json_array_get(params, 7));
- clean = json_is_true(json_array_get(params, 8));
- nreward = json_string_value(json_array_get(params, 9));
+ version = json_string_value(json_array_get(params, p++));
+ nbits = json_string_value(json_array_get(params, p++));
+ stime = json_string_value(json_array_get(params, p++));
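+ /* json_is_true() is a jansson macro and may evaluate its
+  * argument twice, so the p++ stays outside of the call */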
+ clean = json_is_true(json_array_get(params, p)); p++;
+ nreward = json_string_value(json_array_get(params, p++));
if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !stime ||
strlen(prevhash) != 64 || strlen(version) != 8 ||
@@ -1494,6 +1505,7 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
free(sctx->job.job_id);
sctx->job.job_id = strdup(job_id);
hex2bin(sctx->job.prevhash, prevhash, 32);
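+ /* keep the decoded claimtrie root next to prevhash for the lbry block header */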
+ if (has_claim) hex2bin(sctx->job.claim, claim, 32);
sctx->job.height = getblocheight(sctx);
@@ -2079,7 +2091,8 @@ void do_gpu_tests(void)
//scanhash_scrypt_jane(0, &work, NULL, 1, &done, &tv, &tv);
memset(work.data, 0, sizeof(work.data));
- scanhash_decred(0, &work, 1, &done);
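+ // exercise the lbry scanhash on a zeroed work template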
+ work.data[0] = 0;
+ scanhash_lbry(0, &work, 1, &done);
free(work_restart);
work_restart = NULL;
@@ -2142,6 +2155,10 @@ void print_hash_tests(void)
keccak256_hash(&hash[0], &buf[0]);
printpfx("keccak", hash);
+ memset(buf, 0, 128);
+ lbry_hash(&hash[0], &buf[0]);
+ printpfx("lbry", hash);
+
luffa_hash(&hash[0], &buf[0]);
printpfx("luffa", hash);