lbry algo (stratum only)
Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
This commit is contained in:
parent
dbb9507d2b
commit
5703a74e22
@ -50,6 +50,8 @@ ccminer_SOURCES = elist.h miner.h compat.h \
|
||||
sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
|
||||
sph/hamsi.c sph/hamsi_helper.c sph/streebog.c \
|
||||
sph/shabal.c sph/whirlpool.c sph/sha2big.c sph/haval.c \
|
||||
sph/ripemd.c sph/sph_sha2.c \
|
||||
lbry/lbry.cu lbry/cuda_ripemd160.cu lbry/cuda_sha256_lbry.cu lbry/cuda_sha512_lbry.cu \
|
||||
qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \
|
||||
x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
|
||||
x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \
|
||||
|
@ -1,5 +1,5 @@
|
||||
|
||||
ccMiner preview 1.8-dev (May 2016) "Pascal and x11evo algo"
|
||||
ccMiner 1.8 Preview (July 2016) "CUDA 8, lbry and x11evo algos"
|
||||
---------------------------------------------------------------
|
||||
|
||||
***************************************************************
|
||||
@ -28,6 +28,7 @@ Decred (Blake256 14-rounds - 180 bytes)
|
||||
HeavyCoin & MjollnirCoin
|
||||
FugueCoin
|
||||
GroestlCoin & Myriad-Groestl
|
||||
Lbry Credits
|
||||
JackpotCoin
|
||||
QuarkCoin family & AnimeCoin
|
||||
TalkCoin
|
||||
@ -35,13 +36,13 @@ DarkCoin and other X11 coins
|
||||
Chaincoin and Flaxscript (C11)
|
||||
Saffroncoin blake (256 14-rounds)
|
||||
BlakeCoin (256 8-rounds)
|
||||
Midnight (BMW 256)
|
||||
Qubit (Digibyte, ...)
|
||||
Luffa (Joincoin)
|
||||
Keccak (Maxcoin)
|
||||
Pentablake (Blake 512 x5)
|
||||
1Coin Triple S
|
||||
Neoscrypt (FeatherCoin)
|
||||
Revolver (X11evo)
|
||||
Scrypt and Scrypt:N
|
||||
Scrypt-Jane (Chacha)
|
||||
Sibcoin (sib)
|
||||
@ -82,6 +83,7 @@ its command line interface and options.
|
||||
heavy use to mine Heavycoin
|
||||
jackpot use to mine Jackpotcoin
|
||||
keccak use to mine Maxcoin
|
||||
lbry use to mine LBRY Credits
|
||||
luffa use to mine Joincoin
|
||||
lyra2 use to mine Vertcoin
|
||||
mjollnir use to mine Mjollnircoin
|
||||
@ -151,6 +153,7 @@ its command line interface and options.
|
||||
--max-diff=N Only mine if net difficulty is less than specified value
|
||||
--pstate=0 will force the Geforce 9xx to run in P0 P-State
|
||||
--plimit=150W set the gpu power limit, allow multiple values for N cards
|
||||
--tlimit=85 Set the gpu thermal limit (windows only)
|
||||
--keep-clocks prevent reset clocks and/or power limit on exit
|
||||
--show-diff display submitted block and net difficulty
|
||||
-B, --background run the miner in the background
|
||||
@ -242,6 +245,7 @@ features.
|
||||
|
||||
July 2016 v1.8.0
|
||||
Pascal support with cuda 8
|
||||
lbry new multi sha / ripemd algo (LBC)
|
||||
x11evo algo (XRE)
|
||||
Lyra2v2, Neoscrypt and Decred improvements
|
||||
Enhance windows NVAPI clock and power limits
|
||||
|
2
algos.h
2
algos.h
@ -19,6 +19,7 @@ enum sha_algos {
|
||||
ALGO_HEAVY, /* Heavycoin hash */
|
||||
ALGO_KECCAK,
|
||||
ALGO_JACKPOT,
|
||||
ALGO_LBRY,
|
||||
ALGO_LUFFA,
|
||||
ALGO_LYRA2,
|
||||
ALGO_LYRA2v2,
|
||||
@ -67,6 +68,7 @@ static const char *algo_names[] = {
|
||||
"heavy",
|
||||
"keccak",
|
||||
"jackpot",
|
||||
"lbry",
|
||||
"luffa",
|
||||
"lyra2",
|
||||
"lyra2v2",
|
||||
|
25
ccminer.cpp
25
ccminer.cpp
@ -228,6 +228,7 @@ Options:\n\
|
||||
heavy Heavycoin\n\
|
||||
jackpot Jackpot\n\
|
||||
keccak Keccak-256 (Maxcoin)\n\
|
||||
lbry LBRY Credits (Sha/Ripemd)\n\
|
||||
luffa Joincoin\n\
|
||||
lyra2 LyraBar\n\
|
||||
lyra2v2 VertCoin\n\
|
||||
@ -567,6 +568,7 @@ static void calc_network_diff(struct work *work)
|
||||
// sample for diff 43.281 : 1c05ea29
|
||||
// todo: endian reversed on longpoll could be zr5 specific...
|
||||
uint32_t nbits = have_longpoll ? work->data[18] : swab32(work->data[18]);
|
||||
if (opt_algo == ALGO_LBRY) nbits = swab32(work->data[26]);
|
||||
if (opt_algo == ALGO_DECRED) nbits = work->data[29];
|
||||
uint32_t bits = (nbits & 0xffffff);
|
||||
int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
|
||||
@ -837,6 +839,11 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
|
||||
le32enc(&nonce, work->data[19]);
|
||||
be16enc(&nvote, *((uint16_t*)&work->data[20]));
|
||||
break;
|
||||
case ALGO_LBRY:
|
||||
check_dups = true;
|
||||
le32enc(&ntime, work->data[25]);
|
||||
le32enc(&nonce, work->data[27]);
|
||||
break;
|
||||
case ALGO_ZR5:
|
||||
check_dups = true;
|
||||
be32enc(&ntime, work->data[17]);
|
||||
@ -1296,6 +1303,8 @@ bool get_work(struct thr_info *thr, struct work *work)
|
||||
memset(work->data + 19, 0x00, 52);
|
||||
if (opt_algo == ALGO_DECRED) {
|
||||
memset(&work->data[35], 0x00, 52);
|
||||
} else if (opt_algo == ALGO_LBRY) {
|
||||
work->data[28] = 0x80000000;
|
||||
} else {
|
||||
work->data[20] = 0x80000000;
|
||||
work->data[31] = 0x00000280;
|
||||
@ -1441,6 +1450,14 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
|
||||
work->data[37] = (rand()*4) << 8; // random work data
|
||||
sctx->job.height = work->data[32];
|
||||
//applog_hex(work->data, 180);
|
||||
} else if (opt_algo == ALGO_LBRY) {
|
||||
for (i = 0; i < 8; i++)
|
||||
work->data[9 + i] = be32dec((uint32_t *)merkle_root + i);
|
||||
for (i = 0; i < 8; i++)
|
||||
work->data[17 + i] = ((uint32_t*)sctx->job.claim)[i];
|
||||
work->data[25] = le32dec(sctx->job.ntime);
|
||||
work->data[26] = le32dec(sctx->job.nbits);
|
||||
work->data[28] = 0x80000000;
|
||||
} else {
|
||||
for (i = 0; i < 8; i++)
|
||||
work->data[9 + i] = be32dec((uint32_t *)merkle_root + i);
|
||||
@ -1498,6 +1515,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
|
||||
case ALGO_FRESH:
|
||||
case ALGO_FUGUE256:
|
||||
case ALGO_GROESTL:
|
||||
case ALGO_LBRY:
|
||||
case ALGO_LYRA2v2:
|
||||
work_set_target(work, sctx->job.diff / (256.0 * opt_difficulty));
|
||||
break;
|
||||
@ -1658,6 +1676,7 @@ static void *miner_thread(void *userdata)
|
||||
|
||||
// &work.data[19]
|
||||
int wcmplen = (opt_algo == ALGO_DECRED) ? 140 : 76;
|
||||
if (opt_algo == ALGO_LBRY) wcmplen = 108;
|
||||
int wcmpoft = 0;
|
||||
uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
|
||||
|
||||
@ -1910,6 +1929,7 @@ static void *miner_thread(void *userdata)
|
||||
minmax = 0x40000000U;
|
||||
break;
|
||||
case ALGO_KECCAK:
|
||||
case ALGO_LBRY:
|
||||
case ALGO_LUFFA:
|
||||
case ALGO_SKEIN:
|
||||
case ALGO_SKEIN2:
|
||||
@ -2035,6 +2055,9 @@ static void *miner_thread(void *userdata)
|
||||
case ALGO_JACKPOT:
|
||||
rc = scanhash_jackpot(thr_id, &work, max_nonce, &hashes_done);
|
||||
break;
|
||||
case ALGO_LBRY:
|
||||
rc = scanhash_lbry(thr_id, &work, max_nonce, &hashes_done);
|
||||
break;
|
||||
case ALGO_LUFFA:
|
||||
rc = scanhash_luffa(thr_id, &work, max_nonce, &hashes_done);
|
||||
break;
|
||||
@ -2130,7 +2153,7 @@ static void *miner_thread(void *userdata)
|
||||
|
||||
// todo: update all algos to use work->nonces
|
||||
work.nonces[0] = nonceptr[0];
|
||||
if (opt_algo != ALGO_DECRED && opt_algo != ALGO_BLAKE2S) {
|
||||
if (opt_algo != ALGO_DECRED && opt_algo != ALGO_BLAKE2S && opt_algo != ALGO_LBRY) {
|
||||
work.nonces[1] = nonceptr[2];
|
||||
}
|
||||
|
||||
|
@ -115,7 +115,7 @@
|
||||
<MaxRegCount>80</MaxRegCount>
|
||||
<PtxAsOptionV>true</PtxAsOptionV>
|
||||
<Keep>true</Keep>
|
||||
<CodeGeneration>compute_50,sm_50</CodeGeneration>
|
||||
<CodeGeneration>compute_61,sm_61;compute_52,sm_52</CodeGeneration>
|
||||
<Include>$(NVTOOLSEXT_PATH)\include;..\..\..\Common\C99</Include>
|
||||
<TargetMachinePlatform>64</TargetMachinePlatform>
|
||||
</CudaCompile>
|
||||
@ -273,15 +273,11 @@
|
||||
<ClCompile Include="sph\fugue.c" />
|
||||
<ClCompile Include="sph\groestl.c" />
|
||||
<ClCompile Include="sph\haval.c" />
|
||||
<ClCompile Include="sph\haval_helper.c">
|
||||
<ExcludedFromBuild>true</ExcludedFromBuild>
|
||||
</ClCompile>
|
||||
<ClCompile Include="sph\jh.c" />
|
||||
<ClCompile Include="sph\keccak.c" />
|
||||
<ClCompile Include="sph\luffa.c" />
|
||||
<ClCompile Include="sph\md_helper.c">
|
||||
<ExcludedFromBuild>true</ExcludedFromBuild>
|
||||
</ClCompile>
|
||||
<ClCompile Include="sph\ripemd.c" />
|
||||
<ClCompile Include="sph\sph_sha2.c" />
|
||||
<ClCompile Include="sph\sha2.c" />
|
||||
<ClCompile Include="sph\sha2big.c" />
|
||||
<ClCompile Include="sph\shabal.c" />
|
||||
@ -428,6 +424,10 @@
|
||||
<MaxRegCount>92</MaxRegCount>
|
||||
</CudaCompile>
|
||||
<CudaCompile Include="Algo256\cuda_skein256.cu" />
|
||||
<CudaCompile Include="lbry\cuda_ripemd160.cu" />
|
||||
<CudaCompile Include="lbry\cuda_sha256_lbry.cu" />
|
||||
<CudaCompile Include="lbry\cuda_sha512_lbry.cu" />
|
||||
<CudaCompile Include="lbry\lbry.cu" />
|
||||
<CudaCompile Include="pentablake.cu">
|
||||
<MaxRegCount>80</MaxRegCount>
|
||||
<AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options="-dlcm=cg" %(AdditionalOptions)</AdditionalOptions>
|
||||
|
@ -82,6 +82,9 @@
|
||||
<Filter Include="Source Files\CUDA\lyra2">
|
||||
<UniqueIdentifier>{1613763f-895c-4321-b58b-6f5849868956}</UniqueIdentifier>
|
||||
</Filter>
|
||||
<Filter Include="Source Files\CUDA\lbry">
|
||||
<UniqueIdentifier>{3079ea1f-f768-455a-acd6-f517fac535b4}</UniqueIdentifier>
|
||||
</Filter>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<ClCompile Include="compat\jansson\dump.c">
|
||||
@ -150,9 +153,6 @@
|
||||
<ClCompile Include="sph\luffa.c">
|
||||
<Filter>Source Files\sph</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="sph\md_helper.c">
|
||||
<Filter>Source Files\sph</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="sph\sha2.c">
|
||||
<Filter>Source Files\sph</Filter>
|
||||
</ClCompile>
|
||||
@ -180,7 +180,10 @@
|
||||
<ClCompile Include="sph\haval.c">
|
||||
<Filter>Source Files\sph</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="sph\haval_helper.c">
|
||||
<ClCompile Include="sph\ripemd.c">
|
||||
<Filter>Source Files\sph</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="sph\sph_sha2.c">
|
||||
<Filter>Source Files\sph</Filter>
|
||||
</ClCompile>
|
||||
<ClCompile Include="sph\sha2big.c">
|
||||
@ -721,6 +724,18 @@
|
||||
<CudaCompile Include="Algo256\blake2s.cu">
|
||||
<Filter>Source Files\CUDA\Algo256</Filter>
|
||||
</CudaCompile>
|
||||
<CudaCompile Include="lbry\cuda_sha256_lbry.cu">
|
||||
<Filter>Source Files\CUDA\lbry</Filter>
|
||||
</CudaCompile>
|
||||
<CudaCompile Include="lbry\cuda_sha512_lbry.cu">
|
||||
<Filter>Source Files\CUDA\lbry</Filter>
|
||||
</CudaCompile>
|
||||
<CudaCompile Include="lbry\cuda_ripemd160.cu">
|
||||
<Filter>Source Files\CUDA\lbry</Filter>
|
||||
</CudaCompile>
|
||||
<CudaCompile Include="lbry\lbry.cu">
|
||||
<Filter>Source Files\CUDA\lbry</Filter>
|
||||
</CudaCompile>
|
||||
</ItemGroup>
|
||||
<ItemGroup>
|
||||
<Image Include="res\ccminer.ico">
|
||||
|
441
lbry/cuda_ripemd160.cu
Normal file
441
lbry/cuda_ripemd160.cu
Normal file
@ -0,0 +1,441 @@
|
||||
/*
|
||||
* ripemd-160 kernel implementation.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2014, 2016 djm34, tpruvot
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <memory.h>
|
||||
|
||||
#include <cuda_helper.h>
|
||||
|
||||
static __constant__ uint32_t c_IV[5] = {
|
||||
0x67452301u, 0xEFCDAB89u, 0x98BADCFEu, 0x10325476u, 0xC3D2E1F0u
|
||||
};
|
||||
|
||||
//__host__
|
||||
//uint64_t xornot64(uint64_t a, uint64_t b, uint64_t c) {
|
||||
// return c ^ (a | !b);
|
||||
//}
|
||||
|
||||
__forceinline__ __device__
|
||||
uint64_t xornot64(uint64_t a, uint64_t b, uint64_t c)
|
||||
{
|
||||
uint64_t result;
|
||||
asm("{ .reg .u64 m,n; // xornot64\n\t"
|
||||
"not.b64 m,%2; \n\t"
|
||||
"or.b64 n, %1,m;\n\t"
|
||||
"xor.b64 %0, n,%3;\n\t"
|
||||
"}\n\t"
|
||||
: "=l"(result) : "l"(a), "l"(b), "l"(c));
|
||||
return result;
|
||||
}
|
||||
|
||||
//__host__
|
||||
//uint64_t xornt64(uint64_t a, uint64_t b, uint64_t c) {
|
||||
// return a ^ (b | !c);
|
||||
//}
|
||||
|
||||
__device__ __forceinline__
|
||||
uint64_t xornt64(uint64_t a, uint64_t b, uint64_t c)
|
||||
{
|
||||
uint64_t result;
|
||||
asm("{ .reg .u64 m,n; // xornt64\n\t"
|
||||
"not.b64 m,%3; \n\t"
|
||||
"or.b64 n, %2,m;\n\t"
|
||||
"xor.b64 %0, %1,n;\n\t"
|
||||
"}\n\t"
|
||||
: "=l"(result) : "l"(a), "l"(b), "l"(c));
|
||||
return result;
|
||||
}
|
||||
|
||||
/*
|
||||
* Round functions for RIPEMD-128 and RIPEMD-160.
|
||||
*/
|
||||
#if 1
|
||||
#define F1(x, y, z) ((x) ^ (y) ^ (z))
|
||||
#define F2(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
|
||||
#define F3(x, y, z) (((x) | ~(y)) ^ (z))
|
||||
#define F4(x, y, z) ((((x) ^ (y)) & (z)) ^ (y))
|
||||
#define F5(x, y, z) ((x) ^ ((y) | ~(z)))
|
||||
#else
|
||||
#define F1(x, y, z) xor3(x,y,z)
|
||||
#define F2(x, y, z) xandx(x,y,z)
|
||||
#define F3(x, y, z) xornot64(x,y,z)
|
||||
#define F4(x, y, z) xandx(z,x,y)
|
||||
#define F5(x, y, z) xornt64(x,y,z)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Round constants for RIPEMD-160.
|
||||
*/
|
||||
#define K11 0x00000000u
|
||||
#define K12 0x5A827999u
|
||||
#define K13 0x6ED9EBA1u
|
||||
#define K14 0x8F1BBCDCu
|
||||
#define K15 0xA953FD4Eu
|
||||
|
||||
#define K21 0x50A28BE6u
|
||||
#define K22 0x5C4DD124u
|
||||
#define K23 0x6D703EF3u
|
||||
#define K24 0x7A6D76E9u
|
||||
#define K25 0x00000000u
|
||||
|
||||
#define RR(a, b, c, d, e, f, s, r, k) { \
|
||||
a = SPH_T32(ROTL32(SPH_T32(a + f(b, c, d) + r + k), s) + e); \
|
||||
c = ROTL32(c, 10); \
|
||||
}
|
||||
|
||||
#define ROUND1(a, b, c, d, e, f, s, r, k) \
|
||||
RR(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k)
|
||||
|
||||
#define ROUND2(a, b, c, d, e, f, s, r, k) \
|
||||
RR(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
|
||||
|
||||
#define RIPEMD160_ROUND_BODY(in, h) { \
|
||||
uint32_t A1, B1, C1, D1, E1; \
|
||||
uint32_t A2, B2, C2, D2, E2; \
|
||||
uint32_t tmp; \
|
||||
\
|
||||
A1 = A2 = h[0]; \
|
||||
B1 = B2 = h[1]; \
|
||||
C1 = C2 = h[2]; \
|
||||
D1 = D2 = h[3]; \
|
||||
E1 = E2 = h[4]; \
|
||||
\
|
||||
ROUND1(A, B, C, D, E, F1, 11, in[ 0], 1); \
|
||||
ROUND1(E, A, B, C, D, F1, 14, in[ 1], 1); \
|
||||
ROUND1(D, E, A, B, C, F1, 15, in[ 2], 1); \
|
||||
ROUND1(C, D, E, A, B, F1, 12, in[ 3], 1); \
|
||||
ROUND1(B, C, D, E, A, F1, 5, in[ 4], 1); \
|
||||
ROUND1(A, B, C, D, E, F1, 8, in[ 5], 1); \
|
||||
ROUND1(E, A, B, C, D, F1, 7, in[ 6], 1); \
|
||||
ROUND1(D, E, A, B, C, F1, 9, in[ 7], 1); \
|
||||
ROUND1(C, D, E, A, B, F1, 11, in[ 8], 1); \
|
||||
ROUND1(B, C, D, E, A, F1, 13, in[ 9], 1); \
|
||||
ROUND1(A, B, C, D, E, F1, 14, in[10], 1); \
|
||||
ROUND1(E, A, B, C, D, F1, 15, in[11], 1); \
|
||||
ROUND1(D, E, A, B, C, F1, 6, in[12], 1); \
|
||||
ROUND1(C, D, E, A, B, F1, 7, in[13], 1); \
|
||||
ROUND1(B, C, D, E, A, F1, 9, in[14], 1); \
|
||||
ROUND1(A, B, C, D, E, F1, 8, in[15], 1); \
|
||||
\
|
||||
ROUND1(E, A, B, C, D, F2, 7, in[ 7], 2); \
|
||||
ROUND1(D, E, A, B, C, F2, 6, in[ 4], 2); \
|
||||
ROUND1(C, D, E, A, B, F2, 8, in[13], 2); \
|
||||
ROUND1(B, C, D, E, A, F2, 13, in[ 1], 2); \
|
||||
ROUND1(A, B, C, D, E, F2, 11, in[10], 2); \
|
||||
ROUND1(E, A, B, C, D, F2, 9, in[ 6], 2); \
|
||||
ROUND1(D, E, A, B, C, F2, 7, in[15], 2); \
|
||||
ROUND1(C, D, E, A, B, F2, 15, in[ 3], 2); \
|
||||
ROUND1(B, C, D, E, A, F2, 7, in[12], 2); \
|
||||
ROUND1(A, B, C, D, E, F2, 12, in[ 0], 2); \
|
||||
ROUND1(E, A, B, C, D, F2, 15, in[ 9], 2); \
|
||||
ROUND1(D, E, A, B, C, F2, 9, in[ 5], 2); \
|
||||
ROUND1(C, D, E, A, B, F2, 11, in[ 2], 2); \
|
||||
ROUND1(B, C, D, E, A, F2, 7, in[14], 2); \
|
||||
ROUND1(A, B, C, D, E, F2, 13, in[11], 2); \
|
||||
ROUND1(E, A, B, C, D, F2, 12, in[ 8], 2); \
|
||||
\
|
||||
ROUND1(D, E, A, B, C, F3, 11, in[ 3], 3); \
|
||||
ROUND1(C, D, E, A, B, F3, 13, in[10], 3); \
|
||||
ROUND1(B, C, D, E, A, F3, 6, in[14], 3); \
|
||||
ROUND1(A, B, C, D, E, F3, 7, in[ 4], 3); \
|
||||
ROUND1(E, A, B, C, D, F3, 14, in[ 9], 3); \
|
||||
ROUND1(D, E, A, B, C, F3, 9, in[15], 3); \
|
||||
ROUND1(C, D, E, A, B, F3, 13, in[ 8], 3); \
|
||||
ROUND1(B, C, D, E, A, F3, 15, in[ 1], 3); \
|
||||
ROUND1(A, B, C, D, E, F3, 14, in[ 2], 3); \
|
||||
ROUND1(E, A, B, C, D, F3, 8, in[ 7], 3); \
|
||||
ROUND1(D, E, A, B, C, F3, 13, in[ 0], 3); \
|
||||
ROUND1(C, D, E, A, B, F3, 6, in[ 6], 3); \
|
||||
ROUND1(B, C, D, E, A, F3, 5, in[13], 3); \
|
||||
ROUND1(A, B, C, D, E, F3, 12, in[11], 3); \
|
||||
ROUND1(E, A, B, C, D, F3, 7, in[ 5], 3); \
|
||||
ROUND1(D, E, A, B, C, F3, 5, in[12], 3); \
|
||||
\
|
||||
ROUND1(C, D, E, A, B, F4, 11, in[ 1], 4); \
|
||||
ROUND1(B, C, D, E, A, F4, 12, in[ 9], 4); \
|
||||
ROUND1(A, B, C, D, E, F4, 14, in[11], 4); \
|
||||
ROUND1(E, A, B, C, D, F4, 15, in[10], 4); \
|
||||
ROUND1(D, E, A, B, C, F4, 14, in[ 0], 4); \
|
||||
ROUND1(C, D, E, A, B, F4, 15, in[ 8], 4); \
|
||||
ROUND1(B, C, D, E, A, F4, 9, in[12], 4); \
|
||||
ROUND1(A, B, C, D, E, F4, 8, in[ 4], 4); \
|
||||
ROUND1(E, A, B, C, D, F4, 9, in[13], 4); \
|
||||
ROUND1(D, E, A, B, C, F4, 14, in[ 3], 4); \
|
||||
ROUND1(C, D, E, A, B, F4, 5, in[ 7], 4); \
|
||||
ROUND1(B, C, D, E, A, F4, 6, in[15], 4); \
|
||||
ROUND1(A, B, C, D, E, F4, 8, in[14], 4); \
|
||||
ROUND1(E, A, B, C, D, F4, 6, in[ 5], 4); \
|
||||
ROUND1(D, E, A, B, C, F4, 5, in[ 6], 4); \
|
||||
ROUND1(C, D, E, A, B, F4, 12, in[ 2], 4); \
|
||||
\
|
||||
ROUND1(B, C, D, E, A, F5, 9, in[ 4], 5); \
|
||||
ROUND1(A, B, C, D, E, F5, 15, in[ 0], 5); \
|
||||
ROUND1(E, A, B, C, D, F5, 5, in[ 5], 5); \
|
||||
ROUND1(D, E, A, B, C, F5, 11, in[ 9], 5); \
|
||||
ROUND1(C, D, E, A, B, F5, 6, in[ 7], 5); \
|
||||
ROUND1(B, C, D, E, A, F5, 8, in[12], 5); \
|
||||
ROUND1(A, B, C, D, E, F5, 13, in[ 2], 5); \
|
||||
ROUND1(E, A, B, C, D, F5, 12, in[10], 5); \
|
||||
ROUND1(D, E, A, B, C, F5, 5, in[14], 5); \
|
||||
ROUND1(C, D, E, A, B, F5, 12, in[ 1], 5); \
|
||||
ROUND1(B, C, D, E, A, F5, 13, in[ 3], 5); \
|
||||
ROUND1(A, B, C, D, E, F5, 14, in[ 8], 5); \
|
||||
ROUND1(E, A, B, C, D, F5, 11, in[11], 5); \
|
||||
ROUND1(D, E, A, B, C, F5, 8, in[ 6], 5); \
|
||||
ROUND1(C, D, E, A, B, F5, 5, in[15], 5); \
|
||||
ROUND1(B, C, D, E, A, F5, 6, in[13], 5); \
|
||||
\
|
||||
ROUND2(A, B, C, D, E, F5, 8, in[ 5], 1); \
|
||||
ROUND2(E, A, B, C, D, F5, 9, in[14], 1); \
|
||||
ROUND2(D, E, A, B, C, F5, 9, in[ 7], 1); \
|
||||
ROUND2(C, D, E, A, B, F5, 11, in[ 0], 1); \
|
||||
ROUND2(B, C, D, E, A, F5, 13, in[ 9], 1); \
|
||||
ROUND2(A, B, C, D, E, F5, 15, in[ 2], 1); \
|
||||
ROUND2(E, A, B, C, D, F5, 15, in[11], 1); \
|
||||
ROUND2(D, E, A, B, C, F5, 5, in[ 4], 1); \
|
||||
ROUND2(C, D, E, A, B, F5, 7, in[13], 1); \
|
||||
ROUND2(B, C, D, E, A, F5, 7, in[ 6], 1); \
|
||||
ROUND2(A, B, C, D, E, F5, 8, in[15], 1); \
|
||||
ROUND2(E, A, B, C, D, F5, 11, in[ 8], 1); \
|
||||
ROUND2(D, E, A, B, C, F5, 14, in[ 1], 1); \
|
||||
ROUND2(C, D, E, A, B, F5, 14, in[10], 1); \
|
||||
ROUND2(B, C, D, E, A, F5, 12, in[ 3], 1); \
|
||||
ROUND2(A, B, C, D, E, F5, 6, in[12], 1); \
|
||||
\
|
||||
ROUND2(E, A, B, C, D, F4, 9, in[ 6], 2); \
|
||||
ROUND2(D, E, A, B, C, F4, 13, in[11], 2); \
|
||||
ROUND2(C, D, E, A, B, F4, 15, in[ 3], 2); \
|
||||
ROUND2(B, C, D, E, A, F4, 7, in[ 7], 2); \
|
||||
ROUND2(A, B, C, D, E, F4, 12, in[ 0], 2); \
|
||||
ROUND2(E, A, B, C, D, F4, 8, in[13], 2); \
|
||||
ROUND2(D, E, A, B, C, F4, 9, in[ 5], 2); \
|
||||
ROUND2(C, D, E, A, B, F4, 11, in[10], 2); \
|
||||
ROUND2(B, C, D, E, A, F4, 7, in[14], 2); \
|
||||
ROUND2(A, B, C, D, E, F4, 7, in[15], 2); \
|
||||
ROUND2(E, A, B, C, D, F4, 12, in[ 8], 2); \
|
||||
ROUND2(D, E, A, B, C, F4, 7, in[12], 2); \
|
||||
ROUND2(C, D, E, A, B, F4, 6, in[ 4], 2); \
|
||||
ROUND2(B, C, D, E, A, F4, 15, in[ 9], 2); \
|
||||
ROUND2(A, B, C, D, E, F4, 13, in[ 1], 2); \
|
||||
ROUND2(E, A, B, C, D, F4, 11, in[ 2], 2); \
|
||||
\
|
||||
ROUND2(D, E, A, B, C, F3, 9, in[15], 3); \
|
||||
ROUND2(C, D, E, A, B, F3, 7, in[ 5], 3); \
|
||||
ROUND2(B, C, D, E, A, F3, 15, in[ 1], 3); \
|
||||
ROUND2(A, B, C, D, E, F3, 11, in[ 3], 3); \
|
||||
ROUND2(E, A, B, C, D, F3, 8, in[ 7], 3); \
|
||||
ROUND2(D, E, A, B, C, F3, 6, in[14], 3); \
|
||||
ROUND2(C, D, E, A, B, F3, 6, in[ 6], 3); \
|
||||
ROUND2(B, C, D, E, A, F3, 14, in[ 9], 3); \
|
||||
ROUND2(A, B, C, D, E, F3, 12, in[11], 3); \
|
||||
ROUND2(E, A, B, C, D, F3, 13, in[ 8], 3); \
|
||||
ROUND2(D, E, A, B, C, F3, 5, in[12], 3); \
|
||||
ROUND2(C, D, E, A, B, F3, 14, in[ 2], 3); \
|
||||
ROUND2(B, C, D, E, A, F3, 13, in[10], 3); \
|
||||
ROUND2(A, B, C, D, E, F3, 13, in[ 0], 3); \
|
||||
ROUND2(E, A, B, C, D, F3, 7, in[ 4], 3); \
|
||||
ROUND2(D, E, A, B, C, F3, 5, in[13], 3); \
|
||||
\
|
||||
ROUND2(C, D, E, A, B, F2, 15, in[ 8], 4); \
|
||||
ROUND2(B, C, D, E, A, F2, 5, in[ 6], 4); \
|
||||
ROUND2(A, B, C, D, E, F2, 8, in[ 4], 4); \
|
||||
ROUND2(E, A, B, C, D, F2, 11, in[ 1], 4); \
|
||||
ROUND2(D, E, A, B, C, F2, 14, in[ 3], 4); \
|
||||
ROUND2(C, D, E, A, B, F2, 14, in[11], 4); \
|
||||
ROUND2(B, C, D, E, A, F2, 6, in[15], 4); \
|
||||
ROUND2(A, B, C, D, E, F2, 14, in[ 0], 4); \
|
||||
ROUND2(E, A, B, C, D, F2, 6, in[ 5], 4); \
|
||||
ROUND2(D, E, A, B, C, F2, 9, in[12], 4); \
|
||||
ROUND2(C, D, E, A, B, F2, 12, in[ 2], 4); \
|
||||
ROUND2(B, C, D, E, A, F2, 9, in[13], 4); \
|
||||
ROUND2(A, B, C, D, E, F2, 12, in[ 9], 4); \
|
||||
ROUND2(E, A, B, C, D, F2, 5, in[ 7], 4); \
|
||||
ROUND2(D, E, A, B, C, F2, 15, in[10], 4); \
|
||||
ROUND2(C, D, E, A, B, F2, 8, in[14], 4); \
|
||||
\
|
||||
ROUND2(B, C, D, E, A, F1, 8, in[12], 5); \
|
||||
ROUND2(A, B, C, D, E, F1, 5, in[15], 5); \
|
||||
ROUND2(E, A, B, C, D, F1, 12, in[10], 5); \
|
||||
ROUND2(D, E, A, B, C, F1, 9, in[ 4], 5); \
|
||||
ROUND2(C, D, E, A, B, F1, 12, in[ 1], 5); \
|
||||
ROUND2(B, C, D, E, A, F1, 5, in[ 5], 5); \
|
||||
ROUND2(A, B, C, D, E, F1, 14, in[ 8], 5); \
|
||||
ROUND2(E, A, B, C, D, F1, 6, in[ 7], 5); \
|
||||
ROUND2(D, E, A, B, C, F1, 8, in[ 6], 5); \
|
||||
ROUND2(C, D, E, A, B, F1, 13, in[ 2], 5); \
|
||||
ROUND2(B, C, D, E, A, F1, 6, in[13], 5); \
|
||||
ROUND2(A, B, C, D, E, F1, 5, in[14], 5); \
|
||||
ROUND2(E, A, B, C, D, F1, 15, in[ 0], 5); \
|
||||
ROUND2(D, E, A, B, C, F1, 13, in[ 3], 5); \
|
||||
ROUND2(C, D, E, A, B, F1, 11, in[ 9], 5); \
|
||||
ROUND2(B, C, D, E, A, F1, 11, in[11], 5); \
|
||||
\
|
||||
tmp = (h[1] + C1 + D2); \
|
||||
h[1] = (h[2] + D1 + E2); \
|
||||
h[2] = (h[3] + E1 + A2); \
|
||||
h[3] = (h[4] + A1 + B2); \
|
||||
h[4] = (h[0] + B1 + C2); \
|
||||
h[0] = tmp; \
|
||||
}
|
||||
|
||||
#if 0
|
||||
__global__
|
||||
void lbry_ripemd160_gpu_hash_32(const uint32_t threads, uint64_t *g_hash, const uint32_t byteOffset)
|
||||
{
|
||||
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
if (thread < threads)
|
||||
{
|
||||
uint32_t *hash = (uint32_t*) (&g_hash[thread * 8U + byteOffset/8]);
|
||||
|
||||
uint32_t in[16];
|
||||
for (int i=0; i<8; i++)
|
||||
in[i] = (hash[i]);
|
||||
in[8] = 0x80;
|
||||
|
||||
#pragma unroll
|
||||
for (int i=9;i<16;i++) in[i] = 0;
|
||||
|
||||
in[14] = 0x100; // size in bits
|
||||
|
||||
uint32_t h[5];
|
||||
#pragma unroll
|
||||
for (int i=0; i<5; i++)
|
||||
h[i] = c_IV[i];
|
||||
|
||||
RIPEMD160_ROUND_BODY(in, h);
|
||||
|
||||
#pragma unroll
|
||||
for (int i=0; i<5; i++)
|
||||
hash[i] = h[i];
|
||||
|
||||
#ifdef PAD_ZEROS
|
||||
// 20 bytes hash on 32 or 64 bytes output space
|
||||
hash[5] = 0;
|
||||
hash[6] = 0;
|
||||
hash[7] = 0;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
__host__
|
||||
void lbry_ripemd160_hash_32(int thr_id, uint32_t threads, uint32_t *g_Hash, uint32_t byteOffset, cudaStream_t stream)
|
||||
{
|
||||
const uint32_t threadsperblock = 128;
|
||||
|
||||
dim3 grid(threads/threadsperblock);
|
||||
dim3 block(threadsperblock);
|
||||
|
||||
lbry_ripemd160_gpu_hash_32 <<<grid, block, 0, stream>>> (threads, (uint64_t*) g_Hash, byteOffset);
|
||||
}
|
||||
#endif
|
||||
|
||||
__global__
|
||||
//__launch_bounds__(256,6)
|
||||
void lbry_ripemd160_gpu_hash_32x2(const uint32_t threads, uint64_t *g_hash)
|
||||
{
|
||||
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
|
||||
if (thread < threads)
|
||||
{
|
||||
uint32_t *hash = (uint32_t*) (&g_hash[thread * 8U]);
|
||||
|
||||
uint32_t in[16];
|
||||
for (int i=0; i<8; i++)
|
||||
in[i] = (hash[i]);
|
||||
in[8] = 0x80;
|
||||
|
||||
#pragma unroll
|
||||
for (int i=9;i<16;i++) in[i] = 0;
|
||||
|
||||
in[14] = 0x100; // size in bits
|
||||
|
||||
uint32_t h[5];
|
||||
#pragma unroll
|
||||
for (int i=0; i<5; i++)
|
||||
h[i] = c_IV[i];
|
||||
|
||||
RIPEMD160_ROUND_BODY(in, h);
|
||||
|
||||
#pragma unroll
|
||||
for (int i=0; i<5; i++)
|
||||
hash[i] = h[i];
|
||||
|
||||
#ifdef PAD_ZEROS
|
||||
// 20 bytes hash on 32 output space
|
||||
hash[5] = 0;
|
||||
hash[6] = 0;
|
||||
hash[7] = 0;
|
||||
#endif
|
||||
// second 32 bytes block hash
|
||||
hash += 8;
|
||||
|
||||
#pragma unroll
|
||||
for (int i=0; i<8; i++)
|
||||
in[i] = (hash[i]);
|
||||
in[8] = 0x80;
|
||||
|
||||
#pragma unroll
|
||||
for (int i=9;i<16;i++) in[i] = 0;
|
||||
|
||||
in[14] = 0x100; // size in bits
|
||||
|
||||
#pragma unroll
|
||||
for (int i=0; i<5; i++)
|
||||
h[i] = c_IV[i];
|
||||
|
||||
RIPEMD160_ROUND_BODY(in, h);
|
||||
|
||||
#pragma unroll
|
||||
for (int i=0; i<5; i++)
|
||||
hash[i] = h[i];
|
||||
|
||||
#ifdef PAD_ZEROS
|
||||
// 20 bytes hash on 32 output space
|
||||
hash[5] = 0;
|
||||
hash[6] = 0;
|
||||
hash[7] = 0;
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
__host__
|
||||
void lbry_ripemd160_hash_32x2(int thr_id, uint32_t threads, uint32_t *g_Hash, cudaStream_t stream)
|
||||
{
|
||||
const uint32_t threadsperblock = 128;
|
||||
|
||||
dim3 grid(threads/threadsperblock);
|
||||
dim3 block(threadsperblock);
|
||||
|
||||
lbry_ripemd160_gpu_hash_32x2 <<<grid, block, 0, stream>>> (threads, (uint64_t*) g_Hash);
|
||||
}
|
||||
|
||||
void lbry_ripemd160_init(int thr_id)
|
||||
{
|
||||
//cudaMemcpyToSymbol(c_IV, IV, sizeof(IV), 0, cudaMemcpyHostToDevice);
|
||||
}
|
712
lbry/cuda_sha256_lbry.cu
Normal file
712
lbry/cuda_sha256_lbry.cu
Normal file
@ -0,0 +1,712 @@
|
||||
/*
|
||||
* sha256 CUDA implementation.
|
||||
*/
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <memory.h>
|
||||
|
||||
#include <cuda_helper.h>
|
||||
#include <miner.h>
|
||||
|
||||
__constant__ static uint32_t __align__(8) c_midstate112[8];
|
||||
__constant__ static uint32_t __align__(8) c_dataEnd112[12];
|
||||
|
||||
const __constant__ uint32_t __align__(8) c_H256[8] = {
|
||||
0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU,
|
||||
0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U
|
||||
};
|
||||
__constant__ static uint32_t __align__(8) c_K[64];
|
||||
|
||||
static __thread uint32_t* d_resNonces;
|
||||
__constant__ static uint32_t __align__(8) c_target[2];
|
||||
__device__ uint64_t d_target[1];
|
||||
|
||||
// ------------------------------------------------------------------------------------------------
|
||||
|
||||
static const uint32_t cpu_H256[8] = {
|
||||
0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU,
|
||||
0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U
|
||||
};
|
||||
|
||||
static const uint32_t cpu_K[64] = {
|
||||
0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
|
||||
0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
|
||||
0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
|
||||
0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
|
||||
0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
|
||||
0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
|
||||
0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
|
||||
0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
|
||||
};
|
||||
|
||||
#define ROTR ROTR32
|
||||
|
||||
__host__
|
||||
static void sha256_step1_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d,
|
||||
uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
|
||||
uint32_t in, const uint32_t Kshared)
|
||||
{
|
||||
uint32_t t1,t2;
|
||||
uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
|
||||
uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
|
||||
uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a);
|
||||
uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c);
|
||||
|
||||
t1 = h + bsg21 + vxandx + Kshared + in;
|
||||
t2 = bsg20 + andorv;
|
||||
d = d + t1;
|
||||
h = t1 + t2;
|
||||
}
|
||||
|
||||
__host__
|
||||
static void sha256_step2_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d,
|
||||
uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
|
||||
uint32_t* in, uint32_t pc, const uint32_t Kshared)
|
||||
{
|
||||
uint32_t t1,t2;
|
||||
|
||||
int pcidx1 = (pc-2) & 0xF;
|
||||
int pcidx2 = (pc-7) & 0xF;
|
||||
int pcidx3 = (pc-15) & 0xF;
|
||||
|
||||
uint32_t inx0 = in[pc];
|
||||
uint32_t inx1 = in[pcidx1];
|
||||
uint32_t inx2 = in[pcidx2];
|
||||
uint32_t inx3 = in[pcidx3];
|
||||
|
||||
uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1);
|
||||
uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3);
|
||||
uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
|
||||
uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
|
||||
uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a);
|
||||
uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c);
|
||||
|
||||
in[pc] = ssg21 + inx2 + ssg20 + inx0;
|
||||
|
||||
t1 = h + bsg21 + vxandx + Kshared + in[pc];
|
||||
t2 = bsg20 + andorv;
|
||||
d = d + t1;
|
||||
h = t1 + t2;
|
||||
}
|
||||
|
||||
__host__
// Full 64-round SHA-256 compression of one 64-byte block on the host.
// `in` is the 16-word message block (modified in place by the schedule),
// `state` is the 8-word chaining value (updated in place), `Kshared` the
// 64 round constants. Working variables are rotated by argument position
// instead of being shuffled, hence the 16 staggered step calls.
static void sha256_round_body_host(uint32_t* in, uint32_t* state, const uint32_t* Kshared)
{
	uint32_t a = state[0];
	uint32_t b = state[1];
	uint32_t c = state[2];
	uint32_t d = state[3];
	uint32_t e = state[4];
	uint32_t f = state[5];
	uint32_t g = state[6];
	uint32_t h = state[7];

	sha256_step1_host(a,b,c,d,e,f,g,h,in[0], Kshared[0]);
	sha256_step1_host(h,a,b,c,d,e,f,g,in[1], Kshared[1]);
	sha256_step1_host(g,h,a,b,c,d,e,f,in[2], Kshared[2]);
	sha256_step1_host(f,g,h,a,b,c,d,e,in[3], Kshared[3]);
	sha256_step1_host(e,f,g,h,a,b,c,d,in[4], Kshared[4]);
	sha256_step1_host(d,e,f,g,h,a,b,c,in[5], Kshared[5]);
	sha256_step1_host(c,d,e,f,g,h,a,b,in[6], Kshared[6]);
	sha256_step1_host(b,c,d,e,f,g,h,a,in[7], Kshared[7]);
	sha256_step1_host(a,b,c,d,e,f,g,h,in[8], Kshared[8]);
	sha256_step1_host(h,a,b,c,d,e,f,g,in[9], Kshared[9]);
	sha256_step1_host(g,h,a,b,c,d,e,f,in[10],Kshared[10]);
	sha256_step1_host(f,g,h,a,b,c,d,e,in[11],Kshared[11]);
	sha256_step1_host(e,f,g,h,a,b,c,d,in[12],Kshared[12]);
	sha256_step1_host(d,e,f,g,h,a,b,c,in[13],Kshared[13]);
	sha256_step1_host(c,d,e,f,g,h,a,b,in[14],Kshared[14]);
	sha256_step1_host(b,c,d,e,f,g,h,a,in[15],Kshared[15]);

	// rounds 16..63, 16 rounds per iteration
	for (int i=0; i<3; i++)
	{
		sha256_step2_host(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]);
		sha256_step2_host(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]);
		sha256_step2_host(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]);
		sha256_step2_host(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]);
		sha256_step2_host(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]);
		sha256_step2_host(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]);
		sha256_step2_host(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]);
		sha256_step2_host(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]);
		sha256_step2_host(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]);
		sha256_step2_host(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]);
		sha256_step2_host(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]);
		sha256_step2_host(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]);
		sha256_step2_host(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]);
		sha256_step2_host(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]);
		sha256_step2_host(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]);
		sha256_step2_host(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]);
	}

	// Davies-Meyer feed-forward
	state[0] += a;
	state[1] += b;
	state[2] += c;
	state[3] += d;
	state[4] += e;
	state[5] += f;
	state[6] += g;
	state[7] += h;
}
|
||||
|
||||
__device__ __forceinline__
// Three-way XOR. On Maxwell+ with CUDA 7.5+, uses a single LOP3.B32
// instruction (immLut 0x96 encodes a^b^c); otherwise plain C fallback.
uint32_t xor3b(const uint32_t a, const uint32_t b, const uint32_t c) {
	uint32_t result;
#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050
	asm ("lop3.b32 %0, %1, %2, %3, 0x96; // xor3b" //0x96 = 0xF0 ^ 0xCC ^ 0xAA
		: "=r"(result) : "r"(a), "r"(b),"r"(c));
#else
	result = a^b^c;
#endif
	return result;
}
|
||||
|
||||
/*
|
||||
__device__ __forceinline__
|
||||
uint32_t xor3b(const uint32_t a, const uint32_t b, const uint32_t c) {
|
||||
uint32_t result;
|
||||
asm("{ .reg .u32 t1; // xor3b \n\t"
|
||||
"xor.b32 t1, %2, %3;\n\t"
|
||||
"xor.b32 %0, %1, t1;"
|
||||
"}"
|
||||
: "=r"(result) : "r"(a) ,"r"(b),"r"(c));
|
||||
return result;
|
||||
}
|
||||
#define xor3b(a,b,c) (a ^ b ^ c)
|
||||
*/
|
||||
|
||||
// SHA-256 Sigma0(x) = (x>>>2) ^ (x>>>13) ^ (x>>>22)
__device__ __forceinline__ uint32_t bsg2_0(const uint32_t x)
{
	return xor3b(ROTR32(x, 2), ROTR32(x, 13), ROTR32(x, 22));
}
|
||||
|
||||
// SHA-256 Sigma1(x) = (x>>>6) ^ (x>>>11) ^ (x>>>25)
__device__ __forceinline__ uint32_t bsg2_1(const uint32_t x)
{
	return xor3b(ROTR32(x, 6), ROTR32(x, 11), ROTR32(x, 25));
}
|
||||
|
||||
// SHA-256 sigma0(x) = (x>>>7) ^ (x>>>18) ^ (x>>3)
// Fix: the temporaries were uint64_t, wasting a register pair each and
// relying on implicit narrowing when passed to xor3b(uint32_t,...).
__device__ __forceinline__ uint32_t ssg2_0(const uint32_t x)
{
	uint32_t r1 = ROTR32(x,7);
	uint32_t r2 = ROTR32(x,18);
	uint32_t r3 = shr_t32(x,3);
	return xor3b(r1,r2,r3);
}
|
||||
|
||||
// SHA-256 sigma1(x) = (x>>>17) ^ (x>>>19) ^ (x>>10)
// Fix: the temporaries were uint64_t, wasting a register pair each and
// relying on implicit narrowing when passed to xor3b(uint32_t,...).
__device__ __forceinline__ uint32_t ssg2_1(const uint32_t x)
{
	uint32_t r1 = ROTR32(x,17);
	uint32_t r2 = ROTR32(x,19);
	uint32_t r3 = shr_t32(x,10);
	return xor3b(r1,r2,r3);
}
|
||||
|
||||
// SHA-256 Maj(a,b,c) computed as (a & b) | ((a | b) & c), spelled out in
// PTX to pin the 4-instruction sequence.
__device__ __forceinline__ uint32_t andor32(const uint32_t a, const uint32_t b, const uint32_t c)
{
	uint32_t result;
	asm("{\n\t"
		".reg .u32 m,n,o;\n\t"
		"and.b32 m, %1, %2;\n\t"   // m = a & b
		" or.b32 n, %1, %2;\n\t"   // n = a | b
		"and.b32 o, n, %3;\n\t"    // o = n & c
		" or.b32 %0, m, o ;\n\t"   // result = m | o
		"}\n\t" : "=r"(result) : "r"(a), "r"(b), "r"(c)
	);
	return result;
}
|
||||
|
||||
__device__
// One SHA-256 compression round (rounds 0..15) on the device; W[t] is
// passed directly as `in`. Updates d and h in place.
static void sha2_step1(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
	uint32_t in, const uint32_t Kshared)
{
	uint32_t t1,t2;
	uint32_t vxandx = xandx(e, f, g); // Ch(e,f,g)
	uint32_t bsg21 = bsg2_1(e);
	uint32_t bsg20 = bsg2_0(a);
	uint32_t andorv = andor32(a,b,c); // Maj(a,b,c)

	t1 = h + bsg21 + vxandx + Kshared + in;
	t2 = bsg20 + andorv;
	d = d + t1;
	h = t1 + t2;
}
|
||||
|
||||
__device__
// SHA-256 compression round for rounds 16..63 on the device. `in` is the
// 16-word schedule used as a circular buffer; the word at `pc` is extended
// in place before being consumed.
static void sha2_step2(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
	uint32_t* in, uint32_t pc, const uint32_t Kshared)
{
	uint32_t t1,t2;

	int pcidx1 = (pc-2) & 0xF;  // W[t-2]  (mod 16)
	int pcidx2 = (pc-7) & 0xF;  // W[t-7]  (mod 16)
	int pcidx3 = (pc-15) & 0xF; // W[t-15] (mod 16)

	uint32_t inx0 = in[pc];
	uint32_t inx1 = in[pcidx1];
	uint32_t inx2 = in[pcidx2];
	uint32_t inx3 = in[pcidx3];

	uint32_t ssg21 = ssg2_1(inx1);
	uint32_t ssg20 = ssg2_0(inx3);
	uint32_t vxandx = xandx(e, f, g); // Ch(e,f,g)
	uint32_t bsg21 = bsg2_1(e);
	uint32_t bsg20 = bsg2_0(a);
	uint32_t andorv = andor32(a,b,c); // Maj(a,b,c)

	// W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]
	in[pc] = ssg21 + inx2 + ssg20 + inx0;

	t1 = h + bsg21 + vxandx + Kshared + in[pc];
	t2 = bsg20 + andorv;
	d = d + t1;
	h = t1 + t2;
}
|
||||
|
||||
__device__
// Full 64-round SHA-256 compression of one 64-byte block on the device.
// Mirrors sha256_round_body_host: `in` is the 16-word message block
// (clobbered by the schedule), `state` the 8-word chaining value (updated
// in place), `Kshared` the round constants (constant or shared memory).
static void sha256_round_body(uint32_t* in, uint32_t* state, uint32_t* const Kshared)
{
	uint32_t a = state[0];
	uint32_t b = state[1];
	uint32_t c = state[2];
	uint32_t d = state[3];
	uint32_t e = state[4];
	uint32_t f = state[5];
	uint32_t g = state[6];
	uint32_t h = state[7];

	sha2_step1(a,b,c,d,e,f,g,h,in[0], Kshared[0]);
	sha2_step1(h,a,b,c,d,e,f,g,in[1], Kshared[1]);
	sha2_step1(g,h,a,b,c,d,e,f,in[2], Kshared[2]);
	sha2_step1(f,g,h,a,b,c,d,e,in[3], Kshared[3]);
	sha2_step1(e,f,g,h,a,b,c,d,in[4], Kshared[4]);
	sha2_step1(d,e,f,g,h,a,b,c,in[5], Kshared[5]);
	sha2_step1(c,d,e,f,g,h,a,b,in[6], Kshared[6]);
	sha2_step1(b,c,d,e,f,g,h,a,in[7], Kshared[7]);
	sha2_step1(a,b,c,d,e,f,g,h,in[8], Kshared[8]);
	sha2_step1(h,a,b,c,d,e,f,g,in[9], Kshared[9]);
	sha2_step1(g,h,a,b,c,d,e,f,in[10],Kshared[10]);
	sha2_step1(f,g,h,a,b,c,d,e,in[11],Kshared[11]);
	sha2_step1(e,f,g,h,a,b,c,d,in[12],Kshared[12]);
	sha2_step1(d,e,f,g,h,a,b,c,in[13],Kshared[13]);
	sha2_step1(c,d,e,f,g,h,a,b,in[14],Kshared[14]);
	sha2_step1(b,c,d,e,f,g,h,a,in[15],Kshared[15]);

	// rounds 16..63, 16 rounds per iteration
#pragma unroll
	for (int i=0; i<3; i++)
	{
		sha2_step2(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]);
		sha2_step2(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]);
		sha2_step2(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]);
		sha2_step2(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]);
		sha2_step2(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]);
		sha2_step2(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]);
		sha2_step2(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]);
		sha2_step2(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]);
		sha2_step2(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]);
		sha2_step2(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]);
		sha2_step2(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]);
		sha2_step2(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]);
		sha2_step2(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]);
		sha2_step2(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]);
		sha2_step2(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]);
		sha2_step2(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]);
	}

	// Davies-Meyer feed-forward
	state[0] += a;
	state[1] += b;
	state[2] += c;
	state[3] += d;
	state[4] += e;
	state[5] += f;
	state[6] += g;
	state[7] += h;
}
|
||||
|
||||
__device__
// Byte-swap each 32-bit half of a 64-bit word independently (this is NOT
// a full 64-bit bswap); used to endian-correct pairs of hash words.
uint64_t cuda_swab32ll(uint64_t x) {
	return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x)));
}
|
||||
|
||||
__global__
/*__launch_bounds__(256,3)*/
// First SHA-256 of the 112-byte lbry header, one nonce per thread.
// The first 64-byte block was compressed on the host into c_midstate112;
// only the 44-byte tail (c_dataEnd112 + nonce) plus padding is hashed here.
void lbry_sha256_gpu_hash_112(const uint32_t threads, const uint32_t startNonce, const bool swabNonce, uint64_t *outputHash)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		const uint32_t nonce = startNonce + thread;

		uint32_t dat[16];
		#pragma unroll
		for (int i=0;i<11;i++) dat[i] = c_dataEnd112[i]; // pre "swabed"
		dat[11] = swabNonce ? cuda_swab32(nonce) : nonce;
		dat[12] = 0x80000000; // SHA-256 padding bit
		dat[13] = 0;
		dat[14] = 0;
		dat[15] = 0x380; // total message length: 112 bytes = 0x380 bits

		uint32_t __align__(8) buf[8];
		#pragma unroll
		for (int i=0;i<8;i++) buf[i] = c_midstate112[i];

		sha256_round_body(dat, buf, c_K);

		// output
		uint2* output = (uint2*) (&outputHash[thread * 8U]);
		#pragma unroll
		for (int i=0;i<4;i++) {
			//output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
			output[i] = vectorize(((uint64_t*)buf)[i]); // out without swap, new sha256 after
		}
	}
}
|
||||
|
||||
__global__
/*__launch_bounds__(256,3)*/
// Single SHA-256 of a 32-byte input, read from and written back to the
// per-thread 64-byte slot of Hash512 (in place).
void lbry_sha256_gpu_hash_32(uint32_t threads, uint64_t *Hash512)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		uint32_t __align__(8) buf[8]; // align for vectorize
		#pragma unroll
		for (int i=0; i<8; i++) buf[i] = c_H256[i]; // fresh SHA-256 IV

		uint32_t* input = (uint32_t*) (&Hash512[thread * 8U]);

		uint32_t dat[16];
		#pragma unroll
		//for (int i=0;i<8;i++) dat[i] = cuda_swab32(input[i]);
		for (int i=0; i<8; i++) dat[i] = input[i];
		dat[8] = 0x80000000; // padding bit
		#pragma unroll
		for (int i=9; i<15; i++) dat[i] = 0;
		dat[15] = 0x100; // message length: 32 bytes = 256 bits

		sha256_round_body(dat, buf, c_K);

		// output (in place, endian-corrected)
		uint2* output = (uint2*) input;
		#pragma unroll
		for (int i=0;i<4;i++) {
			output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
		}
#ifdef PAD_ZEROS
		// clear the upper half of the 64-byte slot
		#pragma unroll
		for (int i=4; i<8; i++) output[i] = vectorize(0);
#endif
	}
}
|
||||
|
||||
__global__
/*__launch_bounds__(256,3)*/
// Double SHA-256 of the 112-byte lbry header (midstate variant), one nonce
// per thread. Round constants are staged in dynamic shared memory (the
// launcher passes 64*4 bytes).
void lbry_sha256d_gpu_hash_112(const uint32_t threads, const uint32_t startNonce, const bool swabNonce, uint64_t *outputHash)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	extern __shared__ uint32_t s_K[];
	//s_K[thread & 63] = c_K[thread & 63];
	if (threadIdx.x < 64U) s_K[threadIdx.x] = c_K[threadIdx.x];
	// FIX: every warp of the block reads s_K below, but only the first two
	// warps write it; without a barrier the other warps can race ahead and
	// read uninitialized shared memory.
	__syncthreads();
	if (thread < threads)
	{
		const uint32_t nonce = startNonce + thread;

		uint32_t dat[16];
		#pragma unroll
		for (int i=0; i<11; i++) dat[i] = c_dataEnd112[i];
		dat[11] = swabNonce ? cuda_swab32(nonce) : nonce;
		dat[12] = 0x80000000; // padding bit
		dat[13] = 0;
		dat[14] = 0;
		dat[15] = 0x380; // 112 bytes = 0x380 bits

		uint32_t __align__(8) buf[8];
		#pragma unroll
		for (int i=0;i<8;i++) buf[i] = c_midstate112[i];

		sha256_round_body(dat, buf, s_K);

		// second sha256 over the 32-byte first digest

		#pragma unroll
		for (int i=0; i<8; i++) dat[i] = buf[i];
		dat[8] = 0x80000000;
		#pragma unroll
		for (int i=9; i<15; i++) dat[i] = 0;
		dat[15] = 0x100; // 32 bytes = 256 bits

		#pragma unroll
		for (int i=0; i<8; i++) buf[i] = c_H256[i];

		sha256_round_body(dat, buf, s_K);

		// output
		uint2* output = (uint2*) (&outputHash[thread * 8U]);
		#pragma unroll
		for (int i=0;i<4;i++) {
			output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
			//output[i] = vectorize(((uint64_t*)buf)[i]);
		}
	}
}
|
||||
|
||||
__global__
/*__launch_bounds__(256,3)*/
// Single SHA-256 over the two concatenated 20-byte ripemd160 digests
// stored at offsets 0 and 32 of the per-thread 64-byte slot; result is
// written back in place.
void lbry_sha256_gpu_hash_20x2(uint32_t threads, uint64_t *Hash512)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		uint32_t __align__(8) buf[8]; // align for vectorize
		#pragma unroll
		for (int i=0;i<8;i++) buf[i] = c_H256[i]; // fresh SHA-256 IV

		uint32_t* input = (uint32_t*) (&Hash512[thread * 8U]);

		uint32_t dat[16];
		#pragma unroll
		for (int i=0;i<5;i++) dat[i] = cuda_swab32(input[i]);       // first 20-byte digest
		#pragma unroll
		for (int i=0;i<5;i++) dat[i+5] = cuda_swab32(input[i+8]);   // second, at byte offset 32
		dat[10] = 0x80000000; // padding bit
		#pragma unroll
		for (int i=11;i<15;i++) dat[i] = 0;
		dat[15] = 0x140; // message length: 40 bytes = 320 bits

		sha256_round_body(dat, buf, c_K);

		// output (in place)
		uint2* output = (uint2*) input;
		#pragma unroll
		for (int i=0;i<4;i++) {
			//output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
			output[i] = vectorize(((uint64_t*)buf)[i]);
		}
#ifdef PAD_ZEROS
		#pragma unroll
		for (int i=4; i<8; i++) output[i] = vectorize(0);
#endif
	}
}
|
||||
|
||||
__global__
/*__launch_bounds__(256,3)*/
// Double SHA-256 over the two concatenated 20-byte ripemd160 digests
// (offsets 0 and 32 of the per-thread slot). Round constants are staged in
// dynamic shared memory (launcher passes 64*4 bytes). Result written back
// in place; unless FULL_HASH is defined, only the top 8 bytes are stored.
void lbry_sha256d_gpu_hash_20x2(uint32_t threads, uint64_t *Hash512)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	extern __shared__ uint32_t s_K[];
	if (threadIdx.x < 64U) s_K[threadIdx.x] = c_K[threadIdx.x];
	// FIX: every warp of the block reads s_K below, but only the first two
	// warps write it; without a barrier the other warps can race ahead and
	// read uninitialized shared memory.
	__syncthreads();
	if (thread < threads)
	{
		uint32_t __align__(8) buf[8]; // align for vectorize
		#pragma unroll
		for (int i=0;i<8;i++) buf[i] = c_H256[i];

		uint32_t* input = (uint32_t*) (&Hash512[thread * 8U]);

		uint32_t dat[16];
		#pragma unroll
		for (int i=0; i<5; i++) dat[i] = cuda_swab32(input[i]);
		#pragma unroll
		for (int i=0; i<5; i++) dat[i+5] = cuda_swab32(input[i+8]);
		dat[10] = 0x80000000; // padding bit
		#pragma unroll
		for (int i=11;i<15;i++) dat[i] = 0;
		dat[15] = 0x140; // 40 bytes = 320 bits

		sha256_round_body(dat, buf, s_K);

		// second sha256 over the 32-byte first digest

		#pragma unroll
		for (int i=0; i<8; i++) dat[i] = buf[i];
		dat[8] = 0x80000000;
		#pragma unroll
		for (int i=9; i<15; i++) dat[i] = 0;
		dat[15] = 0x100; // 32 bytes = 256 bits

		#pragma unroll
		for (int i=0;i<8;i++) buf[i] = c_H256[i];

		sha256_round_body(dat, buf, s_K);

		// output
		uint2* output = (uint2*) input;

#ifdef FULL_HASH
		#pragma unroll
		for (int i=0;i<4;i++) {
			output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
			//output[i] = vectorize(((uint64_t*)buf)[i]);
		}
# ifdef PAD_ZEROS
		#pragma unroll
		for (int i=4; i<8; i++) output[i] = vectorize(0);
# endif

#else
		//input[6] = cuda_swab32(buf[6]);
		//input[7] = cuda_swab32(buf[7]);
		// only the high 64 bits are needed for the target check
		output[3] = vectorize(cuda_swab32ll(((uint64_t*)buf)[3]));
#endif
	}
}
|
||||
|
||||
__host__
// Per-GPU init: upload the 64 SHA-256 round constants to constant memory
// and allocate the device-side result-nonce buffer.
void lbry_sha256_init(int thr_id)
{
	//cudaMemcpyToSymbol(c_H256, cpu_H256, sizeof(cpu_H256), 0, cudaMemcpyHostToDevice);
	cudaMemcpyToSymbol(c_K, cpu_K, sizeof(cpu_K), 0, cudaMemcpyHostToDevice);
	CUDA_SAFE_CALL(cudaMalloc(&d_resNonces, 4*sizeof(uint32_t)));
}
|
||||
|
||||
__host__
// Release the device-side result-nonce buffer allocated in lbry_sha256_init.
void lbry_sha256_free(int thr_id)
{
	cudaFree(d_resNonces);
}
|
||||
|
||||
__host__
// Precompute the SHA-256 midstate of the first 64 header bytes on the CPU
// and upload it, the remaining 44 header bytes (nonce excluded) and the
// 64-bit target to device constant memory.
void lbry_sha256_setBlock_112(uint32_t *pdata, uint32_t *ptarget)
{
	uint32_t in[16], buf[8], end[11];
	for (int i=0;i<16;i++) in[i] = cuda_swab32(pdata[i]);   // first 64-byte block, big-endian
	for (int i=0; i<8;i++) buf[i] = cpu_H256[i];            // SHA-256 IV
	for (int i=0;i<11;i++) end[i] = cuda_swab32(pdata[16+i]); // header tail before the nonce
	sha256_round_body_host(in, buf, cpu_K);

	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_midstate112, buf, 32, 0, cudaMemcpyHostToDevice));
	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_dataEnd112, end, sizeof(end), 0, cudaMemcpyHostToDevice));
	// ptarget[6..7] hold the 64 most significant target bits
	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_target, &ptarget[6], sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
	CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &ptarget[6], sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
}
|
||||
|
||||
__host__
// Launcher for lbry_sha256_gpu_hash_112.
void lbry_sha256_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash, bool swabNonce, cudaStream_t stream)
{
	const int threadsperblock = 256;

	// Round the grid up so a thread count that is not a multiple of 256
	// still covers every nonce (the kernel guards with thread < threads);
	// matches the rounding already used by lbry_sha512_hash_32.
	dim3 grid((threads + threadsperblock - 1)/threadsperblock);
	dim3 block(threadsperblock);

	lbry_sha256_gpu_hash_112 <<<grid, block, 0, stream>>> (threads, startNonce, swabNonce, (uint64_t*) d_outputHash);
	cudaGetLastError(); // clear any sticky launch error
}
|
||||
|
||||
__host__
// Launcher for lbry_sha256_gpu_hash_32.
void lbry_sha256_hash_32(int thr_id, uint32_t threads, uint32_t *d_Hash, cudaStream_t stream)
{
	const int threadsperblock = 256;

	// Round up so a non-multiple-of-256 thread count is fully covered
	// (kernel guards with thread < threads).
	dim3 grid((threads + threadsperblock - 1)/threadsperblock);
	dim3 block(threadsperblock);

	lbry_sha256_gpu_hash_32 <<<grid, block, 0, stream>>> (threads, (uint64_t*) d_Hash);
}
|
||||
|
||||
__host__
// Launcher for lbry_sha256d_gpu_hash_112; 64*4 bytes of dynamic shared
// memory hold the round constants copied by the kernel.
void lbry_sha256d_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash, bool swabNonce, cudaStream_t stream)
{
	const int threadsperblock = 256;

	// Round up so a non-multiple-of-256 thread count is fully covered
	// (kernel guards with thread < threads).
	dim3 grid((threads + threadsperblock - 1)/threadsperblock);
	dim3 block(threadsperblock);

	lbry_sha256d_gpu_hash_112 <<<grid, block, 64*4, stream>>> (threads, startNonce, swabNonce, (uint64_t*) d_outputHash);
}
|
||||
|
||||
__host__
// Launcher for lbry_sha256_gpu_hash_20x2.
void lbry_sha256_hash_20x2(int thr_id, uint32_t threads, uint32_t *d_Hash, cudaStream_t stream)
{
	const int threadsperblock = 256;

	// Round up so a non-multiple-of-256 thread count is fully covered
	// (kernel guards with thread < threads).
	dim3 grid((threads + threadsperblock - 1)/threadsperblock);
	dim3 block(threadsperblock);

	lbry_sha256_gpu_hash_20x2 <<<grid, block, 0, stream>>> (threads, (uint64_t*) d_Hash);
}
|
||||
|
||||
__host__
// Launcher for lbry_sha256d_gpu_hash_20x2; 64*4 bytes of dynamic shared
// memory hold the round constants copied by the kernel.
void lbry_sha256d_hash_20x2(int thr_id, uint32_t threads, uint32_t *d_Hash, cudaStream_t stream)
{
	const int threadsperblock = 256;

	// Round up so a non-multiple-of-256 thread count is fully covered
	// (kernel guards with thread < threads).
	dim3 grid((threads + threadsperblock - 1)/threadsperblock);
	dim3 block(threadsperblock);

	lbry_sha256d_gpu_hash_20x2 <<<grid, block, 64*4, stream>>> (threads, (uint64_t*) d_Hash);
}
|
||||
|
||||
__global__
__launch_bounds__(256,3)
// Final double SHA-256 over the two ripemd160 digests, with an inline
// 64-bit target check; winning nonces are pushed into resNonces and the
// device-side target is tightened so only better solutions are reported.
void lbry_sha256d_gpu_hash_final(const uint32_t threads, const uint32_t startNonce, uint64_t *Hash512, uint32_t *resNonces)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		uint32_t __align__(8) buf[8]; // align for vectorize
		#pragma unroll
		for (int i=0;i<8;i++) buf[i] = c_H256[i];

		uint32_t* input = (uint32_t*) (&Hash512[thread * 8U]);

		uint32_t __align__(8) dat[16];
		#pragma unroll
		for (int i=0;i<5;i++) dat[i] = cuda_swab32(input[i]);     // first 20-byte digest
		#pragma unroll
		for (int i=0;i<5;i++) dat[i+5] = cuda_swab32(input[i+8]); // second, at byte offset 32
		dat[10] = 0x80000000; // padding bit
		#pragma unroll
		for (int i=11;i<15;i++) dat[i] = 0;
		dat[15] = 0x140; // 40 bytes = 320 bits

		sha256_round_body(dat, buf, c_K);

		// second sha256 over the 32-byte first digest

		#pragma unroll
		for (int i=0;i<8;i++) dat[i] = buf[i];
		dat[8] = 0x80000000;
		#pragma unroll
		for (int i=9;i<15;i++) dat[i] = 0;
		dat[15] = 0x100; // 32 bytes = 256 bits

		#pragma unroll
		for (int i=0;i<8;i++) buf[i] = c_H256[i];

		sha256_round_body(dat, buf, c_K);

		// valid nonces: compare the top 64 bits against the running target
		uint64_t high = cuda_swab32ll(((uint64_t*)buf)[3]);
		if (high <= d_target[0]) {
			// printf("%08x %08x - %016llx %016llx - %08x %08x\n", buf[7], buf[6], high, d_target[0], c_target[1], c_target[0]);
			uint32_t nonce = startNonce + thread;
			// keep the new nonce in [0], demote the previous one to [1]
			resNonces[1] = atomicExch(resNonces, nonce);
			d_target[0] = high; // tighten: only better results reported after this
		}
	}
}
|
||||
|
||||
__host__
// Launch the final target-check kernel and copy back up to two candidate
// nonces; resNonces[1] is set to UINT32_MAX when only one was found.
void lbry_sha256d_hash_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_inputHash, uint32_t *resNonces, cudaStream_t stream)
{
	const int threadsperblock = 256;

	// Round up so a non-multiple-of-256 thread count is fully covered
	// (kernel guards with thread < threads).
	dim3 grid((threads + threadsperblock - 1)/threadsperblock);
	dim3 block(threadsperblock);

	CUDA_SAFE_CALL(cudaMemset(d_resNonces, 0xFF, 2 * sizeof(uint32_t)));
	// cudaThreadSynchronize() is deprecated since CUDA 4.0; use the
	// cudaDeviceSynchronize() replacement.
	cudaDeviceSynchronize();

	lbry_sha256d_gpu_hash_final <<<grid, block, 0, stream>>> (threads, startNonce, (uint64_t*) d_inputHash, d_resNonces);

	cudaDeviceSynchronize();

	CUDA_SAFE_CALL(cudaMemcpy(resNonces, d_resNonces, 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost));
	if (resNonces[0] == resNonces[1]) {
		// atomicExch stored the same nonce twice => only one candidate
		resNonces[1] = UINT32_MAX;
	}
}
|
181
lbry/cuda_sha512_lbry.cu
Normal file
181
lbry/cuda_sha512_lbry.cu
Normal file
@ -0,0 +1,181 @@
|
||||
/**
|
||||
* sha-512 CUDA implementation.
|
||||
*/
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <memory.h>
|
||||
|
||||
#include <cuda_helper.h>
|
||||
|
||||
// Device copy of the SHA-512 round constants (filled by lbry_sha512_init).
static __constant__ uint64_t K_512[80];

// SHA-512 round constants K[0..79] (FIPS 180-4), host master copy.
static const uint64_t K512[80] = {
	0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
	0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
	0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
	0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694,
	0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
	0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
	0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
	0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70,
	0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
	0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
	0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30,
	0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8,
	0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
	0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
	0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
	0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
	0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
	0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
	0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
	0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
};
|
||||
|
||||
//#undef xor3
|
||||
//#define xor3(a,b,c) (a^b^c)
|
||||
|
||||
//#undef
|
||||
|
||||
// SHA-512 Sigma0(x) = (x>>>28) ^ (x>>>34) ^ (x>>>39)
static __device__ __forceinline__
uint64_t bsg5_0(const uint64_t x)
{
	return xor3(ROTR64(x, 28), ROTR64(x, 34), ROTR64(x, 39));
}
|
||||
|
||||
// SHA-512 Sigma1(x) = (x>>>14) ^ (x>>>18) ^ (x>>>41)
static __device__ __forceinline__
uint64_t bsg5_1(const uint64_t x)
{
	return xor3(ROTR64(x, 14), ROTR64(x, 18), ROTR64(x, 41));
}
|
||||
|
||||
// SHA-512 sigma0(x) = (x>>>1) ^ (x>>>8) ^ (x>>7)
static __device__ __forceinline__
uint64_t ssg5_0(const uint64_t x)
{
	return xor3(ROTR64(x, 1), ROTR64(x, 8), shr_t64(x, 7));
}
|
||||
|
||||
// SHA-512 sigma1(x) = (x>>>19) ^ (x>>>61) ^ (x>>6)
static __device__ __forceinline__
uint64_t ssg5_1(const uint64_t x)
{
	return xor3(ROTR64(x, 19), ROTR64(x, 61), shr_t64(x, 6));
}
|
||||
|
||||
// SHA-512 Ch(a,b,c) computed as ((b ^ c) & a) ^ c, pinned to a
// 3-instruction PTX sequence.
static __device__ __forceinline__
uint64_t xandx64(const uint64_t a, const uint64_t b, const uint64_t c)
{
	uint64_t result;
	asm("{ .reg .u64 m,n; // xandx64\n\t"
		"xor.b64 m, %2,%3;\n\t"   // m = b ^ c
		"and.b64 n, m,%1;\n\t"    // n = m & a
		"xor.b64 %0, n,%3;\n\t"   // result = n ^ c
		"}" : "=l"(result) : "l"(a), "l"(b), "l"(c));
	return result;
}
|
||||
|
||||
// One SHA-512 compression round. Instead of shuffling the eight working
// variables, r[] is treated as a rotating register file: `ord` (round mod 8)
// selects which slot currently plays the role of a..h, and only the two
// slots holding d and h are written back.
static __device__ __forceinline__
void sha512_step2(uint64_t* r, uint64_t* W, uint64_t* K, const int ord, int i)
{
	int u = 8-ord; // rotation offset so (0+u)&7 lands on the current 'a'
	uint64_t a = r[(0+u) & 7];
	uint64_t b = r[(1+u) & 7];
	uint64_t c = r[(2+u) & 7];
	uint64_t d = r[(3+u) & 7];
	uint64_t e = r[(4+u) & 7];
	uint64_t f = r[(5+u) & 7];
	uint64_t g = r[(6+u) & 7];
	uint64_t h = r[(7+u) & 7];

	uint64_t T1 = h + bsg5_1(e) + xandx64(e,f,g) + W[i] + K[i];
	uint64_t T2 = bsg5_0(a) + andor(a,b,c);
	r[(3+u)& 7] = d + T1; // new e
	r[(7+u)& 7] = T1 + T2; // new a
}
|
||||
|
||||
/**************************************************************************************************/
|
||||
|
||||
__global__
// Single SHA-512 of a 32-byte input per thread, in place: reads the first
// 32 bytes of the thread's 64-byte slot in g_hash (big-endian) and writes
// the full 64-byte digest back (big-endian).
void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		uint64_t *pHash = &g_hash[thread * 8U];

		uint64_t W[80]; // full message schedule
		uint64_t r[8];  // rotating working variables

		uint64_t IV512[8] = {
			0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
			0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
		};

		#pragma unroll
		for (int i = 0; i < 8; i++)
			r[i] = IV512[i];

		#pragma unroll
		for (int i = 0; i < 4; i++) {
			// 32 bytes input
			W[i] = cuda_swab64(pHash[i]);
		}

		W[4] = 0x8000000000000000; // end tag

		#pragma unroll
		for (int i = 5; i < 15; i++) W[i] = 0;

		W[15] = 0x100; // 256 bits

		#pragma unroll
		for (int i = 16; i < 80; i++) W[i] = 0;

		// message schedule expansion
		#pragma unroll 64
		for (int i = 16; i < 80; i++)
			W[i] = ssg5_1(W[i - 2]) + W[i - 7] + ssg5_0(W[i - 15]) + W[i - 16];

		// 80 rounds, 8 at a time so the register rotation pattern repeats
		#pragma unroll 10
		for (int i = 0; i < 10; i++) {
			#pragma unroll 8
			for (int ord=0; ord<8; ord++)
				sha512_step2(r, W, K_512, ord, 8*i + ord);
		}

		// feed-forward and store big-endian
		#pragma unroll 8
		for (int i = 0; i < 8; i++)
			pHash[i] = cuda_swab64(r[i] + IV512[i]);
	}
}
|
||||
|
||||
__host__
// Launcher for lbry_sha512_gpu_hash_32.
void lbry_sha512_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash, cudaStream_t stream)
{
	const int threadsperblock = 256;

	dim3 grid((threads + threadsperblock-1)/threadsperblock);
	dim3 block(threadsperblock);

	// Fix: the kernel reads K_512 from __constant__ memory and declares no
	// extern __shared__ array, so the former 80*8-byte dynamic shared
	// allocation was never used and only reduced occupancy.
	lbry_sha512_gpu_hash_32 <<<grid, block, 0, stream>>> (threads, (uint64_t*)d_hash);
}
|
||||
|
||||
/**************************************************************************************************/
|
||||
|
||||
__host__
// Per-GPU init: upload the 80 SHA-512 round constants to constant memory.
void lbry_sha512_init(int thr_id)
{
	cudaMemcpyToSymbol(K_512, K512, 80*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
}
|
225
lbry/lbry.cu
Normal file
225
lbry/lbry.cu
Normal file
@ -0,0 +1,225 @@
|
||||
/**
|
||||
* Lbry CUDA Implementation
|
||||
*
|
||||
* by tpruvot@github - July 2016
|
||||
*
|
||||
*/
|
||||
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
|
||||
extern "C" {
|
||||
#include <sph/sph_sha2.h>
|
||||
#include <sph/sph_ripemd.h>
|
||||
}
|
||||
|
||||
#include <cuda_helper.h>
|
||||
#include <miner.h>
|
||||
|
||||
#define A 64
|
||||
#define debug_cpu 0
|
||||
|
||||
// CPU reference implementation of the lbry proof-of-work hash:
// sha256d(112-byte header) -> sha512 -> ripemd160(low half) ||
// ripemd160(high half) -> sha256d, 32-byte result in `output`.
// NOTE(review): the same sha256 context is reused across hashes without an
// explicit re-init; presumably sph_sha256_close() re-initializes the
// context — verify against sph_sha2.c.
extern "C" void lbry_hash(void* output, const void* input)
{
	uint32_t _ALIGN(A) hashA[16];
	uint32_t _ALIGN(A) hashB[8];
	uint32_t _ALIGN(A) hashC[8];

	sph_sha256_context ctx_sha256;
	sph_sha512_context ctx_sha512;
	sph_ripemd160_context ctx_ripemd;

	sph_sha256_init(&ctx_sha256);
	sph_sha256(&ctx_sha256, input, 112);
	sph_sha256_close(&ctx_sha256, hashA);

	sph_sha256(&ctx_sha256, hashA, 32);
	sph_sha256_close(&ctx_sha256, hashA);

	sph_sha512_init(&ctx_sha512);
	sph_sha512(&ctx_sha512, hashA, 32);
	sph_sha512_close(&ctx_sha512, hashA); // 64-byte digest fills hashA[0..15]

	sph_ripemd160_init(&ctx_ripemd);
	sph_ripemd160(&ctx_ripemd, hashA, 32); // sha512 low
	sph_ripemd160_close(&ctx_ripemd, hashB);
	if (debug_cpu) applog_hex(hashB, 20);

	sph_ripemd160(&ctx_ripemd, &hashA[8], 32); // sha512 high
	sph_ripemd160_close(&ctx_ripemd, hashC);
	if (debug_cpu) applog_hex(hashC, 20);

	// sha256d over the concatenated 20-byte ripemd digests
	sph_sha256(&ctx_sha256, hashB, 20);
	sph_sha256(&ctx_sha256, hashC, 20);
	sph_sha256_close(&ctx_sha256, hashA);
	if (debug_cpu) applog_hex(hashA,32);

	sph_sha256(&ctx_sha256, hashA, 32);
	sph_sha256_close(&ctx_sha256, hashA);

	memcpy(output, hashA, 32);
}
|
||||
|
||||
/* ############################################################################################################################### */
|
||||
|
||||
extern void lbry_ripemd160_init(int thr_id);
|
||||
extern void lbry_sha256_init(int thr_id);
|
||||
extern void lbry_sha256_free(int thr_id);
|
||||
extern void lbry_sha256_setBlock_112(uint32_t *pdata, uint32_t *ptarget);
|
||||
extern void lbry_sha256_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash, bool swabNonce, cudaStream_t stream);
|
||||
extern void lbry_sha256d_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash, bool swabNonce, cudaStream_t stream);
|
||||
extern void lbry_sha256_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash, cudaStream_t stream);
|
||||
extern void lbry_sha512_init(int thr_id);
|
||||
extern void lbry_sha512_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash, cudaStream_t stream);
|
||||
extern void lbry_ripemd160_hash_32x2(int thr_id, uint32_t threads, uint32_t *g_Hash, cudaStream_t stream);
|
||||
extern void lbry_sha256_hash_20x2(int thr_id, uint32_t threads, uint32_t *g_Hash, cudaStream_t stream);
|
||||
extern void lbry_sha256d_hash_20x2(int thr_id, uint32_t threads, uint32_t *g_Hash, cudaStream_t stream);
|
||||
extern void lbry_sha256d_hash_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_inputHash, uint32_t *resNonces, cudaStream_t stream);
|
||||
|
||||
// Conditionally byte-swap a 32-bit word: swapped when iftrue, else unchanged.
static __inline uint32_t swab32_if(uint32_t val, bool iftrue) {
	if (iftrue)
		return swab32(val);
	return val;
}
|
||||
|
||||
static bool init[MAX_GPUS] = { 0 };
|
||||
|
||||
static uint32_t *d_hash[MAX_GPUS];
|
||||
|
||||
// nonce position is different
|
||||
#define LBC_NONCE_OFT32 27
|
||||
|
||||
/**
 * LBRY Credits scanhash: sha256d(112 bytes) -> sha512 -> double ripemd160 -> sha256d.
 * The nonce sits at word 27 of the 112-byte header (LBC_NONCE_OFT32).
 * Returns the number of valid nonces found (0, 1 or 2); found nonces are
 * stored in work->nonces[], best-ratio one first.
 */
extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done)
{
	uint32_t _ALIGN(A) vhash[8];
	uint32_t _ALIGN(A) endiandata[28];
	uint32_t *pdata = work->data;
	uint32_t *ptarget = work->target;

	const uint32_t first_nonce = pdata[LBC_NONCE_OFT32];
	const int swap = 0; // to toggle nonce endian

	const int dev_id = device_map[thr_id];
	// default intensity per GPU generation (lower on windows and older cards)
	int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 22 : 20;
	if (device_sm[dev_id] >= 600) intensity = 23;
	if (device_sm[dev_id] < 350) intensity = 18;

	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);

	if (opt_benchmark) {
		ptarget[7] = 0xff;
	}

	if (!init[thr_id]) {
		cudaSetDevice(dev_id);
		if (opt_cudaschedule == -1 && gpu_threads == 1) {
			cudaDeviceReset();
			// reduce cpu usage (linux)
			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
			CUDA_LOG_ERROR();
		}

		// 64 bytes of intermediate hash state per thread
		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));

		lbry_sha256_init(thr_id);
		lbry_sha512_init(thr_id);
		lbry_ripemd160_init(thr_id);
		cuda_check_cpu_init(thr_id, throughput);
		CUDA_LOG_ERROR();

		init[thr_id] = true;
	}

	// big-endian encode the 108 header bytes preceding the nonce
	for (int i=0; i < LBC_NONCE_OFT32; i++) {
		be32enc(&endiandata[i], pdata[i]);
	}

	lbry_sha256_setBlock_112(endiandata, ptarget);
	cuda_check_cpu_setTarget(ptarget);

	do {
		// Hash with CUDA
#if 0
		lbry_sha256_hash_112(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id], swap, 0);
		lbry_sha256_hash_32(thr_id, throughput, d_hash[thr_id], 0);
#else
		lbry_sha256d_hash_112(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id], swap, 0);
#endif
		CUDA_LOG_ERROR();

		lbry_sha512_hash_32(thr_id, throughput, d_hash[thr_id], 0);

		lbry_ripemd160_hash_32x2(thr_id, throughput, d_hash[thr_id], 0);

#if 0
		lbry_sha256d_hash_20x2(thr_id, throughput, d_hash[thr_id], 0);
		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id]);
#else
		uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
		lbry_sha256d_hash_final(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id], resNonces, 0);
		uint32_t foundNonce = resNonces[0];
#endif

		*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce + throughput;

		if (foundNonce != UINT32_MAX)
		{
			endiandata[LBC_NONCE_OFT32] = swab32_if(foundNonce, !swap);
			lbry_hash(vhash, endiandata);

			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
				int res = 1;
				uint32_t secNonce = resNonces[1];
				work->nonces[0] = swab32_if(foundNonce, swap);
				work_set_target_ratio(work, vhash);
				if (secNonce != UINT32_MAX) {
					if (opt_debug)
						gpulog(LOG_BLUE, thr_id, "found second nonce %08x", swab32(secNonce));
					endiandata[LBC_NONCE_OFT32] = swab32_if(secNonce, !swap);
					lbry_hash(vhash, endiandata);
					// fix: nonces[1] must be set BEFORE the xchg below, else a
					// stale value gets swapped into nonces[0] and submitted
					work->nonces[1] = swab32_if(secNonce, swap);
					if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio) {
						// the second nonce has a better share ratio, report it first
						work_set_target_ratio(work, vhash);
						xchg(work->nonces[0], work->nonces[1]);
					}
					res++;
				}
				pdata[LBC_NONCE_OFT32] = work->nonces[0];
				return res;
			} else {
				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU %08x > %08x!", foundNonce, vhash[7], ptarget[7]);
			}
		}

		if ((uint64_t) throughput + pdata[LBC_NONCE_OFT32] >= max_nonce) {
			pdata[LBC_NONCE_OFT32] = max_nonce;
			break;
		}

		pdata[LBC_NONCE_OFT32] += throughput;

	} while (!work_restart[thr_id].restart);

	return 0;
}
|
||||
|
||||
// cleanup
|
||||
// Release per-thread GPU resources allocated by scanhash_lbry.
void free_lbry(int thr_id)
{
	if (init[thr_id]) {
		cudaThreadSynchronize();

		cudaFree(d_hash[thr_id]);
		lbry_sha256_free(thr_id);

		cuda_check_cpu_free(thr_id);
		init[thr_id] = false;

		cudaDeviceSynchronize();
	}
}
|
3
miner.h
3
miner.h
@ -275,6 +275,7 @@ extern int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce,
|
||||
extern int scanhash_groestlcoin(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
|
||||
extern int scanhash_heavy(int thr_id,struct work *work, uint32_t max_nonce, unsigned long *hashes_done, uint32_t maxvote, int blocklen);
|
||||
extern int scanhash_jackpot(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
|
||||
extern int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
|
||||
extern int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
|
||||
extern int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
|
||||
extern int scanhash_lyra2v2(int thr_id,struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
|
||||
@ -584,6 +585,7 @@ struct stratum_job {
|
||||
unsigned char version[4];
|
||||
unsigned char nbits[4];
|
||||
unsigned char ntime[4];
|
||||
unsigned char claim[32]; // lbry
|
||||
bool clean;
|
||||
unsigned char nreward[2];
|
||||
uint32_t height;
|
||||
@ -797,6 +799,7 @@ void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
|
||||
void keccak256_hash(void *state, const void *input);
|
||||
unsigned int jackpothash(void *state, const void *input);
|
||||
void groestlhash(void *state, const void *input);
|
||||
void lbry_hash(void *output, const void *input);
|
||||
void lyra2re_hash(void *state, const void *input);
|
||||
void lyra2v2_hash(void *state, const void *input);
|
||||
void myriadhash(void *state, const void *input);
|
||||
|
833
sph/ripemd.c
Normal file
833
sph/ripemd.c
Normal file
@ -0,0 +1,833 @@
|
||||
/* $Id: ripemd.c 216 2010-06-08 09:46:57Z tp $ */
|
||||
/*
|
||||
* RIPEMD-160 implementation.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "sph_ripemd.h"
|
||||
|
||||
/*
|
||||
* Round functions for RIPEMD (original).
|
||||
*/
|
||||
#define F(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
|
||||
#define G(x, y, z) (((x) & (y)) | (((x) | (y)) & (z)))
|
||||
#define H(x, y, z) ((x) ^ (y) ^ (z))
|
||||
|
||||
static const sph_u32 oIV[5] = {
|
||||
SPH_C32(0x67452301), SPH_C32(0xEFCDAB89),
|
||||
SPH_C32(0x98BADCFE), SPH_C32(0x10325476)
|
||||
};
|
||||
|
||||
/*
|
||||
* Round functions for RIPEMD-128 and RIPEMD-160.
|
||||
*/
|
||||
#define F1(x, y, z) ((x) ^ (y) ^ (z))
|
||||
#define F2(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
|
||||
#define F3(x, y, z) (((x) | ~(y)) ^ (z))
|
||||
#define F4(x, y, z) ((((x) ^ (y)) & (z)) ^ (y))
|
||||
#define F5(x, y, z) ((x) ^ ((y) | ~(z)))
|
||||
|
||||
static const sph_u32 IV[5] = {
|
||||
SPH_C32(0x67452301), SPH_C32(0xEFCDAB89), SPH_C32(0x98BADCFE),
|
||||
SPH_C32(0x10325476), SPH_C32(0xC3D2E1F0)
|
||||
};
|
||||
|
||||
#define ROTL SPH_ROTL32
|
||||
|
||||
/* ===================================================================== */
|
||||
/*
|
||||
* RIPEMD (original hash, deprecated).
|
||||
*/
|
||||
|
||||
#define FF1(A, B, C, D, X, s) do { \
|
||||
sph_u32 tmp = SPH_T32((A) + F(B, C, D) + (X)); \
|
||||
(A) = ROTL(tmp, (s)); \
|
||||
} while (0)
|
||||
|
||||
#define GG1(A, B, C, D, X, s) do { \
|
||||
sph_u32 tmp = SPH_T32((A) + G(B, C, D) \
|
||||
+ (X) + SPH_C32(0x5A827999)); \
|
||||
(A) = ROTL(tmp, (s)); \
|
||||
} while (0)
|
||||
|
||||
#define HH1(A, B, C, D, X, s) do { \
|
||||
sph_u32 tmp = SPH_T32((A) + H(B, C, D) \
|
||||
+ (X) + SPH_C32(0x6ED9EBA1)); \
|
||||
(A) = ROTL(tmp, (s)); \
|
||||
} while (0)
|
||||
|
||||
#define FF2(A, B, C, D, X, s) do { \
|
||||
sph_u32 tmp = SPH_T32((A) + F(B, C, D) \
|
||||
+ (X) + SPH_C32(0x50A28BE6)); \
|
||||
(A) = ROTL(tmp, (s)); \
|
||||
} while (0)
|
||||
|
||||
#define GG2(A, B, C, D, X, s) do { \
|
||||
sph_u32 tmp = SPH_T32((A) + G(B, C, D) + (X)); \
|
||||
(A) = ROTL(tmp, (s)); \
|
||||
} while (0)
|
||||
|
||||
#define HH2(A, B, C, D, X, s) do { \
|
||||
sph_u32 tmp = SPH_T32((A) + H(B, C, D) \
|
||||
+ (X) + SPH_C32(0x5C4DD124)); \
|
||||
(A) = ROTL(tmp, (s)); \
|
||||
} while (0)
|
||||
|
||||
#define RIPEMD_ROUND_BODY(in, h) do { \
|
||||
sph_u32 A1, B1, C1, D1; \
|
||||
sph_u32 A2, B2, C2, D2; \
|
||||
sph_u32 tmp; \
|
||||
\
|
||||
A1 = A2 = (h)[0]; \
|
||||
B1 = B2 = (h)[1]; \
|
||||
C1 = C2 = (h)[2]; \
|
||||
D1 = D2 = (h)[3]; \
|
||||
\
|
||||
FF1(A1, B1, C1, D1, in( 0), 11); \
|
||||
FF1(D1, A1, B1, C1, in( 1), 14); \
|
||||
FF1(C1, D1, A1, B1, in( 2), 15); \
|
||||
FF1(B1, C1, D1, A1, in( 3), 12); \
|
||||
FF1(A1, B1, C1, D1, in( 4), 5); \
|
||||
FF1(D1, A1, B1, C1, in( 5), 8); \
|
||||
FF1(C1, D1, A1, B1, in( 6), 7); \
|
||||
FF1(B1, C1, D1, A1, in( 7), 9); \
|
||||
FF1(A1, B1, C1, D1, in( 8), 11); \
|
||||
FF1(D1, A1, B1, C1, in( 9), 13); \
|
||||
FF1(C1, D1, A1, B1, in(10), 14); \
|
||||
FF1(B1, C1, D1, A1, in(11), 15); \
|
||||
FF1(A1, B1, C1, D1, in(12), 6); \
|
||||
FF1(D1, A1, B1, C1, in(13), 7); \
|
||||
FF1(C1, D1, A1, B1, in(14), 9); \
|
||||
FF1(B1, C1, D1, A1, in(15), 8); \
|
||||
\
|
||||
GG1(A1, B1, C1, D1, in( 7), 7); \
|
||||
GG1(D1, A1, B1, C1, in( 4), 6); \
|
||||
GG1(C1, D1, A1, B1, in(13), 8); \
|
||||
GG1(B1, C1, D1, A1, in( 1), 13); \
|
||||
GG1(A1, B1, C1, D1, in(10), 11); \
|
||||
GG1(D1, A1, B1, C1, in( 6), 9); \
|
||||
GG1(C1, D1, A1, B1, in(15), 7); \
|
||||
GG1(B1, C1, D1, A1, in( 3), 15); \
|
||||
GG1(A1, B1, C1, D1, in(12), 7); \
|
||||
GG1(D1, A1, B1, C1, in( 0), 12); \
|
||||
GG1(C1, D1, A1, B1, in( 9), 15); \
|
||||
GG1(B1, C1, D1, A1, in( 5), 9); \
|
||||
GG1(A1, B1, C1, D1, in(14), 7); \
|
||||
GG1(D1, A1, B1, C1, in( 2), 11); \
|
||||
GG1(C1, D1, A1, B1, in(11), 13); \
|
||||
GG1(B1, C1, D1, A1, in( 8), 12); \
|
||||
\
|
||||
HH1(A1, B1, C1, D1, in( 3), 11); \
|
||||
HH1(D1, A1, B1, C1, in(10), 13); \
|
||||
HH1(C1, D1, A1, B1, in( 2), 14); \
|
||||
HH1(B1, C1, D1, A1, in( 4), 7); \
|
||||
HH1(A1, B1, C1, D1, in( 9), 14); \
|
||||
HH1(D1, A1, B1, C1, in(15), 9); \
|
||||
HH1(C1, D1, A1, B1, in( 8), 13); \
|
||||
HH1(B1, C1, D1, A1, in( 1), 15); \
|
||||
HH1(A1, B1, C1, D1, in(14), 6); \
|
||||
HH1(D1, A1, B1, C1, in( 7), 8); \
|
||||
HH1(C1, D1, A1, B1, in( 0), 13); \
|
||||
HH1(B1, C1, D1, A1, in( 6), 6); \
|
||||
HH1(A1, B1, C1, D1, in(11), 12); \
|
||||
HH1(D1, A1, B1, C1, in(13), 5); \
|
||||
HH1(C1, D1, A1, B1, in( 5), 7); \
|
||||
HH1(B1, C1, D1, A1, in(12), 5); \
|
||||
\
|
||||
FF2(A2, B2, C2, D2, in( 0), 11); \
|
||||
FF2(D2, A2, B2, C2, in( 1), 14); \
|
||||
FF2(C2, D2, A2, B2, in( 2), 15); \
|
||||
FF2(B2, C2, D2, A2, in( 3), 12); \
|
||||
FF2(A2, B2, C2, D2, in( 4), 5); \
|
||||
FF2(D2, A2, B2, C2, in( 5), 8); \
|
||||
FF2(C2, D2, A2, B2, in( 6), 7); \
|
||||
FF2(B2, C2, D2, A2, in( 7), 9); \
|
||||
FF2(A2, B2, C2, D2, in( 8), 11); \
|
||||
FF2(D2, A2, B2, C2, in( 9), 13); \
|
||||
FF2(C2, D2, A2, B2, in(10), 14); \
|
||||
FF2(B2, C2, D2, A2, in(11), 15); \
|
||||
FF2(A2, B2, C2, D2, in(12), 6); \
|
||||
FF2(D2, A2, B2, C2, in(13), 7); \
|
||||
FF2(C2, D2, A2, B2, in(14), 9); \
|
||||
FF2(B2, C2, D2, A2, in(15), 8); \
|
||||
\
|
||||
GG2(A2, B2, C2, D2, in( 7), 7); \
|
||||
GG2(D2, A2, B2, C2, in( 4), 6); \
|
||||
GG2(C2, D2, A2, B2, in(13), 8); \
|
||||
GG2(B2, C2, D2, A2, in( 1), 13); \
|
||||
GG2(A2, B2, C2, D2, in(10), 11); \
|
||||
GG2(D2, A2, B2, C2, in( 6), 9); \
|
||||
GG2(C2, D2, A2, B2, in(15), 7); \
|
||||
GG2(B2, C2, D2, A2, in( 3), 15); \
|
||||
GG2(A2, B2, C2, D2, in(12), 7); \
|
||||
GG2(D2, A2, B2, C2, in( 0), 12); \
|
||||
GG2(C2, D2, A2, B2, in( 9), 15); \
|
||||
GG2(B2, C2, D2, A2, in( 5), 9); \
|
||||
GG2(A2, B2, C2, D2, in(14), 7); \
|
||||
GG2(D2, A2, B2, C2, in( 2), 11); \
|
||||
GG2(C2, D2, A2, B2, in(11), 13); \
|
||||
GG2(B2, C2, D2, A2, in( 8), 12); \
|
||||
\
|
||||
HH2(A2, B2, C2, D2, in( 3), 11); \
|
||||
HH2(D2, A2, B2, C2, in(10), 13); \
|
||||
HH2(C2, D2, A2, B2, in( 2), 14); \
|
||||
HH2(B2, C2, D2, A2, in( 4), 7); \
|
||||
HH2(A2, B2, C2, D2, in( 9), 14); \
|
||||
HH2(D2, A2, B2, C2, in(15), 9); \
|
||||
HH2(C2, D2, A2, B2, in( 8), 13); \
|
||||
HH2(B2, C2, D2, A2, in( 1), 15); \
|
||||
HH2(A2, B2, C2, D2, in(14), 6); \
|
||||
HH2(D2, A2, B2, C2, in( 7), 8); \
|
||||
HH2(C2, D2, A2, B2, in( 0), 13); \
|
||||
HH2(B2, C2, D2, A2, in( 6), 6); \
|
||||
HH2(A2, B2, C2, D2, in(11), 12); \
|
||||
HH2(D2, A2, B2, C2, in(13), 5); \
|
||||
HH2(C2, D2, A2, B2, in( 5), 7); \
|
||||
HH2(B2, C2, D2, A2, in(12), 5); \
|
||||
\
|
||||
tmp = SPH_T32((h)[1] + C1 + D2); \
|
||||
(h)[1] = SPH_T32((h)[2] + D1 + A2); \
|
||||
(h)[2] = SPH_T32((h)[3] + A1 + B2); \
|
||||
(h)[3] = SPH_T32((h)[0] + B1 + C2); \
|
||||
(h)[0] = tmp; \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
* One round of RIPEMD. The data must be aligned for 32-bit access.
|
||||
*/
|
||||
static void
|
||||
ripemd_round(const unsigned char *data, sph_u32 r[5])
|
||||
{
|
||||
#if SPH_LITTLE_FAST
|
||||
|
||||
#define RIPEMD_IN(x) sph_dec32le_aligned(data + (4 * (x)))
|
||||
|
||||
#else
|
||||
|
||||
sph_u32 X_var[16];
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 16; i ++)
|
||||
X_var[i] = sph_dec32le_aligned(data + 4 * i);
|
||||
#define RIPEMD_IN(x) X_var[x]
|
||||
|
||||
#endif
|
||||
RIPEMD_ROUND_BODY(RIPEMD_IN, r);
|
||||
#undef RIPEMD_IN
|
||||
}
|
||||
|
||||
/* see sph_ripemd.h */
|
||||
void
|
||||
sph_ripemd_init(void *cc)
|
||||
{
|
||||
sph_ripemd_context *sc;
|
||||
|
||||
sc = cc;
|
||||
memcpy(sc->val, oIV, sizeof sc->val);
|
||||
#if SPH_64
|
||||
sc->count = 0;
|
||||
#else
|
||||
sc->count_high = sc->count_low = 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
#define RFUN ripemd_round
|
||||
#define HASH ripemd
|
||||
#define LE32 1
|
||||
#include "md_helper.c"
|
||||
#undef RFUN
|
||||
#undef HASH
|
||||
#undef LE32
|
||||
|
||||
/* see sph_ripemd.h */
|
||||
void
sph_ripemd_close(void *cc, void *dst)
{
	/* pad, emit the 4-word digest, then reset for reuse */
	ripemd_close(cc, dst, 4);
	sph_ripemd_init(cc);
}
|
||||
|
||||
/* see sph_ripemd.h */
|
||||
void
|
||||
sph_ripemd_comp(const sph_u32 msg[16], sph_u32 val[4])
|
||||
{
|
||||
#define RIPEMD_IN(x) msg[x]
|
||||
RIPEMD_ROUND_BODY(RIPEMD_IN, val);
|
||||
#undef RIPEMD_IN
|
||||
}
|
||||
|
||||
/* ===================================================================== */
|
||||
/*
|
||||
* RIPEMD-128.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Round constants for RIPEMD-128.
|
||||
*/
|
||||
#define sK11 SPH_C32(0x00000000)
|
||||
#define sK12 SPH_C32(0x5A827999)
|
||||
#define sK13 SPH_C32(0x6ED9EBA1)
|
||||
#define sK14 SPH_C32(0x8F1BBCDC)
|
||||
|
||||
#define sK21 SPH_C32(0x50A28BE6)
|
||||
#define sK22 SPH_C32(0x5C4DD124)
|
||||
#define sK23 SPH_C32(0x6D703EF3)
|
||||
#define sK24 SPH_C32(0x00000000)
|
||||
|
||||
#define sRR(a, b, c, d, f, s, r, k) do { \
|
||||
a = ROTL(SPH_T32(a + f(b, c, d) + r + k), s); \
|
||||
} while (0)
|
||||
|
||||
#define sROUND1(a, b, c, d, f, s, r, k) \
|
||||
sRR(a ## 1, b ## 1, c ## 1, d ## 1, f, s, r, sK1 ## k)
|
||||
|
||||
#define sROUND2(a, b, c, d, f, s, r, k) \
|
||||
sRR(a ## 2, b ## 2, c ## 2, d ## 2, f, s, r, sK2 ## k)
|
||||
|
||||
/*
|
||||
* This macro defines the body for a RIPEMD-128 compression function
|
||||
* implementation. The "in" parameter should evaluate, when applied to a
|
||||
* numerical input parameter from 0 to 15, to an expression which yields
|
||||
* the corresponding input block. The "h" parameter should evaluate to
|
||||
* an array or pointer expression designating the array of 4 words which
|
||||
* contains the input and output of the compression function.
|
||||
*/
|
||||
|
||||
#define RIPEMD128_ROUND_BODY(in, h) do { \
|
||||
sph_u32 A1, B1, C1, D1; \
|
||||
sph_u32 A2, B2, C2, D2; \
|
||||
sph_u32 tmp; \
|
||||
\
|
||||
A1 = A2 = (h)[0]; \
|
||||
B1 = B2 = (h)[1]; \
|
||||
C1 = C2 = (h)[2]; \
|
||||
D1 = D2 = (h)[3]; \
|
||||
\
|
||||
sROUND1(A, B, C, D, F1, 11, in( 0), 1); \
|
||||
sROUND1(D, A, B, C, F1, 14, in( 1), 1); \
|
||||
sROUND1(C, D, A, B, F1, 15, in( 2), 1); \
|
||||
sROUND1(B, C, D, A, F1, 12, in( 3), 1); \
|
||||
sROUND1(A, B, C, D, F1, 5, in( 4), 1); \
|
||||
sROUND1(D, A, B, C, F1, 8, in( 5), 1); \
|
||||
sROUND1(C, D, A, B, F1, 7, in( 6), 1); \
|
||||
sROUND1(B, C, D, A, F1, 9, in( 7), 1); \
|
||||
sROUND1(A, B, C, D, F1, 11, in( 8), 1); \
|
||||
sROUND1(D, A, B, C, F1, 13, in( 9), 1); \
|
||||
sROUND1(C, D, A, B, F1, 14, in(10), 1); \
|
||||
sROUND1(B, C, D, A, F1, 15, in(11), 1); \
|
||||
sROUND1(A, B, C, D, F1, 6, in(12), 1); \
|
||||
sROUND1(D, A, B, C, F1, 7, in(13), 1); \
|
||||
sROUND1(C, D, A, B, F1, 9, in(14), 1); \
|
||||
sROUND1(B, C, D, A, F1, 8, in(15), 1); \
|
||||
\
|
||||
sROUND1(A, B, C, D, F2, 7, in( 7), 2); \
|
||||
sROUND1(D, A, B, C, F2, 6, in( 4), 2); \
|
||||
sROUND1(C, D, A, B, F2, 8, in(13), 2); \
|
||||
sROUND1(B, C, D, A, F2, 13, in( 1), 2); \
|
||||
sROUND1(A, B, C, D, F2, 11, in(10), 2); \
|
||||
sROUND1(D, A, B, C, F2, 9, in( 6), 2); \
|
||||
sROUND1(C, D, A, B, F2, 7, in(15), 2); \
|
||||
sROUND1(B, C, D, A, F2, 15, in( 3), 2); \
|
||||
sROUND1(A, B, C, D, F2, 7, in(12), 2); \
|
||||
sROUND1(D, A, B, C, F2, 12, in( 0), 2); \
|
||||
sROUND1(C, D, A, B, F2, 15, in( 9), 2); \
|
||||
sROUND1(B, C, D, A, F2, 9, in( 5), 2); \
|
||||
sROUND1(A, B, C, D, F2, 11, in( 2), 2); \
|
||||
sROUND1(D, A, B, C, F2, 7, in(14), 2); \
|
||||
sROUND1(C, D, A, B, F2, 13, in(11), 2); \
|
||||
sROUND1(B, C, D, A, F2, 12, in( 8), 2); \
|
||||
\
|
||||
sROUND1(A, B, C, D, F3, 11, in( 3), 3); \
|
||||
sROUND1(D, A, B, C, F3, 13, in(10), 3); \
|
||||
sROUND1(C, D, A, B, F3, 6, in(14), 3); \
|
||||
sROUND1(B, C, D, A, F3, 7, in( 4), 3); \
|
||||
sROUND1(A, B, C, D, F3, 14, in( 9), 3); \
|
||||
sROUND1(D, A, B, C, F3, 9, in(15), 3); \
|
||||
sROUND1(C, D, A, B, F3, 13, in( 8), 3); \
|
||||
sROUND1(B, C, D, A, F3, 15, in( 1), 3); \
|
||||
sROUND1(A, B, C, D, F3, 14, in( 2), 3); \
|
||||
sROUND1(D, A, B, C, F3, 8, in( 7), 3); \
|
||||
sROUND1(C, D, A, B, F3, 13, in( 0), 3); \
|
||||
sROUND1(B, C, D, A, F3, 6, in( 6), 3); \
|
||||
sROUND1(A, B, C, D, F3, 5, in(13), 3); \
|
||||
sROUND1(D, A, B, C, F3, 12, in(11), 3); \
|
||||
sROUND1(C, D, A, B, F3, 7, in( 5), 3); \
|
||||
sROUND1(B, C, D, A, F3, 5, in(12), 3); \
|
||||
\
|
||||
sROUND1(A, B, C, D, F4, 11, in( 1), 4); \
|
||||
sROUND1(D, A, B, C, F4, 12, in( 9), 4); \
|
||||
sROUND1(C, D, A, B, F4, 14, in(11), 4); \
|
||||
sROUND1(B, C, D, A, F4, 15, in(10), 4); \
|
||||
sROUND1(A, B, C, D, F4, 14, in( 0), 4); \
|
||||
sROUND1(D, A, B, C, F4, 15, in( 8), 4); \
|
||||
sROUND1(C, D, A, B, F4, 9, in(12), 4); \
|
||||
sROUND1(B, C, D, A, F4, 8, in( 4), 4); \
|
||||
sROUND1(A, B, C, D, F4, 9, in(13), 4); \
|
||||
sROUND1(D, A, B, C, F4, 14, in( 3), 4); \
|
||||
sROUND1(C, D, A, B, F4, 5, in( 7), 4); \
|
||||
sROUND1(B, C, D, A, F4, 6, in(15), 4); \
|
||||
sROUND1(A, B, C, D, F4, 8, in(14), 4); \
|
||||
sROUND1(D, A, B, C, F4, 6, in( 5), 4); \
|
||||
sROUND1(C, D, A, B, F4, 5, in( 6), 4); \
|
||||
sROUND1(B, C, D, A, F4, 12, in( 2), 4); \
|
||||
\
|
||||
sROUND2(A, B, C, D, F4, 8, in( 5), 1); \
|
||||
sROUND2(D, A, B, C, F4, 9, in(14), 1); \
|
||||
sROUND2(C, D, A, B, F4, 9, in( 7), 1); \
|
||||
sROUND2(B, C, D, A, F4, 11, in( 0), 1); \
|
||||
sROUND2(A, B, C, D, F4, 13, in( 9), 1); \
|
||||
sROUND2(D, A, B, C, F4, 15, in( 2), 1); \
|
||||
sROUND2(C, D, A, B, F4, 15, in(11), 1); \
|
||||
sROUND2(B, C, D, A, F4, 5, in( 4), 1); \
|
||||
sROUND2(A, B, C, D, F4, 7, in(13), 1); \
|
||||
sROUND2(D, A, B, C, F4, 7, in( 6), 1); \
|
||||
sROUND2(C, D, A, B, F4, 8, in(15), 1); \
|
||||
sROUND2(B, C, D, A, F4, 11, in( 8), 1); \
|
||||
sROUND2(A, B, C, D, F4, 14, in( 1), 1); \
|
||||
sROUND2(D, A, B, C, F4, 14, in(10), 1); \
|
||||
sROUND2(C, D, A, B, F4, 12, in( 3), 1); \
|
||||
sROUND2(B, C, D, A, F4, 6, in(12), 1); \
|
||||
\
|
||||
sROUND2(A, B, C, D, F3, 9, in( 6), 2); \
|
||||
sROUND2(D, A, B, C, F3, 13, in(11), 2); \
|
||||
sROUND2(C, D, A, B, F3, 15, in( 3), 2); \
|
||||
sROUND2(B, C, D, A, F3, 7, in( 7), 2); \
|
||||
sROUND2(A, B, C, D, F3, 12, in( 0), 2); \
|
||||
sROUND2(D, A, B, C, F3, 8, in(13), 2); \
|
||||
sROUND2(C, D, A, B, F3, 9, in( 5), 2); \
|
||||
sROUND2(B, C, D, A, F3, 11, in(10), 2); \
|
||||
sROUND2(A, B, C, D, F3, 7, in(14), 2); \
|
||||
sROUND2(D, A, B, C, F3, 7, in(15), 2); \
|
||||
sROUND2(C, D, A, B, F3, 12, in( 8), 2); \
|
||||
sROUND2(B, C, D, A, F3, 7, in(12), 2); \
|
||||
sROUND2(A, B, C, D, F3, 6, in( 4), 2); \
|
||||
sROUND2(D, A, B, C, F3, 15, in( 9), 2); \
|
||||
sROUND2(C, D, A, B, F3, 13, in( 1), 2); \
|
||||
sROUND2(B, C, D, A, F3, 11, in( 2), 2); \
|
||||
\
|
||||
sROUND2(A, B, C, D, F2, 9, in(15), 3); \
|
||||
sROUND2(D, A, B, C, F2, 7, in( 5), 3); \
|
||||
sROUND2(C, D, A, B, F2, 15, in( 1), 3); \
|
||||
sROUND2(B, C, D, A, F2, 11, in( 3), 3); \
|
||||
sROUND2(A, B, C, D, F2, 8, in( 7), 3); \
|
||||
sROUND2(D, A, B, C, F2, 6, in(14), 3); \
|
||||
sROUND2(C, D, A, B, F2, 6, in( 6), 3); \
|
||||
sROUND2(B, C, D, A, F2, 14, in( 9), 3); \
|
||||
sROUND2(A, B, C, D, F2, 12, in(11), 3); \
|
||||
sROUND2(D, A, B, C, F2, 13, in( 8), 3); \
|
||||
sROUND2(C, D, A, B, F2, 5, in(12), 3); \
|
||||
sROUND2(B, C, D, A, F2, 14, in( 2), 3); \
|
||||
sROUND2(A, B, C, D, F2, 13, in(10), 3); \
|
||||
sROUND2(D, A, B, C, F2, 13, in( 0), 3); \
|
||||
sROUND2(C, D, A, B, F2, 7, in( 4), 3); \
|
||||
sROUND2(B, C, D, A, F2, 5, in(13), 3); \
|
||||
\
|
||||
sROUND2(A, B, C, D, F1, 15, in( 8), 4); \
|
||||
sROUND2(D, A, B, C, F1, 5, in( 6), 4); \
|
||||
sROUND2(C, D, A, B, F1, 8, in( 4), 4); \
|
||||
sROUND2(B, C, D, A, F1, 11, in( 1), 4); \
|
||||
sROUND2(A, B, C, D, F1, 14, in( 3), 4); \
|
||||
sROUND2(D, A, B, C, F1, 14, in(11), 4); \
|
||||
sROUND2(C, D, A, B, F1, 6, in(15), 4); \
|
||||
sROUND2(B, C, D, A, F1, 14, in( 0), 4); \
|
||||
sROUND2(A, B, C, D, F1, 6, in( 5), 4); \
|
||||
sROUND2(D, A, B, C, F1, 9, in(12), 4); \
|
||||
sROUND2(C, D, A, B, F1, 12, in( 2), 4); \
|
||||
sROUND2(B, C, D, A, F1, 9, in(13), 4); \
|
||||
sROUND2(A, B, C, D, F1, 12, in( 9), 4); \
|
||||
sROUND2(D, A, B, C, F1, 5, in( 7), 4); \
|
||||
sROUND2(C, D, A, B, F1, 15, in(10), 4); \
|
||||
sROUND2(B, C, D, A, F1, 8, in(14), 4); \
|
||||
\
|
||||
tmp = SPH_T32((h)[1] + C1 + D2); \
|
||||
(h)[1] = SPH_T32((h)[2] + D1 + A2); \
|
||||
(h)[2] = SPH_T32((h)[3] + A1 + B2); \
|
||||
(h)[3] = SPH_T32((h)[0] + B1 + C2); \
|
||||
(h)[0] = tmp; \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
* One round of RIPEMD-128. The data must be aligned for 32-bit access.
|
||||
*/
|
||||
static void
|
||||
ripemd128_round(const unsigned char *data, sph_u32 r[5])
|
||||
{
|
||||
#if SPH_LITTLE_FAST
|
||||
|
||||
#define RIPEMD128_IN(x) sph_dec32le_aligned(data + (4 * (x)))
|
||||
|
||||
#else
|
||||
|
||||
sph_u32 X_var[16];
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 16; i ++)
|
||||
X_var[i] = sph_dec32le_aligned(data + 4 * i);
|
||||
#define RIPEMD128_IN(x) X_var[x]
|
||||
|
||||
#endif
|
||||
RIPEMD128_ROUND_BODY(RIPEMD128_IN, r);
|
||||
#undef RIPEMD128_IN
|
||||
}
|
||||
|
||||
/* see sph_ripemd.h */
|
||||
void
|
||||
sph_ripemd128_init(void *cc)
|
||||
{
|
||||
sph_ripemd128_context *sc;
|
||||
|
||||
sc = cc;
|
||||
memcpy(sc->val, IV, sizeof sc->val);
|
||||
#if SPH_64
|
||||
sc->count = 0;
|
||||
#else
|
||||
sc->count_high = sc->count_low = 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
#define RFUN ripemd128_round
|
||||
#define HASH ripemd128
|
||||
#define LE32 1
|
||||
#include "md_helper.c"
|
||||
#undef RFUN
|
||||
#undef HASH
|
||||
#undef LE32
|
||||
|
||||
/* see sph_ripemd.h */
|
||||
void
sph_ripemd128_close(void *cc, void *dst)
{
	/* pad, emit the 4-word digest, then reset for reuse */
	ripemd128_close(cc, dst, 4);
	sph_ripemd128_init(cc);
}
|
||||
|
||||
/* see sph_ripemd.h */
|
||||
void
|
||||
sph_ripemd128_comp(const sph_u32 msg[16], sph_u32 val[4])
|
||||
{
|
||||
#define RIPEMD128_IN(x) msg[x]
|
||||
RIPEMD128_ROUND_BODY(RIPEMD128_IN, val);
|
||||
#undef RIPEMD128_IN
|
||||
}
|
||||
|
||||
/* ===================================================================== */
|
||||
/*
|
||||
* RIPEMD-160.
|
||||
*/
|
||||
|
||||
/*
|
||||
* Round constants for RIPEMD-160.
|
||||
*/
|
||||
#define K11 SPH_C32(0x00000000)
|
||||
#define K12 SPH_C32(0x5A827999)
|
||||
#define K13 SPH_C32(0x6ED9EBA1)
|
||||
#define K14 SPH_C32(0x8F1BBCDC)
|
||||
#define K15 SPH_C32(0xA953FD4E)
|
||||
|
||||
#define K21 SPH_C32(0x50A28BE6)
|
||||
#define K22 SPH_C32(0x5C4DD124)
|
||||
#define K23 SPH_C32(0x6D703EF3)
|
||||
#define K24 SPH_C32(0x7A6D76E9)
|
||||
#define K25 SPH_C32(0x00000000)
|
||||
|
||||
#define RR(a, b, c, d, e, f, s, r, k) do { \
|
||||
a = SPH_T32(ROTL(SPH_T32(a + f(b, c, d) + r + k), s) + e); \
|
||||
c = ROTL(c, 10); \
|
||||
} while (0)
|
||||
|
||||
#define ROUND1(a, b, c, d, e, f, s, r, k) \
|
||||
RR(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k)
|
||||
|
||||
#define ROUND2(a, b, c, d, e, f, s, r, k) \
|
||||
RR(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
|
||||
|
||||
/*
|
||||
* This macro defines the body for a RIPEMD-160 compression function
|
||||
* implementation. The "in" parameter should evaluate, when applied to a
|
||||
* numerical input parameter from 0 to 15, to an expression which yields
|
||||
* the corresponding input block. The "h" parameter should evaluate to
|
||||
* an array or pointer expression designating the array of 5 words which
|
||||
* contains the input and output of the compression function.
|
||||
*/
|
||||
|
||||
#define RIPEMD160_ROUND_BODY(in, h) do { \
|
||||
sph_u32 A1, B1, C1, D1, E1; \
|
||||
sph_u32 A2, B2, C2, D2, E2; \
|
||||
sph_u32 tmp; \
|
||||
\
|
||||
A1 = A2 = (h)[0]; \
|
||||
B1 = B2 = (h)[1]; \
|
||||
C1 = C2 = (h)[2]; \
|
||||
D1 = D2 = (h)[3]; \
|
||||
E1 = E2 = (h)[4]; \
|
||||
\
|
||||
ROUND1(A, B, C, D, E, F1, 11, in( 0), 1); \
|
||||
ROUND1(E, A, B, C, D, F1, 14, in( 1), 1); \
|
||||
ROUND1(D, E, A, B, C, F1, 15, in( 2), 1); \
|
||||
ROUND1(C, D, E, A, B, F1, 12, in( 3), 1); \
|
||||
ROUND1(B, C, D, E, A, F1, 5, in( 4), 1); \
|
||||
ROUND1(A, B, C, D, E, F1, 8, in( 5), 1); \
|
||||
ROUND1(E, A, B, C, D, F1, 7, in( 6), 1); \
|
||||
ROUND1(D, E, A, B, C, F1, 9, in( 7), 1); \
|
||||
ROUND1(C, D, E, A, B, F1, 11, in( 8), 1); \
|
||||
ROUND1(B, C, D, E, A, F1, 13, in( 9), 1); \
|
||||
ROUND1(A, B, C, D, E, F1, 14, in(10), 1); \
|
||||
ROUND1(E, A, B, C, D, F1, 15, in(11), 1); \
|
||||
ROUND1(D, E, A, B, C, F1, 6, in(12), 1); \
|
||||
ROUND1(C, D, E, A, B, F1, 7, in(13), 1); \
|
||||
ROUND1(B, C, D, E, A, F1, 9, in(14), 1); \
|
||||
ROUND1(A, B, C, D, E, F1, 8, in(15), 1); \
|
||||
\
|
||||
ROUND1(E, A, B, C, D, F2, 7, in( 7), 2); \
|
||||
ROUND1(D, E, A, B, C, F2, 6, in( 4), 2); \
|
||||
ROUND1(C, D, E, A, B, F2, 8, in(13), 2); \
|
||||
ROUND1(B, C, D, E, A, F2, 13, in( 1), 2); \
|
||||
ROUND1(A, B, C, D, E, F2, 11, in(10), 2); \
|
||||
ROUND1(E, A, B, C, D, F2, 9, in( 6), 2); \
|
||||
ROUND1(D, E, A, B, C, F2, 7, in(15), 2); \
|
||||
ROUND1(C, D, E, A, B, F2, 15, in( 3), 2); \
|
||||
ROUND1(B, C, D, E, A, F2, 7, in(12), 2); \
|
||||
ROUND1(A, B, C, D, E, F2, 12, in( 0), 2); \
|
||||
ROUND1(E, A, B, C, D, F2, 15, in( 9), 2); \
|
||||
ROUND1(D, E, A, B, C, F2, 9, in( 5), 2); \
|
||||
ROUND1(C, D, E, A, B, F2, 11, in( 2), 2); \
|
||||
ROUND1(B, C, D, E, A, F2, 7, in(14), 2); \
|
||||
ROUND1(A, B, C, D, E, F2, 13, in(11), 2); \
|
||||
ROUND1(E, A, B, C, D, F2, 12, in( 8), 2); \
|
||||
\
|
||||
ROUND1(D, E, A, B, C, F3, 11, in( 3), 3); \
|
||||
ROUND1(C, D, E, A, B, F3, 13, in(10), 3); \
|
||||
ROUND1(B, C, D, E, A, F3, 6, in(14), 3); \
|
||||
ROUND1(A, B, C, D, E, F3, 7, in( 4), 3); \
|
||||
ROUND1(E, A, B, C, D, F3, 14, in( 9), 3); \
|
||||
ROUND1(D, E, A, B, C, F3, 9, in(15), 3); \
|
||||
ROUND1(C, D, E, A, B, F3, 13, in( 8), 3); \
|
||||
ROUND1(B, C, D, E, A, F3, 15, in( 1), 3); \
|
||||
ROUND1(A, B, C, D, E, F3, 14, in( 2), 3); \
|
||||
ROUND1(E, A, B, C, D, F3, 8, in( 7), 3); \
|
||||
ROUND1(D, E, A, B, C, F3, 13, in( 0), 3); \
|
||||
ROUND1(C, D, E, A, B, F3, 6, in( 6), 3); \
|
||||
ROUND1(B, C, D, E, A, F3, 5, in(13), 3); \
|
||||
ROUND1(A, B, C, D, E, F3, 12, in(11), 3); \
|
||||
ROUND1(E, A, B, C, D, F3, 7, in( 5), 3); \
|
||||
ROUND1(D, E, A, B, C, F3, 5, in(12), 3); \
|
||||
\
|
||||
ROUND1(C, D, E, A, B, F4, 11, in( 1), 4); \
|
||||
ROUND1(B, C, D, E, A, F4, 12, in( 9), 4); \
|
||||
ROUND1(A, B, C, D, E, F4, 14, in(11), 4); \
|
||||
ROUND1(E, A, B, C, D, F4, 15, in(10), 4); \
|
||||
ROUND1(D, E, A, B, C, F4, 14, in( 0), 4); \
|
||||
ROUND1(C, D, E, A, B, F4, 15, in( 8), 4); \
|
||||
ROUND1(B, C, D, E, A, F4, 9, in(12), 4); \
|
||||
ROUND1(A, B, C, D, E, F4, 8, in( 4), 4); \
|
||||
ROUND1(E, A, B, C, D, F4, 9, in(13), 4); \
|
||||
ROUND1(D, E, A, B, C, F4, 14, in( 3), 4); \
|
||||
ROUND1(C, D, E, A, B, F4, 5, in( 7), 4); \
|
||||
ROUND1(B, C, D, E, A, F4, 6, in(15), 4); \
|
||||
ROUND1(A, B, C, D, E, F4, 8, in(14), 4); \
|
||||
ROUND1(E, A, B, C, D, F4, 6, in( 5), 4); \
|
||||
ROUND1(D, E, A, B, C, F4, 5, in( 6), 4); \
|
||||
ROUND1(C, D, E, A, B, F4, 12, in( 2), 4); \
|
||||
\
|
||||
ROUND1(B, C, D, E, A, F5, 9, in( 4), 5); \
|
||||
ROUND1(A, B, C, D, E, F5, 15, in( 0), 5); \
|
||||
ROUND1(E, A, B, C, D, F5, 5, in( 5), 5); \
|
||||
ROUND1(D, E, A, B, C, F5, 11, in( 9), 5); \
|
||||
ROUND1(C, D, E, A, B, F5, 6, in( 7), 5); \
|
||||
ROUND1(B, C, D, E, A, F5, 8, in(12), 5); \
|
||||
ROUND1(A, B, C, D, E, F5, 13, in( 2), 5); \
|
||||
ROUND1(E, A, B, C, D, F5, 12, in(10), 5); \
|
||||
ROUND1(D, E, A, B, C, F5, 5, in(14), 5); \
|
||||
ROUND1(C, D, E, A, B, F5, 12, in( 1), 5); \
|
||||
ROUND1(B, C, D, E, A, F5, 13, in( 3), 5); \
|
||||
ROUND1(A, B, C, D, E, F5, 14, in( 8), 5); \
|
||||
ROUND1(E, A, B, C, D, F5, 11, in(11), 5); \
|
||||
ROUND1(D, E, A, B, C, F5, 8, in( 6), 5); \
|
||||
ROUND1(C, D, E, A, B, F5, 5, in(15), 5); \
|
||||
ROUND1(B, C, D, E, A, F5, 6, in(13), 5); \
|
||||
\
|
||||
ROUND2(A, B, C, D, E, F5, 8, in( 5), 1); \
|
||||
ROUND2(E, A, B, C, D, F5, 9, in(14), 1); \
|
||||
ROUND2(D, E, A, B, C, F5, 9, in( 7), 1); \
|
||||
ROUND2(C, D, E, A, B, F5, 11, in( 0), 1); \
|
||||
ROUND2(B, C, D, E, A, F5, 13, in( 9), 1); \
|
||||
ROUND2(A, B, C, D, E, F5, 15, in( 2), 1); \
|
||||
ROUND2(E, A, B, C, D, F5, 15, in(11), 1); \
|
||||
ROUND2(D, E, A, B, C, F5, 5, in( 4), 1); \
|
||||
ROUND2(C, D, E, A, B, F5, 7, in(13), 1); \
|
||||
ROUND2(B, C, D, E, A, F5, 7, in( 6), 1); \
|
||||
ROUND2(A, B, C, D, E, F5, 8, in(15), 1); \
|
||||
ROUND2(E, A, B, C, D, F5, 11, in( 8), 1); \
|
||||
ROUND2(D, E, A, B, C, F5, 14, in( 1), 1); \
|
||||
ROUND2(C, D, E, A, B, F5, 14, in(10), 1); \
|
||||
ROUND2(B, C, D, E, A, F5, 12, in( 3), 1); \
|
||||
ROUND2(A, B, C, D, E, F5, 6, in(12), 1); \
|
||||
\
|
||||
ROUND2(E, A, B, C, D, F4, 9, in( 6), 2); \
|
||||
ROUND2(D, E, A, B, C, F4, 13, in(11), 2); \
|
||||
ROUND2(C, D, E, A, B, F4, 15, in( 3), 2); \
|
||||
ROUND2(B, C, D, E, A, F4, 7, in( 7), 2); \
|
||||
ROUND2(A, B, C, D, E, F4, 12, in( 0), 2); \
|
||||
ROUND2(E, A, B, C, D, F4, 8, in(13), 2); \
|
||||
ROUND2(D, E, A, B, C, F4, 9, in( 5), 2); \
|
||||
ROUND2(C, D, E, A, B, F4, 11, in(10), 2); \
|
||||
ROUND2(B, C, D, E, A, F4, 7, in(14), 2); \
|
||||
ROUND2(A, B, C, D, E, F4, 7, in(15), 2); \
|
||||
ROUND2(E, A, B, C, D, F4, 12, in( 8), 2); \
|
||||
ROUND2(D, E, A, B, C, F4, 7, in(12), 2); \
|
||||
ROUND2(C, D, E, A, B, F4, 6, in( 4), 2); \
|
||||
ROUND2(B, C, D, E, A, F4, 15, in( 9), 2); \
|
||||
ROUND2(A, B, C, D, E, F4, 13, in( 1), 2); \
|
||||
ROUND2(E, A, B, C, D, F4, 11, in( 2), 2); \
|
||||
\
|
||||
ROUND2(D, E, A, B, C, F3, 9, in(15), 3); \
|
||||
ROUND2(C, D, E, A, B, F3, 7, in( 5), 3); \
|
||||
ROUND2(B, C, D, E, A, F3, 15, in( 1), 3); \
|
||||
ROUND2(A, B, C, D, E, F3, 11, in( 3), 3); \
|
||||
ROUND2(E, A, B, C, D, F3, 8, in( 7), 3); \
|
||||
ROUND2(D, E, A, B, C, F3, 6, in(14), 3); \
|
||||
ROUND2(C, D, E, A, B, F3, 6, in( 6), 3); \
|
||||
ROUND2(B, C, D, E, A, F3, 14, in( 9), 3); \
|
||||
ROUND2(A, B, C, D, E, F3, 12, in(11), 3); \
|
||||
ROUND2(E, A, B, C, D, F3, 13, in( 8), 3); \
|
||||
ROUND2(D, E, A, B, C, F3, 5, in(12), 3); \
|
||||
ROUND2(C, D, E, A, B, F3, 14, in( 2), 3); \
|
||||
ROUND2(B, C, D, E, A, F3, 13, in(10), 3); \
|
||||
ROUND2(A, B, C, D, E, F3, 13, in( 0), 3); \
|
||||
ROUND2(E, A, B, C, D, F3, 7, in( 4), 3); \
|
||||
ROUND2(D, E, A, B, C, F3, 5, in(13), 3); \
|
||||
\
|
||||
ROUND2(C, D, E, A, B, F2, 15, in( 8), 4); \
|
||||
ROUND2(B, C, D, E, A, F2, 5, in( 6), 4); \
|
||||
ROUND2(A, B, C, D, E, F2, 8, in( 4), 4); \
|
||||
ROUND2(E, A, B, C, D, F2, 11, in( 1), 4); \
|
||||
ROUND2(D, E, A, B, C, F2, 14, in( 3), 4); \
|
||||
ROUND2(C, D, E, A, B, F2, 14, in(11), 4); \
|
||||
ROUND2(B, C, D, E, A, F2, 6, in(15), 4); \
|
||||
ROUND2(A, B, C, D, E, F2, 14, in( 0), 4); \
|
||||
ROUND2(E, A, B, C, D, F2, 6, in( 5), 4); \
|
||||
ROUND2(D, E, A, B, C, F2, 9, in(12), 4); \
|
||||
ROUND2(C, D, E, A, B, F2, 12, in( 2), 4); \
|
||||
ROUND2(B, C, D, E, A, F2, 9, in(13), 4); \
|
||||
ROUND2(A, B, C, D, E, F2, 12, in( 9), 4); \
|
||||
ROUND2(E, A, B, C, D, F2, 5, in( 7), 4); \
|
||||
ROUND2(D, E, A, B, C, F2, 15, in(10), 4); \
|
||||
ROUND2(C, D, E, A, B, F2, 8, in(14), 4); \
|
||||
\
|
||||
ROUND2(B, C, D, E, A, F1, 8, in(12), 5); \
|
||||
ROUND2(A, B, C, D, E, F1, 5, in(15), 5); \
|
||||
ROUND2(E, A, B, C, D, F1, 12, in(10), 5); \
|
||||
ROUND2(D, E, A, B, C, F1, 9, in( 4), 5); \
|
||||
ROUND2(C, D, E, A, B, F1, 12, in( 1), 5); \
|
||||
ROUND2(B, C, D, E, A, F1, 5, in( 5), 5); \
|
||||
ROUND2(A, B, C, D, E, F1, 14, in( 8), 5); \
|
||||
ROUND2(E, A, B, C, D, F1, 6, in( 7), 5); \
|
||||
ROUND2(D, E, A, B, C, F1, 8, in( 6), 5); \
|
||||
ROUND2(C, D, E, A, B, F1, 13, in( 2), 5); \
|
||||
ROUND2(B, C, D, E, A, F1, 6, in(13), 5); \
|
||||
ROUND2(A, B, C, D, E, F1, 5, in(14), 5); \
|
||||
ROUND2(E, A, B, C, D, F1, 15, in( 0), 5); \
|
||||
ROUND2(D, E, A, B, C, F1, 13, in( 3), 5); \
|
||||
ROUND2(C, D, E, A, B, F1, 11, in( 9), 5); \
|
||||
ROUND2(B, C, D, E, A, F1, 11, in(11), 5); \
|
||||
\
|
||||
tmp = SPH_T32((h)[1] + C1 + D2); \
|
||||
(h)[1] = SPH_T32((h)[2] + D1 + E2); \
|
||||
(h)[2] = SPH_T32((h)[3] + E1 + A2); \
|
||||
(h)[3] = SPH_T32((h)[4] + A1 + B2); \
|
||||
(h)[4] = SPH_T32((h)[0] + B1 + C2); \
|
||||
(h)[0] = tmp; \
|
||||
} while (0)
|
||||
|
||||
/*
|
||||
* One round of RIPEMD-160. The data must be aligned for 32-bit access.
|
||||
*/
|
||||
static void
|
||||
ripemd160_round(const unsigned char *data, sph_u32 r[5])
|
||||
{
|
||||
#if SPH_LITTLE_FAST
|
||||
|
||||
#define RIPEMD160_IN(x) sph_dec32le_aligned(data + (4 * (x)))
|
||||
|
||||
#else
|
||||
|
||||
sph_u32 X_var[16];
|
||||
int i;
|
||||
|
||||
for (i = 0; i < 16; i ++)
|
||||
X_var[i] = sph_dec32le_aligned(data + 4 * i);
|
||||
#define RIPEMD160_IN(x) X_var[x]
|
||||
|
||||
#endif
|
||||
RIPEMD160_ROUND_BODY(RIPEMD160_IN, r);
|
||||
#undef RIPEMD160_IN
|
||||
}
|
||||
|
||||
/* see sph_ripemd.h */
|
||||
void
|
||||
sph_ripemd160_init(void *cc)
|
||||
{
|
||||
sph_ripemd160_context *sc;
|
||||
|
||||
sc = cc;
|
||||
memcpy(sc->val, IV, sizeof sc->val);
|
||||
#if SPH_64
|
||||
sc->count = 0;
|
||||
#else
|
||||
sc->count_high = sc->count_low = 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
#define RFUN ripemd160_round
|
||||
#define HASH ripemd160
|
||||
#define LE32 1
|
||||
#include "md_helper.c"
|
||||
#undef RFUN
|
||||
#undef HASH
|
||||
#undef LE32
|
||||
|
||||
/* see sph_ripemd.h */
|
||||
void
|
||||
sph_ripemd160_close(void *cc, void *dst)
|
||||
{
|
||||
ripemd160_close(cc, dst, 5);
|
||||
sph_ripemd160_init(cc);
|
||||
}
|
||||
|
||||
/* see sph_ripemd.h */
|
||||
void
|
||||
sph_ripemd160_comp(const sph_u32 msg[16], sph_u32 val[5])
|
||||
{
|
||||
#define RIPEMD160_IN(x) msg[x]
|
||||
RIPEMD160_ROUND_BODY(RIPEMD160_IN, val);
|
||||
#undef RIPEMD160_IN
|
||||
}
|
273
sph/sph_ripemd.h
Normal file
273
sph/sph_ripemd.h
Normal file
@ -0,0 +1,273 @@
|
||||
/* $Id: sph_ripemd.h 216 2010-06-08 09:46:57Z tp $ */
|
||||
/**
|
||||
* RIPEMD, RIPEMD-128 and RIPEMD-160 interface.
|
||||
*
|
||||
* RIPEMD was first described in: Research and Development in Advanced
|
||||
* Communication Technologies in Europe, "RIPE Integrity Primitives:
|
||||
* Final Report of RACE Integrity Primitives Evaluation (R1040)", RACE,
|
||||
* June 1992.
|
||||
*
|
||||
* A new, strengthened version, dubbed RIPEMD-160, was published in: H.
|
||||
* Dobbertin, A. Bosselaers, and B. Preneel, "RIPEMD-160, a strengthened
|
||||
* version of RIPEMD", Fast Software Encryption - FSE'96, LNCS 1039,
|
||||
* Springer (1996), pp. 71--82.
|
||||
*
|
||||
* This article describes both RIPEMD-160, with a 160-bit output, and a
|
||||
* reduced version called RIPEMD-128, which has a 128-bit output. RIPEMD-128
|
||||
* was meant as a "drop-in" replacement for any hash function with 128-bit
|
||||
* output, especially the original RIPEMD.
|
||||
*
|
||||
* @warning Collisions, and an efficient method to build other collisions,
|
||||
* have been published for the original RIPEMD, which is thus considered as
|
||||
* cryptographically broken. It is also very rarely encountered, and there
|
||||
* seems to exist no free description or implementation of RIPEMD (except
|
||||
* the sphlib code, of course). As of january 2007, RIPEMD-128 and RIPEMD-160
|
||||
* seem as secure as their output length allows.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @file sph_ripemd.h
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#ifndef SPH_RIPEMD_H__
|
||||
#define SPH_RIPEMD_H__
|
||||
|
||||
#include <stddef.h>
|
||||
#include "sph_types.h"
|
||||
|
||||
/**
|
||||
* Output size (in bits) for RIPEMD.
|
||||
*/
|
||||
#define SPH_SIZE_ripemd 128
|
||||
|
||||
/**
|
||||
* Output size (in bits) for RIPEMD-128.
|
||||
*/
|
||||
#define SPH_SIZE_ripemd128 128
|
||||
|
||||
/**
|
||||
* Output size (in bits) for RIPEMD-160.
|
||||
*/
|
||||
#define SPH_SIZE_ripemd160 160
|
||||
|
||||
/**
|
||||
* This structure is a context for RIPEMD computations: it contains the
|
||||
* intermediate values and some data from the last entered block. Once
|
||||
* a RIPEMD computation has been performed, the context can be reused for
|
||||
* another computation.
|
||||
*
|
||||
* The contents of this structure are private. A running RIPEMD computation
|
||||
* can be cloned by copying the context (e.g. with a simple
|
||||
* <code>memcpy()</code>).
|
||||
*/
|
||||
typedef struct {
|
||||
#ifndef DOXYGEN_IGNORE
|
||||
unsigned char buf[64]; /* first field, for alignment */
|
||||
sph_u32 val[4];
|
||||
#if SPH_64
|
||||
sph_u64 count;
|
||||
#else
|
||||
sph_u32 count_high, count_low;
|
||||
#endif
|
||||
#endif
|
||||
} sph_ripemd_context;
|
||||
|
||||
/**
|
||||
* Initialize a RIPEMD context. This process performs no memory allocation.
|
||||
*
|
||||
* @param cc the RIPEMD context (pointer to
|
||||
* a <code>sph_ripemd_context</code>)
|
||||
*/
|
||||
void sph_ripemd_init(void *cc);
|
||||
|
||||
/**
|
||||
* Process some data bytes. It is acceptable that <code>len</code> is zero
|
||||
* (in which case this function does nothing).
|
||||
*
|
||||
* @param cc the RIPEMD context
|
||||
* @param data the input data
|
||||
* @param len the input data length (in bytes)
|
||||
*/
|
||||
void sph_ripemd(void *cc, const void *data, size_t len);
|
||||
|
||||
/**
|
||||
* Terminate the current RIPEMD computation and output the result into the
|
||||
* provided buffer. The destination buffer must be wide enough to
|
||||
* accomodate the result (16 bytes). The context is automatically
|
||||
* reinitialized.
|
||||
*
|
||||
* @param cc the RIPEMD context
|
||||
* @param dst the destination buffer
|
||||
*/
|
||||
void sph_ripemd_close(void *cc, void *dst);
|
||||
|
||||
/**
|
||||
* Apply the RIPEMD compression function on the provided data. The
|
||||
* <code>msg</code> parameter contains the 16 32-bit input blocks,
|
||||
* as numerical values (hence after the little-endian decoding). The
|
||||
* <code>val</code> parameter contains the 5 32-bit input blocks for
|
||||
* the compression function; the output is written in place in this
|
||||
* array.
|
||||
*
|
||||
* @param msg the message block (16 values)
|
||||
* @param val the function 128-bit input and output
|
||||
*/
|
||||
void sph_ripemd_comp(const sph_u32 msg[16], sph_u32 val[4]);
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/**
|
||||
* This structure is a context for RIPEMD-128 computations: it contains the
|
||||
* intermediate values and some data from the last entered block. Once
|
||||
* a RIPEMD-128 computation has been performed, the context can be reused for
|
||||
* another computation.
|
||||
*
|
||||
* The contents of this structure are private. A running RIPEMD-128 computation
|
||||
* can be cloned by copying the context (e.g. with a simple
|
||||
* <code>memcpy()</code>).
|
||||
*/
|
||||
typedef struct {
|
||||
#ifndef DOXYGEN_IGNORE
|
||||
unsigned char buf[64]; /* first field, for alignment */
|
||||
sph_u32 val[4];
|
||||
#if SPH_64
|
||||
sph_u64 count;
|
||||
#else
|
||||
sph_u32 count_high, count_low;
|
||||
#endif
|
||||
#endif
|
||||
} sph_ripemd128_context;
|
||||
|
||||
/**
|
||||
* Initialize a RIPEMD-128 context. This process performs no memory allocation.
|
||||
*
|
||||
* @param cc the RIPEMD-128 context (pointer to
|
||||
* a <code>sph_ripemd128_context</code>)
|
||||
*/
|
||||
void sph_ripemd128_init(void *cc);
|
||||
|
||||
/**
|
||||
* Process some data bytes. It is acceptable that <code>len</code> is zero
|
||||
* (in which case this function does nothing).
|
||||
*
|
||||
* @param cc the RIPEMD-128 context
|
||||
* @param data the input data
|
||||
* @param len the input data length (in bytes)
|
||||
*/
|
||||
void sph_ripemd128(void *cc, const void *data, size_t len);
|
||||
|
||||
/**
|
||||
* Terminate the current RIPEMD-128 computation and output the result into the
|
||||
* provided buffer. The destination buffer must be wide enough to
|
||||
* accomodate the result (16 bytes). The context is automatically
|
||||
* reinitialized.
|
||||
*
|
||||
* @param cc the RIPEMD-128 context
|
||||
* @param dst the destination buffer
|
||||
*/
|
||||
void sph_ripemd128_close(void *cc, void *dst);
|
||||
|
||||
/**
|
||||
* Apply the RIPEMD-128 compression function on the provided data. The
|
||||
* <code>msg</code> parameter contains the 16 32-bit input blocks,
|
||||
* as numerical values (hence after the little-endian decoding). The
|
||||
* <code>val</code> parameter contains the 5 32-bit input blocks for
|
||||
* the compression function; the output is written in place in this
|
||||
* array.
|
||||
*
|
||||
* @param msg the message block (16 values)
|
||||
* @param val the function 128-bit input and output
|
||||
*/
|
||||
void sph_ripemd128_comp(const sph_u32 msg[16], sph_u32 val[4]);
|
||||
|
||||
/* ===================================================================== */
|
||||
|
||||
/**
|
||||
* This structure is a context for RIPEMD-160 computations: it contains the
|
||||
* intermediate values and some data from the last entered block. Once
|
||||
* a RIPEMD-160 computation has been performed, the context can be reused for
|
||||
* another computation.
|
||||
*
|
||||
* The contents of this structure are private. A running RIPEMD-160 computation
|
||||
* can be cloned by copying the context (e.g. with a simple
|
||||
* <code>memcpy()</code>).
|
||||
*/
|
||||
typedef struct {
|
||||
#ifndef DOXYGEN_IGNORE
|
||||
unsigned char buf[64]; /* first field, for alignment */
|
||||
sph_u32 val[5];
|
||||
#if SPH_64
|
||||
sph_u64 count;
|
||||
#else
|
||||
sph_u32 count_high, count_low;
|
||||
#endif
|
||||
#endif
|
||||
} sph_ripemd160_context;
|
||||
|
||||
/**
|
||||
* Initialize a RIPEMD-160 context. This process performs no memory allocation.
|
||||
*
|
||||
* @param cc the RIPEMD-160 context (pointer to
|
||||
* a <code>sph_ripemd160_context</code>)
|
||||
*/
|
||||
void sph_ripemd160_init(void *cc);
|
||||
|
||||
/**
|
||||
* Process some data bytes. It is acceptable that <code>len</code> is zero
|
||||
* (in which case this function does nothing).
|
||||
*
|
||||
* @param cc the RIPEMD-160 context
|
||||
* @param data the input data
|
||||
* @param len the input data length (in bytes)
|
||||
*/
|
||||
void sph_ripemd160(void *cc, const void *data, size_t len);
|
||||
|
||||
/**
|
||||
* Terminate the current RIPEMD-160 computation and output the result into the
|
||||
* provided buffer. The destination buffer must be wide enough to
|
||||
* accomodate the result (20 bytes). The context is automatically
|
||||
* reinitialized.
|
||||
*
|
||||
* @param cc the RIPEMD-160 context
|
||||
* @param dst the destination buffer
|
||||
*/
|
||||
void sph_ripemd160_close(void *cc, void *dst);
|
||||
|
||||
/**
|
||||
* Apply the RIPEMD-160 compression function on the provided data. The
|
||||
* <code>msg</code> parameter contains the 16 32-bit input blocks,
|
||||
* as numerical values (hence after the little-endian decoding). The
|
||||
* <code>val</code> parameter contains the 5 32-bit input blocks for
|
||||
* the compression function; the output is written in place in this
|
||||
* array.
|
||||
*
|
||||
* @param msg the message block (16 values)
|
||||
* @param val the function 160-bit input and output
|
||||
*/
|
||||
void sph_ripemd160_comp(const sph_u32 msg[16], sph_u32 val[5]);
|
||||
|
||||
#endif
|
691
sph/sph_sha2.c
Normal file
691
sph/sph_sha2.c
Normal file
@ -0,0 +1,691 @@
|
||||
/* $Id: sha2.c 227 2010-06-16 17:28:38Z tp $ */
|
||||
/*
|
||||
* SHA-224 / SHA-256 implementation.
|
||||
*
|
||||
* ==========================(LICENSE BEGIN)============================
|
||||
*
|
||||
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
|
||||
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
||||
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
|
||||
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
* ===========================(LICENSE END)=============================
|
||||
*
|
||||
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
|
||||
*/
|
||||
|
||||
#include <stddef.h>
|
||||
#include <string.h>
|
||||
|
||||
#include "sph_sha2.h"
|
||||
|
||||
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SHA2
|
||||
#define SPH_SMALL_FOOTPRINT_SHA2 1
|
||||
#endif
|
||||
|
||||
#define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z))
|
||||
#define MAJ(X, Y, Z) (((Y) & (Z)) | (((Y) | (Z)) & (X)))
|
||||
|
||||
#define ROTR SPH_ROTR32
|
||||
|
||||
#define BSG2_0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
|
||||
#define BSG2_1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
|
||||
#define SSG2_0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SPH_T32((x) >> 3))
|
||||
#define SSG2_1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SPH_T32((x) >> 10))
|
||||
|
||||
static const sph_u32 H224[8] = {
|
||||
SPH_C32(0xC1059ED8), SPH_C32(0x367CD507), SPH_C32(0x3070DD17),
|
||||
SPH_C32(0xF70E5939), SPH_C32(0xFFC00B31), SPH_C32(0x68581511),
|
||||
SPH_C32(0x64F98FA7), SPH_C32(0xBEFA4FA4)
|
||||
};
|
||||
|
||||
static const sph_u32 H256[8] = {
|
||||
SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85), SPH_C32(0x3C6EF372),
|
||||
SPH_C32(0xA54FF53A), SPH_C32(0x510E527F), SPH_C32(0x9B05688C),
|
||||
SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19)
|
||||
};
|
||||
|
||||
/*
|
||||
* The SHA2_ROUND_BODY defines the body for a SHA-224 / SHA-256
|
||||
* compression function implementation. The "in" parameter should
|
||||
* evaluate, when applied to a numerical input parameter from 0 to 15,
|
||||
* to an expression which yields the corresponding input block. The "r"
|
||||
* parameter should evaluate to an array or pointer expression
|
||||
* designating the array of 8 words which contains the input and output
|
||||
* of the compression function.
|
||||
*/
|
||||
|
||||
#if SPH_SMALL_FOOTPRINT_SHA2
|
||||
|
||||
static const sph_u32 K[64] = {
|
||||
SPH_C32(0x428A2F98), SPH_C32(0x71374491),
|
||||
SPH_C32(0xB5C0FBCF), SPH_C32(0xE9B5DBA5),
|
||||
SPH_C32(0x3956C25B), SPH_C32(0x59F111F1),
|
||||
SPH_C32(0x923F82A4), SPH_C32(0xAB1C5ED5),
|
||||
SPH_C32(0xD807AA98), SPH_C32(0x12835B01),
|
||||
SPH_C32(0x243185BE), SPH_C32(0x550C7DC3),
|
||||
SPH_C32(0x72BE5D74), SPH_C32(0x80DEB1FE),
|
||||
SPH_C32(0x9BDC06A7), SPH_C32(0xC19BF174),
|
||||
SPH_C32(0xE49B69C1), SPH_C32(0xEFBE4786),
|
||||
SPH_C32(0x0FC19DC6), SPH_C32(0x240CA1CC),
|
||||
SPH_C32(0x2DE92C6F), SPH_C32(0x4A7484AA),
|
||||
SPH_C32(0x5CB0A9DC), SPH_C32(0x76F988DA),
|
||||
SPH_C32(0x983E5152), SPH_C32(0xA831C66D),
|
||||
SPH_C32(0xB00327C8), SPH_C32(0xBF597FC7),
|
||||
SPH_C32(0xC6E00BF3), SPH_C32(0xD5A79147),
|
||||
SPH_C32(0x06CA6351), SPH_C32(0x14292967),
|
||||
SPH_C32(0x27B70A85), SPH_C32(0x2E1B2138),
|
||||
SPH_C32(0x4D2C6DFC), SPH_C32(0x53380D13),
|
||||
SPH_C32(0x650A7354), SPH_C32(0x766A0ABB),
|
||||
SPH_C32(0x81C2C92E), SPH_C32(0x92722C85),
|
||||
SPH_C32(0xA2BFE8A1), SPH_C32(0xA81A664B),
|
||||
SPH_C32(0xC24B8B70), SPH_C32(0xC76C51A3),
|
||||
SPH_C32(0xD192E819), SPH_C32(0xD6990624),
|
||||
SPH_C32(0xF40E3585), SPH_C32(0x106AA070),
|
||||
SPH_C32(0x19A4C116), SPH_C32(0x1E376C08),
|
||||
SPH_C32(0x2748774C), SPH_C32(0x34B0BCB5),
|
||||
SPH_C32(0x391C0CB3), SPH_C32(0x4ED8AA4A),
|
||||
SPH_C32(0x5B9CCA4F), SPH_C32(0x682E6FF3),
|
||||
SPH_C32(0x748F82EE), SPH_C32(0x78A5636F),
|
||||
SPH_C32(0x84C87814), SPH_C32(0x8CC70208),
|
||||
SPH_C32(0x90BEFFFA), SPH_C32(0xA4506CEB),
|
||||
SPH_C32(0xBEF9A3F7), SPH_C32(0xC67178F2)
|
||||
};
|
||||
|
||||
#define SHA2_MEXP1(in, pc) do { \
|
||||
W[pc] = in(pc); \
|
||||
} while (0)
|
||||
|
||||
#define SHA2_MEXP2(in, pc) do { \
|
||||
W[(pc) & 0x0F] = SPH_T32(SSG2_1(W[((pc) - 2) & 0x0F]) \
|
||||
+ W[((pc) - 7) & 0x0F] \
|
||||
+ SSG2_0(W[((pc) - 15) & 0x0F]) + W[(pc) & 0x0F]); \
|
||||
} while (0)
|
||||
|
||||
#define SHA2_STEPn(n, a, b, c, d, e, f, g, h, in, pc) do { \
|
||||
sph_u32 t1, t2; \
|
||||
SHA2_MEXP ## n(in, pc); \
|
||||
t1 = SPH_T32(h + BSG2_1(e) + CH(e, f, g) \
|
||||
+ K[pcount + (pc)] + W[(pc) & 0x0F]); \
|
||||
t2 = SPH_T32(BSG2_0(a) + MAJ(a, b, c)); \
|
||||
d = SPH_T32(d + t1); \
|
||||
h = SPH_T32(t1 + t2); \
|
||||
} while (0)
|
||||
|
||||
#define SHA2_STEP1(a, b, c, d, e, f, g, h, in, pc) \
|
||||
SHA2_STEPn(1, a, b, c, d, e, f, g, h, in, pc)
|
||||
#define SHA2_STEP2(a, b, c, d, e, f, g, h, in, pc) \
|
||||
SHA2_STEPn(2, a, b, c, d, e, f, g, h, in, pc)
|
||||
|
||||
#define SHA2_ROUND_BODY(in, r) do { \
|
||||
sph_u32 A, B, C, D, E, F, G, H; \
|
||||
sph_u32 W[16]; \
|
||||
unsigned pcount; \
|
||||
\
|
||||
A = (r)[0]; \
|
||||
B = (r)[1]; \
|
||||
C = (r)[2]; \
|
||||
D = (r)[3]; \
|
||||
E = (r)[4]; \
|
||||
F = (r)[5]; \
|
||||
G = (r)[6]; \
|
||||
H = (r)[7]; \
|
||||
pcount = 0; \
|
||||
SHA2_STEP1(A, B, C, D, E, F, G, H, in, 0); \
|
||||
SHA2_STEP1(H, A, B, C, D, E, F, G, in, 1); \
|
||||
SHA2_STEP1(G, H, A, B, C, D, E, F, in, 2); \
|
||||
SHA2_STEP1(F, G, H, A, B, C, D, E, in, 3); \
|
||||
SHA2_STEP1(E, F, G, H, A, B, C, D, in, 4); \
|
||||
SHA2_STEP1(D, E, F, G, H, A, B, C, in, 5); \
|
||||
SHA2_STEP1(C, D, E, F, G, H, A, B, in, 6); \
|
||||
SHA2_STEP1(B, C, D, E, F, G, H, A, in, 7); \
|
||||
SHA2_STEP1(A, B, C, D, E, F, G, H, in, 8); \
|
||||
SHA2_STEP1(H, A, B, C, D, E, F, G, in, 9); \
|
||||
SHA2_STEP1(G, H, A, B, C, D, E, F, in, 10); \
|
||||
SHA2_STEP1(F, G, H, A, B, C, D, E, in, 11); \
|
||||
SHA2_STEP1(E, F, G, H, A, B, C, D, in, 12); \
|
||||
SHA2_STEP1(D, E, F, G, H, A, B, C, in, 13); \
|
||||
SHA2_STEP1(C, D, E, F, G, H, A, B, in, 14); \
|
||||
SHA2_STEP1(B, C, D, E, F, G, H, A, in, 15); \
|
||||
for (pcount = 16; pcount < 64; pcount += 16) { \
|
||||
SHA2_STEP2(A, B, C, D, E, F, G, H, in, 0); \
|
||||
SHA2_STEP2(H, A, B, C, D, E, F, G, in, 1); \
|
||||
SHA2_STEP2(G, H, A, B, C, D, E, F, in, 2); \
|
||||
SHA2_STEP2(F, G, H, A, B, C, D, E, in, 3); \
|
||||
SHA2_STEP2(E, F, G, H, A, B, C, D, in, 4); \
|
||||
SHA2_STEP2(D, E, F, G, H, A, B, C, in, 5); \
|
||||
SHA2_STEP2(C, D, E, F, G, H, A, B, in, 6); \
|
||||
SHA2_STEP2(B, C, D, E, F, G, H, A, in, 7); \
|
||||
SHA2_STEP2(A, B, C, D, E, F, G, H, in, 8); \
|
||||
SHA2_STEP2(H, A, B, C, D, E, F, G, in, 9); \
|
||||
SHA2_STEP2(G, H, A, B, C, D, E, F, in, 10); \
|
||||
SHA2_STEP2(F, G, H, A, B, C, D, E, in, 11); \
|
||||
SHA2_STEP2(E, F, G, H, A, B, C, D, in, 12); \
|
||||
SHA2_STEP2(D, E, F, G, H, A, B, C, in, 13); \
|
||||
SHA2_STEP2(C, D, E, F, G, H, A, B, in, 14); \
|
||||
SHA2_STEP2(B, C, D, E, F, G, H, A, in, 15); \
|
||||
} \
|
||||
(r)[0] = SPH_T32((r)[0] + A); \
|
||||
(r)[1] = SPH_T32((r)[1] + B); \
|
||||
(r)[2] = SPH_T32((r)[2] + C); \
|
||||
(r)[3] = SPH_T32((r)[3] + D); \
|
||||
(r)[4] = SPH_T32((r)[4] + E); \
|
||||
(r)[5] = SPH_T32((r)[5] + F); \
|
||||
(r)[6] = SPH_T32((r)[6] + G); \
|
||||
(r)[7] = SPH_T32((r)[7] + H); \
|
||||
} while (0)
|
||||
|
||||
#else
|
||||
|
||||
#define SHA2_ROUND_BODY(in, r) do { \
|
||||
sph_u32 A, B, C, D, E, F, G, H, T1, T2; \
|
||||
sph_u32 W00, W01, W02, W03, W04, W05, W06, W07; \
|
||||
sph_u32 W08, W09, W10, W11, W12, W13, W14, W15; \
|
||||
int i; \
|
||||
\
|
||||
A = (r)[0]; \
|
||||
B = (r)[1]; \
|
||||
C = (r)[2]; \
|
||||
D = (r)[3]; \
|
||||
E = (r)[4]; \
|
||||
F = (r)[5]; \
|
||||
G = (r)[6]; \
|
||||
H = (r)[7]; \
|
||||
W00 = in(0); \
|
||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||
+ SPH_C32(0x428A2F98) + W00); \
|
||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||
D = SPH_T32(D + T1); \
|
||||
H = SPH_T32(T1 + T2); \
|
||||
W01 = in(1); \
|
||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||
+ SPH_C32(0x71374491) + W01); \
|
||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||
C = SPH_T32(C + T1); \
|
||||
G = SPH_T32(T1 + T2); \
|
||||
W02 = in(2); \
|
||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||
+ SPH_C32(0xB5C0FBCF) + W02); \
|
||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||
B = SPH_T32(B + T1); \
|
||||
F = SPH_T32(T1 + T2); \
|
||||
W03 = in(3); \
|
||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||
+ SPH_C32(0xE9B5DBA5) + W03); \
|
||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||
A = SPH_T32(A + T1); \
|
||||
E = SPH_T32(T1 + T2); \
|
||||
W04 = in(4); \
|
||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||
+ SPH_C32(0x3956C25B) + W04); \
|
||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||
H = SPH_T32(H + T1); \
|
||||
D = SPH_T32(T1 + T2); \
|
||||
W05 = in(5); \
|
||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||
+ SPH_C32(0x59F111F1) + W05); \
|
||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||
G = SPH_T32(G + T1); \
|
||||
C = SPH_T32(T1 + T2); \
|
||||
W06 = in(6); \
|
||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||
+ SPH_C32(0x923F82A4) + W06); \
|
||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||
F = SPH_T32(F + T1); \
|
||||
B = SPH_T32(T1 + T2); \
|
||||
W07 = in(7); \
|
||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||
+ SPH_C32(0xAB1C5ED5) + W07); \
|
||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||
E = SPH_T32(E + T1); \
|
||||
A = SPH_T32(T1 + T2); \
|
||||
W08 = in(8); \
|
||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||
+ SPH_C32(0xD807AA98) + W08); \
|
||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||
D = SPH_T32(D + T1); \
|
||||
H = SPH_T32(T1 + T2); \
|
||||
W09 = in(9); \
|
||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||
+ SPH_C32(0x12835B01) + W09); \
|
||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||
C = SPH_T32(C + T1); \
|
||||
G = SPH_T32(T1 + T2); \
|
||||
W10 = in(10); \
|
||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||
+ SPH_C32(0x243185BE) + W10); \
|
||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||
B = SPH_T32(B + T1); \
|
||||
F = SPH_T32(T1 + T2); \
|
||||
W11 = in(11); \
|
||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||
+ SPH_C32(0x550C7DC3) + W11); \
|
||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||
A = SPH_T32(A + T1); \
|
||||
E = SPH_T32(T1 + T2); \
|
||||
W12 = in(12); \
|
||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||
+ SPH_C32(0x72BE5D74) + W12); \
|
||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||
H = SPH_T32(H + T1); \
|
||||
D = SPH_T32(T1 + T2); \
|
||||
W13 = in(13); \
|
||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||
+ SPH_C32(0x80DEB1FE) + W13); \
|
||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||
G = SPH_T32(G + T1); \
|
||||
C = SPH_T32(T1 + T2); \
|
||||
W14 = in(14); \
|
||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||
+ SPH_C32(0x9BDC06A7) + W14); \
|
||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||
F = SPH_T32(F + T1); \
|
||||
B = SPH_T32(T1 + T2); \
|
||||
W15 = in(15); \
|
||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||
+ SPH_C32(0xC19BF174) + W15); \
|
||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||
E = SPH_T32(E + T1); \
|
||||
A = SPH_T32(T1 + T2); \
|
||||
W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
|
||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||
+ SPH_C32(0xE49B69C1) + W00); \
|
||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||
D = SPH_T32(D + T1); \
|
||||
H = SPH_T32(T1 + T2); \
|
||||
W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
|
||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||
+ SPH_C32(0xEFBE4786) + W01); \
|
||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||
C = SPH_T32(C + T1); \
|
||||
G = SPH_T32(T1 + T2); \
|
||||
W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
|
||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||
+ SPH_C32(0x0FC19DC6) + W02); \
|
||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||
B = SPH_T32(B + T1); \
|
||||
F = SPH_T32(T1 + T2); \
|
||||
W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
|
||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||
+ SPH_C32(0x240CA1CC) + W03); \
|
||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||
A = SPH_T32(A + T1); \
|
||||
E = SPH_T32(T1 + T2); \
|
||||
W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
|
||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||
+ SPH_C32(0x2DE92C6F) + W04); \
|
||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||
H = SPH_T32(H + T1); \
|
||||
D = SPH_T32(T1 + T2); \
|
||||
W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
|
||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||
+ SPH_C32(0x4A7484AA) + W05); \
|
||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||
G = SPH_T32(G + T1); \
|
||||
C = SPH_T32(T1 + T2); \
|
||||
W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
|
||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||
+ SPH_C32(0x5CB0A9DC) + W06); \
|
||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||
F = SPH_T32(F + T1); \
|
||||
B = SPH_T32(T1 + T2); \
|
||||
W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
|
||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||
+ SPH_C32(0x76F988DA) + W07); \
|
||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||
E = SPH_T32(E + T1); \
|
||||
A = SPH_T32(T1 + T2); \
|
||||
W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
|
||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||
+ SPH_C32(0x983E5152) + W08); \
|
||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||
D = SPH_T32(D + T1); \
|
||||
H = SPH_T32(T1 + T2); \
|
||||
W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
|
||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||
+ SPH_C32(0xA831C66D) + W09); \
|
||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||
C = SPH_T32(C + T1); \
|
||||
G = SPH_T32(T1 + T2); \
|
||||
W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
|
||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||
+ SPH_C32(0xB00327C8) + W10); \
|
||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||
B = SPH_T32(B + T1); \
|
||||
F = SPH_T32(T1 + T2); \
|
||||
W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
|
||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||
+ SPH_C32(0xBF597FC7) + W11); \
|
||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||
A = SPH_T32(A + T1); \
|
||||
E = SPH_T32(T1 + T2); \
|
||||
W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
|
||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||
+ SPH_C32(0xC6E00BF3) + W12); \
|
||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||
H = SPH_T32(H + T1); \
|
||||
D = SPH_T32(T1 + T2); \
|
||||
W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
|
||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||
+ SPH_C32(0xD5A79147) + W13); \
|
||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||
G = SPH_T32(G + T1); \
|
||||
C = SPH_T32(T1 + T2); \
|
||||
W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
|
||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||
+ SPH_C32(0x06CA6351) + W14); \
|
||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||
F = SPH_T32(F + T1); \
|
||||
B = SPH_T32(T1 + T2); \
|
||||
W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
|
||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||
+ SPH_C32(0x14292967) + W15); \
|
||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||
E = SPH_T32(E + T1); \
|
||||
A = SPH_T32(T1 + T2); \
|
||||
W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
|
||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||
+ SPH_C32(0x27B70A85) + W00); \
|
||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||
D = SPH_T32(D + T1); \
|
||||
H = SPH_T32(T1 + T2); \
|
||||
W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
|
||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||
+ SPH_C32(0x2E1B2138) + W01); \
|
||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||
C = SPH_T32(C + T1); \
|
||||
G = SPH_T32(T1 + T2); \
|
||||
W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
|
||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||
+ SPH_C32(0x4D2C6DFC) + W02); \
|
||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||
B = SPH_T32(B + T1); \
|
||||
F = SPH_T32(T1 + T2); \
|
||||
W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
|
||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||
+ SPH_C32(0x53380D13) + W03); \
|
||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||
A = SPH_T32(A + T1); \
|
||||
E = SPH_T32(T1 + T2); \
|
||||
W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
|
||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||
+ SPH_C32(0x650A7354) + W04); \
|
||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||
H = SPH_T32(H + T1); \
|
||||
D = SPH_T32(T1 + T2); \
|
||||
W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
|
||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||
+ SPH_C32(0x766A0ABB) + W05); \
|
||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||
G = SPH_T32(G + T1); \
|
||||
C = SPH_T32(T1 + T2); \
|
||||
W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
|
||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||
+ SPH_C32(0x81C2C92E) + W06); \
|
||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||
F = SPH_T32(F + T1); \
|
||||
B = SPH_T32(T1 + T2); \
|
||||
W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
|
||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||
+ SPH_C32(0x92722C85) + W07); \
|
||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||
E = SPH_T32(E + T1); \
|
||||
A = SPH_T32(T1 + T2); \
|
||||
W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
|
||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||
+ SPH_C32(0xA2BFE8A1) + W08); \
|
||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||
D = SPH_T32(D + T1); \
|
||||
H = SPH_T32(T1 + T2); \
|
||||
W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
|
||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||
+ SPH_C32(0xA81A664B) + W09); \
|
||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||
C = SPH_T32(C + T1); \
|
||||
G = SPH_T32(T1 + T2); \
|
||||
W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
|
||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||
+ SPH_C32(0xC24B8B70) + W10); \
|
||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||
B = SPH_T32(B + T1); \
|
||||
F = SPH_T32(T1 + T2); \
|
||||
W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
|
||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||
+ SPH_C32(0xC76C51A3) + W11); \
|
||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||
A = SPH_T32(A + T1); \
|
||||
E = SPH_T32(T1 + T2); \
|
||||
W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
|
||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||
+ SPH_C32(0xD192E819) + W12); \
|
||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||
H = SPH_T32(H + T1); \
|
||||
D = SPH_T32(T1 + T2); \
|
||||
W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
|
||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||
+ SPH_C32(0xD6990624) + W13); \
|
||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||
G = SPH_T32(G + T1); \
|
||||
C = SPH_T32(T1 + T2); \
|
||||
W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
|
||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||
+ SPH_C32(0xF40E3585) + W14); \
|
||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||
F = SPH_T32(F + T1); \
|
||||
B = SPH_T32(T1 + T2); \
|
||||
W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
|
||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||
+ SPH_C32(0x106AA070) + W15); \
|
||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||
E = SPH_T32(E + T1); \
|
||||
A = SPH_T32(T1 + T2); \
|
||||
W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
|
||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||
+ SPH_C32(0x19A4C116) + W00); \
|
||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||
D = SPH_T32(D + T1); \
|
||||
H = SPH_T32(T1 + T2); \
|
||||
W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
|
||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||
+ SPH_C32(0x1E376C08) + W01); \
|
||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||
C = SPH_T32(C + T1); \
|
||||
G = SPH_T32(T1 + T2); \
|
||||
W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
|
||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||
+ SPH_C32(0x2748774C) + W02); \
|
||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||
B = SPH_T32(B + T1); \
|
||||
F = SPH_T32(T1 + T2); \
|
||||
W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
|
||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||
+ SPH_C32(0x34B0BCB5) + W03); \
|
||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||
A = SPH_T32(A + T1); \
|
||||
E = SPH_T32(T1 + T2); \
|
||||
W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
|
||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||
+ SPH_C32(0x391C0CB3) + W04); \
|
||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||
H = SPH_T32(H + T1); \
|
||||
D = SPH_T32(T1 + T2); \
|
||||
W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
|
||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||
+ SPH_C32(0x4ED8AA4A) + W05); \
|
||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||
G = SPH_T32(G + T1); \
|
||||
C = SPH_T32(T1 + T2); \
|
||||
W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
|
||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||
+ SPH_C32(0x5B9CCA4F) + W06); \
|
||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||
F = SPH_T32(F + T1); \
|
||||
B = SPH_T32(T1 + T2); \
|
||||
W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
|
||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||
+ SPH_C32(0x682E6FF3) + W07); \
|
||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||
E = SPH_T32(E + T1); \
|
||||
A = SPH_T32(T1 + T2); \
|
||||
W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
|
||||
T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
|
||||
+ SPH_C32(0x748F82EE) + W08); \
|
||||
T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
|
||||
D = SPH_T32(D + T1); \
|
||||
H = SPH_T32(T1 + T2); \
|
||||
W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
|
||||
T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
|
||||
+ SPH_C32(0x78A5636F) + W09); \
|
||||
T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
|
||||
C = SPH_T32(C + T1); \
|
||||
G = SPH_T32(T1 + T2); \
|
||||
W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
|
||||
T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
|
||||
+ SPH_C32(0x84C87814) + W10); \
|
||||
T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
|
||||
B = SPH_T32(B + T1); \
|
||||
F = SPH_T32(T1 + T2); \
|
||||
W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
|
||||
T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
|
||||
+ SPH_C32(0x8CC70208) + W11); \
|
||||
T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
|
||||
A = SPH_T32(A + T1); \
|
||||
E = SPH_T32(T1 + T2); \
|
||||
W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
|
||||
T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
|
||||
+ SPH_C32(0x90BEFFFA) + W12); \
|
||||
T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
|
||||
H = SPH_T32(H + T1); \
|
||||
D = SPH_T32(T1 + T2); \
|
||||
W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
|
||||
T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
|
||||
+ SPH_C32(0xA4506CEB) + W13); \
|
||||
T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
|
||||
G = SPH_T32(G + T1); \
|
||||
C = SPH_T32(T1 + T2); \
|
||||
W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
|
||||
T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
|
||||
+ SPH_C32(0xBEF9A3F7) + W14); \
|
||||
T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
|
||||
F = SPH_T32(F + T1); \
|
||||
B = SPH_T32(T1 + T2); \
|
||||
W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
|
||||
T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
|
||||
+ SPH_C32(0xC67178F2) + W15); \
|
||||
T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
|
||||
E = SPH_T32(E + T1); \
|
||||
A = SPH_T32(T1 + T2); \
|
||||
(r)[0] = SPH_T32((r)[0] + A); \
|
||||
(r)[1] = SPH_T32((r)[1] + B); \
|
||||
(r)[2] = SPH_T32((r)[2] + C); \
|
||||
(r)[3] = SPH_T32((r)[3] + D); \
|
||||
(r)[4] = SPH_T32((r)[4] + E); \
|
||||
(r)[5] = SPH_T32((r)[5] + F); \
|
||||
(r)[6] = SPH_T32((r)[6] + G); \
|
||||
(r)[7] = SPH_T32((r)[7] + H); \
|
||||
} while (0)
|
||||
|
||||
#endif
|
||||
|
||||
/*
|
||||
* One round of SHA-224 / SHA-256. The data must be aligned for 32-bit access.
|
||||
*/
|
||||
static void
|
||||
sha2_round(const unsigned char *data, sph_u32 r[8])
|
||||
{
|
||||
#define SHA2_IN(x) sph_dec32be_aligned(data + (4 * (x)))
|
||||
SHA2_ROUND_BODY(SHA2_IN, r);
|
||||
#undef SHA2_IN
|
||||
}
|
||||
|
||||
/* see sph_sha2.h */
|
||||
void
|
||||
sph_sha224_init(void *cc)
|
||||
{
|
||||
sph_sha224_context *sc;
|
||||
|
||||
sc = cc;
|
||||
memcpy(sc->val, H224, sizeof H224);
|
||||
#if SPH_64
|
||||
sc->count = 0;
|
||||
#else
|
||||
sc->count_high = sc->count_low = 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
/* see sph_sha2.h */
|
||||
void
|
||||
sph_sha256_init(void *cc)
|
||||
{
|
||||
sph_sha256_context *sc;
|
||||
|
||||
sc = cc;
|
||||
memcpy(sc->val, H256, sizeof H256);
|
||||
#if SPH_64
|
||||
sc->count = 0;
|
||||
#else
|
||||
sc->count_high = sc->count_low = 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
#define RFUN sha2_round
|
||||
#define HASH sha224
|
||||
#define BE32 1
|
||||
#include "md_helper.c"
|
||||
|
||||
/* see sph_sha2.h */
|
||||
void
|
||||
sph_sha224_close(void *cc, void *dst)
|
||||
{
|
||||
sha224_close(cc, dst, 7);
|
||||
sph_sha224_init(cc);
|
||||
}
|
||||
|
||||
/* see sph_sha2.h */
|
||||
void
|
||||
sph_sha224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
{
|
||||
sha224_addbits_and_close(cc, ub, n, dst, 7);
|
||||
sph_sha224_init(cc);
|
||||
}
|
||||
|
||||
/* see sph_sha2.h */
|
||||
void
|
||||
sph_sha256_close(void *cc, void *dst)
|
||||
{
|
||||
sha224_close(cc, dst, 8);
|
||||
sph_sha256_init(cc);
|
||||
}
|
||||
|
||||
/* see sph_sha2.h */
|
||||
void
|
||||
sph_sha256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
|
||||
{
|
||||
sha224_addbits_and_close(cc, ub, n, dst, 8);
|
||||
sph_sha256_init(cc);
|
||||
}
|
||||
|
||||
/* see sph_sha2.h */
|
||||
void
|
||||
sph_sha224_comp(const sph_u32 msg[16], sph_u32 val[8])
|
||||
{
|
||||
#define SHA2_IN(x) msg[x]
|
||||
SHA2_ROUND_BODY(SHA2_IN, val);
|
||||
#undef SHA2_IN
|
||||
}
|
43
util.cpp
43
util.cpp
@ -1421,28 +1421,39 @@ static uint32_t getblocheight(struct stratum_ctx *sctx)
|
||||
|
||||
static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
|
||||
{
|
||||
const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *stime, *nreward;
|
||||
const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *stime;
|
||||
const char *claim = NULL, *nreward = NULL;
|
||||
size_t coinb1_size, coinb2_size;
|
||||
bool clean, ret = false;
|
||||
int merkle_count, i;
|
||||
int merkle_count, i, p=0;
|
||||
json_t *merkle_arr;
|
||||
uchar **merkle = NULL;
|
||||
// uchar(*merkle_tree)[32] = { 0 };
|
||||
int ntime;
|
||||
char algo[64] = { 0 };
|
||||
get_currentalgo(algo, sizeof(algo));
|
||||
bool has_claim = !strcasecmp(algo, "lbry");
|
||||
|
||||
job_id = json_string_value(json_array_get(params, 0));
|
||||
prevhash = json_string_value(json_array_get(params, 1));
|
||||
coinb1 = json_string_value(json_array_get(params, 2));
|
||||
coinb2 = json_string_value(json_array_get(params, 3));
|
||||
merkle_arr = json_array_get(params, 4);
|
||||
job_id = json_string_value(json_array_get(params, p++));
|
||||
prevhash = json_string_value(json_array_get(params, p++));
|
||||
if (has_claim) {
|
||||
claim = json_string_value(json_array_get(params, p++));
|
||||
if (!claim || strlen(claim) != 64) {
|
||||
applog(LOG_ERR, "Stratum notify: invalid claim parameter");
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
coinb1 = json_string_value(json_array_get(params, p++));
|
||||
coinb2 = json_string_value(json_array_get(params, p++));
|
||||
merkle_arr = json_array_get(params, p++);
|
||||
if (!merkle_arr || !json_is_array(merkle_arr))
|
||||
goto out;
|
||||
merkle_count = (int) json_array_size(merkle_arr);
|
||||
version = json_string_value(json_array_get(params, 5));
|
||||
nbits = json_string_value(json_array_get(params, 6));
|
||||
stime = json_string_value(json_array_get(params, 7));
|
||||
clean = json_is_true(json_array_get(params, 8));
|
||||
nreward = json_string_value(json_array_get(params, 9));
|
||||
version = json_string_value(json_array_get(params, p++));
|
||||
nbits = json_string_value(json_array_get(params, p++));
|
||||
stime = json_string_value(json_array_get(params, p++));
|
||||
clean = json_is_true(json_array_get(params, p)); p++;
|
||||
nreward = json_string_value(json_array_get(params, p++));
|
||||
|
||||
if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !stime ||
|
||||
strlen(prevhash) != 64 || strlen(version) != 8 ||
|
||||
@ -1494,6 +1505,7 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
|
||||
free(sctx->job.job_id);
|
||||
sctx->job.job_id = strdup(job_id);
|
||||
hex2bin(sctx->job.prevhash, prevhash, 32);
|
||||
if (has_claim) hex2bin(sctx->job.claim, claim, 32);
|
||||
|
||||
sctx->job.height = getblocheight(sctx);
|
||||
|
||||
@ -2079,7 +2091,8 @@ void do_gpu_tests(void)
|
||||
//scanhash_scrypt_jane(0, &work, NULL, 1, &done, &tv, &tv);
|
||||
|
||||
memset(work.data, 0, sizeof(work.data));
|
||||
scanhash_decred(0, &work, 1, &done);
|
||||
work.data[0] = 0;
|
||||
scanhash_lbry(0, &work, 1, &done);
|
||||
|
||||
free(work_restart);
|
||||
work_restart = NULL;
|
||||
@ -2142,6 +2155,10 @@ void print_hash_tests(void)
|
||||
keccak256_hash(&hash[0], &buf[0]);
|
||||
printpfx("keccak", hash);
|
||||
|
||||
memset(buf, 0, 128);
|
||||
lbry_hash(&hash[0], &buf[0]);
|
||||
printpfx("lbry", hash);
|
||||
|
||||
luffa_hash(&hash[0], &buf[0]);
|
||||
printpfx("luffa", hash);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user