lbry algo (stratum only)

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
9 years ago · 5703a74e22
15 changed files with 3451 additions and 29 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -50,6 +50,8 @@ ccminer_SOURCES	= elist.h miner.h compat.h \
 			  sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
 			  sph/hamsi.c sph/hamsi_helper.c sph/streebog.c \
 			  sph/shabal.c sph/whirlpool.c sph/sha2big.c sph/haval.c \
 			  sph/ripemd.c sph/sph_sha2.c \
 			  lbry/lbry.cu lbry/cuda_ripemd160.cu lbry/cuda_sha256_lbry.cu lbry/cuda_sha512_lbry.cu \
 			  qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \
 			  x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
 			  x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \
--- a/README.txt
+++ b/README.txt
@ -1,5 +1,5 @@
-ccMiner preview 1.8-dev (May 2016) "Pascal and x11evo algo"
+ccMiner 1.8 Preview (July 2016) "CUDA 8, lbry and x11evo algos"
 ---------------------------------------------------------------
 ***************************************************************
@ -28,6 +28,7 @@ Decred (Blake256 14-rounds - 180 bytes)
 HeavyCoin & MjollnirCoin
 FugueCoin
 GroestlCoin & Myriad-Groestl
 Lbry Credits
 JackpotCoin
 QuarkCoin family & AnimeCoin
 TalkCoin
@ -35,13 +36,13 @@ DarkCoin and other X11 coins
 Chaincoin and Flaxscript (C11)
 Saffroncoin blake (256 14-rounds)
 BlakeCoin (256 8-rounds)
 Midnight (BMW 256)
 Qubit (Digibyte, ...)
 Luffa (Joincoin)
 Keccak (Maxcoin)
 Pentablake (Blake 512 x5)
 1Coin Triple S
 Neoscrypt (FeatherCoin)
 Revolver (X11evo)
 Scrypt and Scrypt:N
 Scrypt-Jane (Chacha)
 Sibcoin (sib)
@ -82,6 +83,7 @@ its command line interface and options.
                          heavy       use to mine Heavycoin
                          jackpot     use to mine Jackpotcoin
                          keccak      use to mine Maxcoin
                          lbry        use to mine LBRY Credits
                          luffa       use to mine Joincoin
                          lyra2       use to mine Vertcoin
                          mjollnir    use to mine Mjollnircoin
@ -151,6 +153,7 @@ its command line interface and options.
      --max-diff=N      Only mine if net difficulty is less than specified value
      --pstate=0        will force the Geforce 9xx to run in P0 P-State
      --plimit=150W     set the gpu power limit, allow multiple values for N cards
      --tlimit=85       Set the gpu thermal limit (windows only)
      --keep-clocks     prevent reset clocks and/or power limit on exit
      --show-diff       display submitted block and net difficulty
  -B, --background      run the miner in the background
@ -242,6 +245,7 @@ features.
  July 2016       v1.8.0
                  Pascal support with cuda 8
                  lbry new multi sha / ripemd algo (LBC)
                  x11evo algo (XRE)
                  Lyra2v2, Neoscrypt and Decred improvements
                  Enhance windows NVAPI clock and power limits
--- a/algos.h
+++ b/algos.h
@ -19,6 +19,7 @@ enum sha_algos {
 	ALGO_HEAVY,		/* Heavycoin hash */
 	ALGO_KECCAK,
 	ALGO_JACKPOT,
 	ALGO_LBRY,
 	ALGO_LUFFA,
 	ALGO_LYRA2,
 	ALGO_LYRA2v2,
@ -67,6 +68,7 @@ static const char *algo_names[] = {
 	"heavy",
 	"keccak",
 	"jackpot",
 	"lbry",
 	"luffa",
 	"lyra2",
 	"lyra2v2",
--- a/ccminer.cpp
+++ b/ccminer.cpp
@ -228,6 +228,7 @@ Options:\n\
 			heavy       Heavycoin\n\
 			jackpot     Jackpot\n\
 			keccak      Keccak-256 (Maxcoin)\n\
 			lbry        LBRY Credits (Sha/Ripemd)\n\
 			luffa       Joincoin\n\
 			lyra2       LyraBar\n\
 			lyra2v2     VertCoin\n\
@ -567,6 +568,7 @@ static void calc_network_diff(struct work *work)
 	// sample for diff 43.281 : 1c05ea29
 	// todo: endian reversed on longpoll could be zr5 specific...
 	uint32_t nbits = have_longpoll ? work->data[18] : swab32(work->data[18]);
 	if (opt_algo == ALGO_LBRY) nbits = swab32(work->data[26]);
 	if (opt_algo == ALGO_DECRED) nbits = work->data[29];
 	uint32_t bits = (nbits & 0xffffff);
 	int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
@ -837,6 +839,11 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
 			le32enc(&nonce, work->data[19]);
 			be16enc(&nvote, *((uint16_t*)&work->data[20]));
 			break;
 		case ALGO_LBRY:
 			check_dups = true;
 			le32enc(&ntime, work->data[25]);
 			le32enc(&nonce, work->data[27]);
 			break;
 		case ALGO_ZR5:
 			check_dups = true;
 			be32enc(&ntime, work->data[17]);
@ -1296,6 +1303,8 @@ bool get_work(struct thr_info *thr, struct work *work)
 		memset(work->data + 19, 0x00, 52);
 		if (opt_algo == ALGO_DECRED) {
 			memset(&work->data[35], 0x00, 52);
 		} else if (opt_algo == ALGO_LBRY) {
 			work->data[28] = 0x80000000;
 		} else {
 			work->data[20] = 0x80000000;
 			work->data[31] = 0x00000280;
@ -1441,6 +1450,14 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 		work->data[37] = (rand()*4) << 8; // random work data
 		sctx->job.height = work->data[32];
 		//applog_hex(work->data, 180);
 	} else if (opt_algo == ALGO_LBRY) {
 		for (i = 0; i < 8; i++)
 			work->data[9 + i] = be32dec((uint32_t *)merkle_root + i);
 		for (i = 0; i < 8; i++)
 			work->data[17 + i] = ((uint32_t*)sctx->job.claim)[i];
 		work->data[25] = le32dec(sctx->job.ntime);
 		work->data[26] = le32dec(sctx->job.nbits);
 		work->data[28] = 0x80000000;
 	} else {
 		for (i = 0; i < 8; i++)
 			work->data[9 + i] = be32dec((uint32_t *)merkle_root + i);
@ -1498,6 +1515,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 		case ALGO_FRESH:
 		case ALGO_FUGUE256:
 		case ALGO_GROESTL:
 		case ALGO_LBRY:
 		case ALGO_LYRA2v2:
 			work_set_target(work, sctx->job.diff / (256.0 * opt_difficulty));
 			break;
@ -1658,6 +1676,7 @@ static void *miner_thread(void *userdata)
 		// &work.data[19]
 		int wcmplen = (opt_algo == ALGO_DECRED) ? 140 : 76;
 		if (opt_algo == ALGO_LBRY) wcmplen = 108;
 		int wcmpoft = 0;
 		uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
@ -1910,6 +1929,7 @@ static void *miner_thread(void *userdata)
 				minmax = 0x40000000U;
 				break;
 			case ALGO_KECCAK:
 			case ALGO_LBRY:
 			case ALGO_LUFFA:
 			case ALGO_SKEIN:
 			case ALGO_SKEIN2:
@ -2035,6 +2055,9 @@ static void *miner_thread(void *userdata)
 		case ALGO_JACKPOT:
 			rc = scanhash_jackpot(thr_id, &work, max_nonce, &hashes_done);
 			break;
 		case ALGO_LBRY:
 			rc = scanhash_lbry(thr_id, &work, max_nonce, &hashes_done);
 			break;
 		case ALGO_LUFFA:
 			rc = scanhash_luffa(thr_id, &work, max_nonce, &hashes_done);
 			break;
@ -2130,7 +2153,7 @@ static void *miner_thread(void *userdata)
 		// todo: update all algos to use work->nonces
 		work.nonces[0] = nonceptr[0];
-		if (opt_algo != ALGO_DECRED && opt_algo != ALGO_BLAKE2S) {
+		if (opt_algo != ALGO_DECRED && opt_algo != ALGO_BLAKE2S && opt_algo != ALGO_LBRY) {
 			work.nonces[1] = nonceptr[2];
 		}
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -115,7 +115,7 @@
      <MaxRegCount>80</MaxRegCount>
      <PtxAsOptionV>true</PtxAsOptionV>
      <Keep>true</Keep>
-      <CodeGeneration>compute_50,sm_50</CodeGeneration>
+      <CodeGeneration>compute_61,sm_61;compute_52,sm_52</CodeGeneration>
      <Include>$(NVTOOLSEXT_PATH)\include;..\..\..\Common\C99</Include>
      <TargetMachinePlatform>64</TargetMachinePlatform>
    </CudaCompile>
@ -273,15 +273,11 @@
    <ClCompile Include="sph\fugue.c" />
    <ClCompile Include="sph\groestl.c" />
    <ClCompile Include="sph\haval.c" />
    <ClCompile Include="sph\haval_helper.c">
      <ExcludedFromBuild>true</ExcludedFromBuild>
    </ClCompile>
    <ClCompile Include="sph\jh.c" />
    <ClCompile Include="sph\keccak.c" />
    <ClCompile Include="sph\luffa.c" />
-    <ClCompile Include="sph\md_helper.c">
+    <ClCompile Include="sph\ripemd.c" />
-      <ExcludedFromBuild>true</ExcludedFromBuild>
+    <ClCompile Include="sph\sph_sha2.c" />
    </ClCompile>
    <ClCompile Include="sph\sha2.c" />
    <ClCompile Include="sph\sha2big.c" />
    <ClCompile Include="sph\shabal.c" />
@ -428,6 +424,10 @@
      <MaxRegCount>92</MaxRegCount>
    </CudaCompile>
    <CudaCompile Include="Algo256\cuda_skein256.cu" />
    <CudaCompile Include="lbry\cuda_ripemd160.cu" />
    <CudaCompile Include="lbry\cuda_sha256_lbry.cu" />
    <CudaCompile Include="lbry\cuda_sha512_lbry.cu" />
    <CudaCompile Include="lbry\lbry.cu" />
    <CudaCompile Include="pentablake.cu">
      <MaxRegCount>80</MaxRegCount>
      <AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options="-dlcm=cg" %(AdditionalOptions)</AdditionalOptions>
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@ -82,6 +82,9 @@
    <Filter Include="Source Files\CUDA\lyra2">
      <UniqueIdentifier>{1613763f-895c-4321-b58b-6f5849868956}</UniqueIdentifier>
    </Filter>
    <Filter Include="Source Files\CUDA\lbry">
      <UniqueIdentifier>{3079ea1f-f768-455a-acd6-f517fac535b4}</UniqueIdentifier>
    </Filter>
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="compat\jansson\dump.c">
@ -150,9 +153,6 @@
    <ClCompile Include="sph\luffa.c">
      <Filter>Source Files\sph</Filter>
    </ClCompile>
    <ClCompile Include="sph\md_helper.c">
      <Filter>Source Files\sph</Filter>
    </ClCompile>
    <ClCompile Include="sph\sha2.c">
      <Filter>Source Files\sph</Filter>
    </ClCompile>
@ -180,7 +180,10 @@
    <ClCompile Include="sph\haval.c">
      <Filter>Source Files\sph</Filter>
    </ClCompile>
-    <ClCompile Include="sph\haval_helper.c">
+    <ClCompile Include="sph\ripemd.c">
      <Filter>Source Files\sph</Filter>
    </ClCompile>
    <ClCompile Include="sph\sph_sha2.c">
      <Filter>Source Files\sph</Filter>
    </ClCompile>
    <ClCompile Include="sph\sha2big.c">
@ -721,6 +724,18 @@
    <CudaCompile Include="Algo256\blake2s.cu">
      <Filter>Source Files\CUDA\Algo256</Filter>
    </CudaCompile>
    <CudaCompile Include="lbry\cuda_sha256_lbry.cu">
      <Filter>Source Files\CUDA\lbry</Filter>
    </CudaCompile>
    <CudaCompile Include="lbry\cuda_sha512_lbry.cu">
      <Filter>Source Files\CUDA\lbry</Filter>
    </CudaCompile>
    <CudaCompile Include="lbry\cuda_ripemd160.cu">
      <Filter>Source Files\CUDA\lbry</Filter>
    </CudaCompile>
    <CudaCompile Include="lbry\lbry.cu">
      <Filter>Source Files\CUDA\lbry</Filter>
    </CudaCompile>
  </ItemGroup>
  <ItemGroup>
    <Image Include="res\ccminer.ico">
@ -737,4 +752,4 @@
      <Filter>Ressources</Filter>
    </Text>
  </ItemGroup>
-</Project>
+</Project>
--- a/lbry/cuda_ripemd160.cu
+++ b/lbry/cuda_ripemd160.cu
@ -0,0 +1,441 @@
 /*
 * ripemd-160 kernel implementation.
 *
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2014, 2016  djm34, tpruvot
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * ===========================(LICENSE END)=============================
 *
 */
 #include <stdio.h>
 #include <stdint.h>
 #include <memory.h>
 #include <cuda_helper.h>
 static __constant__ uint32_t c_IV[5] = {
 	0x67452301u, 0xEFCDAB89u, 0x98BADCFEu, 0x10325476u, 0xC3D2E1F0u
 };
 //__host__
 //uint64_t xornot64(uint64_t a, uint64_t b, uint64_t c) {
 //	return c ^ (a | !b);
 //}
 __forceinline__ __device__
 uint64_t xornot64(uint64_t a, uint64_t b, uint64_t c)
 {
 	uint64_t result;
 	asm("{  .reg .u64 m,n; // xornot64\n\t"
 		"not.b64 m,%2; \n\t"
 		"or.b64 n, %1,m;\n\t"
 		"xor.b64 %0, n,%3;\n\t"
 		"}\n\t"
 		: "=l"(result) : "l"(a), "l"(b), "l"(c));
 	return result;
 }
 //__host__
 //uint64_t xornt64(uint64_t a, uint64_t b, uint64_t c) {
 //	return a ^ (b | !c);
 //}
 __device__ __forceinline__
 uint64_t xornt64(uint64_t a, uint64_t b, uint64_t c)
 {
 	uint64_t result;
 	asm("{  .reg .u64 m,n; // xornt64\n\t"
 		"not.b64 m,%3; \n\t"
 		"or.b64 n, %2,m;\n\t"
 		"xor.b64 %0, %1,n;\n\t"
 		"}\n\t"
 		: "=l"(result) : "l"(a), "l"(b), "l"(c));
 	return result;
 }
 /*
 * Round functions for RIPEMD-128 and RIPEMD-160.
 */
 #if 1
 #define F1(x, y, z)   ((x) ^ (y) ^ (z))
 #define F2(x, y, z)   ((((y) ^ (z)) & (x)) ^ (z))
 #define F3(x, y, z)   (((x) | ~(y)) ^ (z))
 #define F4(x, y, z)   ((((x) ^ (y)) & (z)) ^ (y))
 #define F5(x, y, z)   ((x) ^ ((y) | ~(z)))
 #else
 #define F1(x, y, z)   xor3(x,y,z)
 #define F2(x, y, z)   xandx(x,y,z)
 #define F3(x, y, z)   xornot64(x,y,z)
 #define F4(x, y, z)   xandx(z,x,y)
 #define F5(x, y, z)   xornt64(x,y,z)
 #endif
 /*
 * Round constants for RIPEMD-160.
 */
 #define K11 0x00000000u
 #define K12 0x5A827999u
 #define K13 0x6ED9EBA1u
 #define K14 0x8F1BBCDCu
 #define K15 0xA953FD4Eu
 #define K21 0x50A28BE6u
 #define K22 0x5C4DD124u
 #define K23 0x6D703EF3u
 #define K24 0x7A6D76E9u
 #define K25 0x00000000u
 #define RR(a, b, c, d, e, f, s, r, k) { \
 	a = SPH_T32(ROTL32(SPH_T32(a + f(b, c, d) + r + k), s) + e); \
 	c = ROTL32(c, 10); \
 }
 #define ROUND1(a, b, c, d, e, f, s, r, k) \
 	RR(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k)
 #define ROUND2(a, b, c, d, e, f, s, r, k) \
 	RR(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
 #define RIPEMD160_ROUND_BODY(in, h) { \
 	uint32_t A1, B1, C1, D1, E1; \
 	uint32_t A2, B2, C2, D2, E2; \
 	uint32_t tmp; \
 \
 	A1 = A2 = h[0]; \
 	B1 = B2 = h[1]; \
 	C1 = C2 = h[2]; \
 	D1 = D2 = h[3]; \
 	E1 = E2 = h[4]; \
 \
 	ROUND1(A, B, C, D, E, F1, 11, in[ 0],  1); \
 	ROUND1(E, A, B, C, D, F1, 14, in[ 1],  1); \
 	ROUND1(D, E, A, B, C, F1, 15, in[ 2],  1); \
 	ROUND1(C, D, E, A, B, F1, 12, in[ 3],  1); \
 	ROUND1(B, C, D, E, A, F1,  5, in[ 4],  1); \
 	ROUND1(A, B, C, D, E, F1,  8, in[ 5],  1); \
 	ROUND1(E, A, B, C, D, F1,  7, in[ 6],  1); \
 	ROUND1(D, E, A, B, C, F1,  9, in[ 7],  1); \
 	ROUND1(C, D, E, A, B, F1, 11, in[ 8],  1); \
 	ROUND1(B, C, D, E, A, F1, 13, in[ 9],  1); \
 	ROUND1(A, B, C, D, E, F1, 14, in[10],  1); \
 	ROUND1(E, A, B, C, D, F1, 15, in[11],  1); \
 	ROUND1(D, E, A, B, C, F1,  6, in[12],  1); \
 	ROUND1(C, D, E, A, B, F1,  7, in[13],  1); \
 	ROUND1(B, C, D, E, A, F1,  9, in[14],  1); \
 	ROUND1(A, B, C, D, E, F1,  8, in[15],  1); \
 \
 	ROUND1(E, A, B, C, D, F2,  7, in[ 7],  2); \
 	ROUND1(D, E, A, B, C, F2,  6, in[ 4],  2); \
 	ROUND1(C, D, E, A, B, F2,  8, in[13],  2); \
 	ROUND1(B, C, D, E, A, F2, 13, in[ 1],  2); \
 	ROUND1(A, B, C, D, E, F2, 11, in[10],  2); \
 	ROUND1(E, A, B, C, D, F2,  9, in[ 6],  2); \
 	ROUND1(D, E, A, B, C, F2,  7, in[15],  2); \
 	ROUND1(C, D, E, A, B, F2, 15, in[ 3],  2); \
 	ROUND1(B, C, D, E, A, F2,  7, in[12],  2); \
 	ROUND1(A, B, C, D, E, F2, 12, in[ 0],  2); \
 	ROUND1(E, A, B, C, D, F2, 15, in[ 9],  2); \
 	ROUND1(D, E, A, B, C, F2,  9, in[ 5],  2); \
 	ROUND1(C, D, E, A, B, F2, 11, in[ 2],  2); \
 	ROUND1(B, C, D, E, A, F2,  7, in[14],  2); \
 	ROUND1(A, B, C, D, E, F2, 13, in[11],  2); \
 	ROUND1(E, A, B, C, D, F2, 12, in[ 8],  2); \
 \
 	ROUND1(D, E, A, B, C, F3, 11, in[ 3],  3); \
 	ROUND1(C, D, E, A, B, F3, 13, in[10],  3); \
 	ROUND1(B, C, D, E, A, F3,  6, in[14],  3); \
 	ROUND1(A, B, C, D, E, F3,  7, in[ 4],  3); \
 	ROUND1(E, A, B, C, D, F3, 14, in[ 9],  3); \
 	ROUND1(D, E, A, B, C, F3,  9, in[15],  3); \
 	ROUND1(C, D, E, A, B, F3, 13, in[ 8],  3); \
 	ROUND1(B, C, D, E, A, F3, 15, in[ 1],  3); \
 	ROUND1(A, B, C, D, E, F3, 14, in[ 2],  3); \
 	ROUND1(E, A, B, C, D, F3,  8, in[ 7],  3); \
 	ROUND1(D, E, A, B, C, F3, 13, in[ 0],  3); \
 	ROUND1(C, D, E, A, B, F3,  6, in[ 6],  3); \
 	ROUND1(B, C, D, E, A, F3,  5, in[13],  3); \
 	ROUND1(A, B, C, D, E, F3, 12, in[11],  3); \
 	ROUND1(E, A, B, C, D, F3,  7, in[ 5],  3); \
 	ROUND1(D, E, A, B, C, F3,  5, in[12],  3); \
 \
 	ROUND1(C, D, E, A, B, F4, 11, in[ 1],  4); \
 	ROUND1(B, C, D, E, A, F4, 12, in[ 9],  4); \
 	ROUND1(A, B, C, D, E, F4, 14, in[11],  4); \
 	ROUND1(E, A, B, C, D, F4, 15, in[10],  4); \
 	ROUND1(D, E, A, B, C, F4, 14, in[ 0],  4); \
 	ROUND1(C, D, E, A, B, F4, 15, in[ 8],  4); \
 	ROUND1(B, C, D, E, A, F4,  9, in[12],  4); \
 	ROUND1(A, B, C, D, E, F4,  8, in[ 4],  4); \
 	ROUND1(E, A, B, C, D, F4,  9, in[13],  4); \
 	ROUND1(D, E, A, B, C, F4, 14, in[ 3],  4); \
 	ROUND1(C, D, E, A, B, F4,  5, in[ 7],  4); \
 	ROUND1(B, C, D, E, A, F4,  6, in[15],  4); \
 	ROUND1(A, B, C, D, E, F4,  8, in[14],  4); \
 	ROUND1(E, A, B, C, D, F4,  6, in[ 5],  4); \
 	ROUND1(D, E, A, B, C, F4,  5, in[ 6],  4); \
 	ROUND1(C, D, E, A, B, F4, 12, in[ 2],  4); \
 \
 	ROUND1(B, C, D, E, A, F5,  9, in[ 4],  5); \
 	ROUND1(A, B, C, D, E, F5, 15, in[ 0],  5); \
 	ROUND1(E, A, B, C, D, F5,  5, in[ 5],  5); \
 	ROUND1(D, E, A, B, C, F5, 11, in[ 9],  5); \
 	ROUND1(C, D, E, A, B, F5,  6, in[ 7],  5); \
 	ROUND1(B, C, D, E, A, F5,  8, in[12],  5); \
 	ROUND1(A, B, C, D, E, F5, 13, in[ 2],  5); \
 	ROUND1(E, A, B, C, D, F5, 12, in[10],  5); \
 	ROUND1(D, E, A, B, C, F5,  5, in[14],  5); \
 	ROUND1(C, D, E, A, B, F5, 12, in[ 1],  5); \
 	ROUND1(B, C, D, E, A, F5, 13, in[ 3],  5); \
 	ROUND1(A, B, C, D, E, F5, 14, in[ 8],  5); \
 	ROUND1(E, A, B, C, D, F5, 11, in[11],  5); \
 	ROUND1(D, E, A, B, C, F5,  8, in[ 6],  5); \
 	ROUND1(C, D, E, A, B, F5,  5, in[15],  5); \
 	ROUND1(B, C, D, E, A, F5,  6, in[13],  5); \
 \
 	ROUND2(A, B, C, D, E, F5,  8, in[ 5],  1); \
 	ROUND2(E, A, B, C, D, F5,  9, in[14],  1); \
 	ROUND2(D, E, A, B, C, F5,  9, in[ 7],  1); \
 	ROUND2(C, D, E, A, B, F5, 11, in[ 0],  1); \
 	ROUND2(B, C, D, E, A, F5, 13, in[ 9],  1); \
 	ROUND2(A, B, C, D, E, F5, 15, in[ 2],  1); \
 	ROUND2(E, A, B, C, D, F5, 15, in[11],  1); \
 	ROUND2(D, E, A, B, C, F5,  5, in[ 4],  1); \
 	ROUND2(C, D, E, A, B, F5,  7, in[13],  1); \
 	ROUND2(B, C, D, E, A, F5,  7, in[ 6],  1); \
 	ROUND2(A, B, C, D, E, F5,  8, in[15],  1); \
 	ROUND2(E, A, B, C, D, F5, 11, in[ 8],  1); \
 	ROUND2(D, E, A, B, C, F5, 14, in[ 1],  1); \
 	ROUND2(C, D, E, A, B, F5, 14, in[10],  1); \
 	ROUND2(B, C, D, E, A, F5, 12, in[ 3],  1); \
 	ROUND2(A, B, C, D, E, F5,  6, in[12],  1); \
 \
 	ROUND2(E, A, B, C, D, F4,  9, in[ 6],  2); \
 	ROUND2(D, E, A, B, C, F4, 13, in[11],  2); \
 	ROUND2(C, D, E, A, B, F4, 15, in[ 3],  2); \
 	ROUND2(B, C, D, E, A, F4,  7, in[ 7],  2); \
 	ROUND2(A, B, C, D, E, F4, 12, in[ 0],  2); \
 	ROUND2(E, A, B, C, D, F4,  8, in[13],  2); \
 	ROUND2(D, E, A, B, C, F4,  9, in[ 5],  2); \
 	ROUND2(C, D, E, A, B, F4, 11, in[10],  2); \
 	ROUND2(B, C, D, E, A, F4,  7, in[14],  2); \
 	ROUND2(A, B, C, D, E, F4,  7, in[15],  2); \
 	ROUND2(E, A, B, C, D, F4, 12, in[ 8],  2); \
 	ROUND2(D, E, A, B, C, F4,  7, in[12],  2); \
 	ROUND2(C, D, E, A, B, F4,  6, in[ 4],  2); \
 	ROUND2(B, C, D, E, A, F4, 15, in[ 9],  2); \
 	ROUND2(A, B, C, D, E, F4, 13, in[ 1],  2); \
 	ROUND2(E, A, B, C, D, F4, 11, in[ 2],  2); \
 \
 	ROUND2(D, E, A, B, C, F3,  9, in[15],  3); \
 	ROUND2(C, D, E, A, B, F3,  7, in[ 5],  3); \
 	ROUND2(B, C, D, E, A, F3, 15, in[ 1],  3); \
 	ROUND2(A, B, C, D, E, F3, 11, in[ 3],  3); \
 	ROUND2(E, A, B, C, D, F3,  8, in[ 7],  3); \
 	ROUND2(D, E, A, B, C, F3,  6, in[14],  3); \
 	ROUND2(C, D, E, A, B, F3,  6, in[ 6],  3); \
 	ROUND2(B, C, D, E, A, F3, 14, in[ 9],  3); \
 	ROUND2(A, B, C, D, E, F3, 12, in[11],  3); \
 	ROUND2(E, A, B, C, D, F3, 13, in[ 8],  3); \
 	ROUND2(D, E, A, B, C, F3,  5, in[12],  3); \
 	ROUND2(C, D, E, A, B, F3, 14, in[ 2],  3); \
 	ROUND2(B, C, D, E, A, F3, 13, in[10],  3); \
 	ROUND2(A, B, C, D, E, F3, 13, in[ 0],  3); \
 	ROUND2(E, A, B, C, D, F3,  7, in[ 4],  3); \
 	ROUND2(D, E, A, B, C, F3,  5, in[13],  3); \
 \
 	ROUND2(C, D, E, A, B, F2, 15, in[ 8],  4); \
 	ROUND2(B, C, D, E, A, F2,  5, in[ 6],  4); \
 	ROUND2(A, B, C, D, E, F2,  8, in[ 4],  4); \
 	ROUND2(E, A, B, C, D, F2, 11, in[ 1],  4); \
 	ROUND2(D, E, A, B, C, F2, 14, in[ 3],  4); \
 	ROUND2(C, D, E, A, B, F2, 14, in[11],  4); \
 	ROUND2(B, C, D, E, A, F2,  6, in[15],  4); \
 	ROUND2(A, B, C, D, E, F2, 14, in[ 0],  4); \
 	ROUND2(E, A, B, C, D, F2,  6, in[ 5],  4); \
 	ROUND2(D, E, A, B, C, F2,  9, in[12],  4); \
 	ROUND2(C, D, E, A, B, F2, 12, in[ 2],  4); \
 	ROUND2(B, C, D, E, A, F2,  9, in[13],  4); \
 	ROUND2(A, B, C, D, E, F2, 12, in[ 9],  4); \
 	ROUND2(E, A, B, C, D, F2,  5, in[ 7],  4); \
 	ROUND2(D, E, A, B, C, F2, 15, in[10],  4); \
 	ROUND2(C, D, E, A, B, F2,  8, in[14],  4); \
 \
 	ROUND2(B, C, D, E, A, F1,  8, in[12],  5); \
 	ROUND2(A, B, C, D, E, F1,  5, in[15],  5); \
 	ROUND2(E, A, B, C, D, F1, 12, in[10],  5); \
 	ROUND2(D, E, A, B, C, F1,  9, in[ 4],  5); \
 	ROUND2(C, D, E, A, B, F1, 12, in[ 1],  5); \
 	ROUND2(B, C, D, E, A, F1,  5, in[ 5],  5); \
 	ROUND2(A, B, C, D, E, F1, 14, in[ 8],  5); \
 	ROUND2(E, A, B, C, D, F1,  6, in[ 7],  5); \
 	ROUND2(D, E, A, B, C, F1,  8, in[ 6],  5); \
 	ROUND2(C, D, E, A, B, F1, 13, in[ 2],  5); \
 	ROUND2(B, C, D, E, A, F1,  6, in[13],  5); \
 	ROUND2(A, B, C, D, E, F1,  5, in[14],  5); \
 	ROUND2(E, A, B, C, D, F1, 15, in[ 0],  5); \
 	ROUND2(D, E, A, B, C, F1, 13, in[ 3],  5); \
 	ROUND2(C, D, E, A, B, F1, 11, in[ 9],  5); \
 	ROUND2(B, C, D, E, A, F1, 11, in[11],  5); \
 \
 	tmp  = (h[1] + C1 + D2); \
 	h[1] = (h[2] + D1 + E2); \
 	h[2] = (h[3] + E1 + A2); \
 	h[3] = (h[4] + A1 + B2); \
 	h[4] = (h[0] + B1 + C2); \
 	h[0] = tmp; \
 }
 #if 0
 __global__
 void lbry_ripemd160_gpu_hash_32(const uint32_t threads, uint64_t *g_hash, const uint32_t byteOffset)
 {
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint32_t *hash = (uint32_t*) (&g_hash[thread * 8U + byteOffset/8]);
 		uint32_t in[16];
 		for (int i=0; i<8; i++)
 			in[i] = (hash[i]);
 		in[8] = 0x80;
 		#pragma unroll
 		for (int i=9;i<16;i++) in[i] = 0;
 		in[14] = 0x100; // size in bits
 		uint32_t h[5];
 		#pragma unroll
 		for (int i=0; i<5; i++)
 			h[i] = c_IV[i];
 		RIPEMD160_ROUND_BODY(in, h);
 		#pragma unroll
 		for (int i=0; i<5; i++)
 			hash[i] = h[i];
 #ifdef PAD_ZEROS
 		// 20 bytes hash on 32 or 64 bytes output space
 		hash[5] = 0;
 		hash[6] = 0;
 		hash[7] = 0;
 #endif
 	}
 }
 __host__
 void lbry_ripemd160_hash_32(int thr_id, uint32_t threads, uint32_t *g_Hash, uint32_t byteOffset, cudaStream_t stream)
 {
 	const uint32_t threadsperblock = 128;
 	dim3 grid(threads/threadsperblock);
 	dim3 block(threadsperblock);
 	lbry_ripemd160_gpu_hash_32 <<<grid, block, 0, stream>>> (threads, (uint64_t*) g_Hash, byteOffset);
 }
 #endif
 __global__
 //__launch_bounds__(256,6)
 void lbry_ripemd160_gpu_hash_32x2(const uint32_t threads, uint64_t *g_hash)
 {
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint32_t *hash = (uint32_t*) (&g_hash[thread * 8U]);
 		uint32_t in[16];
 		for (int i=0; i<8; i++)
 			in[i] = (hash[i]);
 		in[8] = 0x80;
 		#pragma unroll
 		for (int i=9;i<16;i++) in[i] = 0;
 		in[14] = 0x100; // size in bits
 		uint32_t h[5];
 		#pragma unroll
 		for (int i=0; i<5; i++)
 			h[i] = c_IV[i];
 		RIPEMD160_ROUND_BODY(in, h);
 		#pragma unroll
 		for (int i=0; i<5; i++)
 			hash[i] = h[i];
 #ifdef PAD_ZEROS
 		// 20 bytes hash on 32 output space
 		hash[5] = 0;
 		hash[6] = 0;
 		hash[7] = 0;
 #endif
 		// second 32 bytes block hash
 		hash += 8;
 		#pragma unroll
 		for (int i=0; i<8; i++)
 			in[i] = (hash[i]);
 		in[8] = 0x80;
 		#pragma unroll
 		for (int i=9;i<16;i++) in[i] = 0;
 		in[14] = 0x100; // size in bits
 		#pragma unroll
 		for (int i=0; i<5; i++)
 			h[i] = c_IV[i];
 		RIPEMD160_ROUND_BODY(in, h);
 		#pragma unroll
 		for (int i=0; i<5; i++)
 			hash[i] = h[i];
 #ifdef PAD_ZEROS
 		// 20 bytes hash on 32 output space
 		hash[5] = 0;
 		hash[6] = 0;
 		hash[7] = 0;
 #endif
 	}
 }
 __host__
 void lbry_ripemd160_hash_32x2(int thr_id, uint32_t threads, uint32_t *g_Hash, cudaStream_t stream)
 {
 	const uint32_t threadsperblock = 128;
 	dim3 grid(threads/threadsperblock);
 	dim3 block(threadsperblock);
 	lbry_ripemd160_gpu_hash_32x2 <<<grid, block, 0, stream>>> (threads, (uint64_t*) g_Hash);
 }
 void lbry_ripemd160_init(int thr_id)
 {
 	//cudaMemcpyToSymbol(c_IV, IV, sizeof(IV), 0, cudaMemcpyHostToDevice);
 }
--- a/lbry/cuda_sha256_lbry.cu
+++ b/lbry/cuda_sha256_lbry.cu
@ -0,0 +1,712 @@
 /*
 * sha256 CUDA implementation.
 */
 #include <stdio.h>
 #include <stdint.h>
 #include <memory.h>
 #include <cuda_helper.h>
 #include <miner.h>
 __constant__ static uint32_t __align__(8) c_midstate112[8];
 __constant__ static uint32_t __align__(8) c_dataEnd112[12];
 const __constant__  uint32_t __align__(8) c_H256[8] = {
 	0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU,
 	0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U
 };
 __constant__ static uint32_t __align__(8) c_K[64];
 static __thread uint32_t* d_resNonces;
 __constant__ static uint32_t __align__(8) c_target[2];
 __device__ uint64_t d_target[1];
 // ------------------------------------------------------------------------------------------------
 static const uint32_t cpu_H256[8] = {
 	0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU,
 	0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U
 };
 static const uint32_t cpu_K[64] = {
 	0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
 	0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
 	0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
 	0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
 	0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
 	0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
 	0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
 	0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
 };
 #define ROTR ROTR32
 __host__
 static void sha256_step1_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d,
 	uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
 	uint32_t in, const uint32_t Kshared)
 {
 	uint32_t t1,t2;
 	uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
 	uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
 	uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a);
 	uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c);
 	t1 = h + bsg21 + vxandx + Kshared + in;
 	t2 = bsg20 + andorv;
 	d = d + t1;
 	h = t1 + t2;
 }
 __host__
 static void sha256_step2_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d,
 	uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
 	uint32_t* in, uint32_t pc, const uint32_t Kshared)
 {
 	uint32_t t1,t2;
 	int pcidx1 = (pc-2)  & 0xF;
 	int pcidx2 = (pc-7)  & 0xF;
 	int pcidx3 = (pc-15) & 0xF;
 	uint32_t inx0 = in[pc];
 	uint32_t inx1 = in[pcidx1];
 	uint32_t inx2 = in[pcidx2];
 	uint32_t inx3 = in[pcidx3];
 	uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1);
 	uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3);
 	uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
 	uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
 	uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a);
 	uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c);
 	in[pc] = ssg21 + inx2 + ssg20 + inx0;
 	t1 = h + bsg21 + vxandx + Kshared + in[pc];
 	t2 = bsg20 + andorv;
 	d =  d + t1;
 	h = t1 + t2;
 }
 __host__
 static void sha256_round_body_host(uint32_t* in, uint32_t* state, const uint32_t* Kshared)
 {
 	uint32_t a = state[0];
 	uint32_t b = state[1];
 	uint32_t c = state[2];
 	uint32_t d = state[3];
 	uint32_t e = state[4];
 	uint32_t f = state[5];
 	uint32_t g = state[6];
 	uint32_t h = state[7];
 	sha256_step1_host(a,b,c,d,e,f,g,h,in[0], Kshared[0]);
 	sha256_step1_host(h,a,b,c,d,e,f,g,in[1], Kshared[1]);
 	sha256_step1_host(g,h,a,b,c,d,e,f,in[2], Kshared[2]);
 	sha256_step1_host(f,g,h,a,b,c,d,e,in[3], Kshared[3]);
 	sha256_step1_host(e,f,g,h,a,b,c,d,in[4], Kshared[4]);
 	sha256_step1_host(d,e,f,g,h,a,b,c,in[5], Kshared[5]);
 	sha256_step1_host(c,d,e,f,g,h,a,b,in[6], Kshared[6]);
 	sha256_step1_host(b,c,d,e,f,g,h,a,in[7], Kshared[7]);
 	sha256_step1_host(a,b,c,d,e,f,g,h,in[8], Kshared[8]);
 	sha256_step1_host(h,a,b,c,d,e,f,g,in[9], Kshared[9]);
 	sha256_step1_host(g,h,a,b,c,d,e,f,in[10],Kshared[10]);
 	sha256_step1_host(f,g,h,a,b,c,d,e,in[11],Kshared[11]);
 	sha256_step1_host(e,f,g,h,a,b,c,d,in[12],Kshared[12]);
 	sha256_step1_host(d,e,f,g,h,a,b,c,in[13],Kshared[13]);
 	sha256_step1_host(c,d,e,f,g,h,a,b,in[14],Kshared[14]);
 	sha256_step1_host(b,c,d,e,f,g,h,a,in[15],Kshared[15]);
 	for (int i=0; i<3; i++)
 	{
 		sha256_step2_host(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]);
 		sha256_step2_host(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]);
 		sha256_step2_host(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]);
 		sha256_step2_host(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]);
 		sha256_step2_host(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]);
 		sha256_step2_host(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]);
 		sha256_step2_host(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]);
 		sha256_step2_host(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]);
 		sha256_step2_host(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]);
 		sha256_step2_host(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]);
 		sha256_step2_host(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]);
 		sha256_step2_host(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]);
 		sha256_step2_host(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]);
 		sha256_step2_host(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]);
 		sha256_step2_host(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]);
 		sha256_step2_host(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]);
 	}
 	state[0] += a;
 	state[1] += b;
 	state[2] += c;
 	state[3] += d;
 	state[4] += e;
 	state[5] += f;
 	state[6] += g;
 	state[7] += h;
 }
 __device__ __forceinline__
 uint32_t xor3b(const uint32_t a, const uint32_t b, const uint32_t c) {
 	uint32_t result;
 #if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050
 	asm ("lop3.b32 %0, %1, %2, %3, 0x96; // xor3b"  //0x96 = 0xF0 ^ 0xCC ^ 0xAA
 		: "=r"(result) : "r"(a), "r"(b),"r"(c));
 #else
 	result = a^b^c;
 #endif
 	return result;
 }
 /*
 __device__ __forceinline__
 uint32_t xor3b(const uint32_t a, const uint32_t b, const uint32_t c) {
 	uint32_t result;
 	asm("{  .reg .u32 t1; // xor3b \n\t"
 		"xor.b32 t1, %2, %3;\n\t"
 		"xor.b32 %0, %1, t1;"
 		"}"
 		: "=r"(result) : "r"(a) ,"r"(b),"r"(c));
 	return result;
 }
 #define xor3b(a,b,c) (a ^ b ^ c)
 */
 __device__ __forceinline__ uint32_t bsg2_0(const uint32_t x)
 {
 	uint32_t r1 = ROTR32(x,2);
 	uint32_t r2 = ROTR32(x,13);
 	uint32_t r3 = ROTR32(x,22);
 	return xor3b(r1,r2,r3);
 }
 __device__ __forceinline__ uint32_t bsg2_1(const uint32_t x)
 {
 	uint32_t r1 = ROTR32(x,6);
 	uint32_t r2 = ROTR32(x,11);
 	uint32_t r3 = ROTR32(x,25);
 	return xor3b(r1,r2,r3);
 }
 __device__ __forceinline__ uint32_t ssg2_0(const uint32_t x)
 {
 	uint64_t r1 = ROTR32(x,7);
 	uint64_t r2 = ROTR32(x,18);
 	uint64_t r3 = shr_t32(x,3);
 	return xor3b(r1,r2,r3);
 }
 __device__ __forceinline__ uint32_t ssg2_1(const uint32_t x)
 {
 	uint64_t r1 = ROTR32(x,17);
 	uint64_t r2 = ROTR32(x,19);
 	uint64_t r3 = shr_t32(x,10);
 	return xor3b(r1,r2,r3);
 }
 __device__ __forceinline__ uint32_t andor32(const uint32_t a, const uint32_t b, const uint32_t c)
 {
 	uint32_t result;
 	asm("{\n\t"
 		".reg .u32 m,n,o;\n\t"
 		"and.b32 m,  %1, %2;\n\t"
 		" or.b32 n,  %1, %2;\n\t"
 		"and.b32 o,   n, %3;\n\t"
 		" or.b32 %0,  m, o ;\n\t"
 		"}\n\t" : "=r"(result) : "r"(a), "r"(b), "r"(c)
 	);
 	return result;
 }
 __device__
 static void sha2_step1(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
 	uint32_t in, const uint32_t Kshared)
 {
 	uint32_t t1,t2;
 	uint32_t vxandx = xandx(e, f, g);
 	uint32_t bsg21 = bsg2_1(e);
 	uint32_t bsg20 = bsg2_0(a);
 	uint32_t andorv = andor32(a,b,c);
 	t1 = h + bsg21 + vxandx + Kshared + in;
 	t2 = bsg20 + andorv;
 	d = d + t1;
 	h = t1 + t2;
 }
 __device__
 static void sha2_step2(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
 	uint32_t* in, uint32_t pc, const uint32_t Kshared)
 {
 	uint32_t t1,t2;
 	int pcidx1 = (pc-2) & 0xF;
 	int pcidx2 = (pc-7) & 0xF;
 	int pcidx3 = (pc-15) & 0xF;
 	uint32_t inx0 = in[pc];
 	uint32_t inx1 = in[pcidx1];
 	uint32_t inx2 = in[pcidx2];
 	uint32_t inx3 = in[pcidx3];
 	uint32_t ssg21 = ssg2_1(inx1);
 	uint32_t ssg20 = ssg2_0(inx3);
 	uint32_t vxandx = xandx(e, f, g);
 	uint32_t bsg21 = bsg2_1(e);
 	uint32_t bsg20 = bsg2_0(a);
 	uint32_t andorv = andor32(a,b,c);
 	in[pc] = ssg21 + inx2 + ssg20 + inx0;
 	t1 = h + bsg21 + vxandx + Kshared + in[pc];
 	t2 = bsg20 + andorv;
 	d =  d + t1;
 	h = t1 + t2;
 }
 __device__
 static void sha256_round_body(uint32_t* in, uint32_t* state, uint32_t* const Kshared)
 {
 	uint32_t a = state[0];
 	uint32_t b = state[1];
 	uint32_t c = state[2];
 	uint32_t d = state[3];
 	uint32_t e = state[4];
 	uint32_t f = state[5];
 	uint32_t g = state[6];
 	uint32_t h = state[7];
 	sha2_step1(a,b,c,d,e,f,g,h,in[0], Kshared[0]);
 	sha2_step1(h,a,b,c,d,e,f,g,in[1], Kshared[1]);
 	sha2_step1(g,h,a,b,c,d,e,f,in[2], Kshared[2]);
 	sha2_step1(f,g,h,a,b,c,d,e,in[3], Kshared[3]);
 	sha2_step1(e,f,g,h,a,b,c,d,in[4], Kshared[4]);
 	sha2_step1(d,e,f,g,h,a,b,c,in[5], Kshared[5]);
 	sha2_step1(c,d,e,f,g,h,a,b,in[6], Kshared[6]);
 	sha2_step1(b,c,d,e,f,g,h,a,in[7], Kshared[7]);
 	sha2_step1(a,b,c,d,e,f,g,h,in[8], Kshared[8]);
 	sha2_step1(h,a,b,c,d,e,f,g,in[9], Kshared[9]);
 	sha2_step1(g,h,a,b,c,d,e,f,in[10],Kshared[10]);
 	sha2_step1(f,g,h,a,b,c,d,e,in[11],Kshared[11]);
 	sha2_step1(e,f,g,h,a,b,c,d,in[12],Kshared[12]);
 	sha2_step1(d,e,f,g,h,a,b,c,in[13],Kshared[13]);
 	sha2_step1(c,d,e,f,g,h,a,b,in[14],Kshared[14]);
 	sha2_step1(b,c,d,e,f,g,h,a,in[15],Kshared[15]);
 	#pragma unroll
 	for (int i=0; i<3; i++)
 	{
 		sha2_step2(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]);
 		sha2_step2(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]);
 		sha2_step2(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]);
 		sha2_step2(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]);
 		sha2_step2(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]);
 		sha2_step2(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]);
 		sha2_step2(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]);
 		sha2_step2(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]);
 		sha2_step2(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]);
 		sha2_step2(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]);
 		sha2_step2(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]);
 		sha2_step2(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]);
 		sha2_step2(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]);
 		sha2_step2(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]);
 		sha2_step2(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]);
 		sha2_step2(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]);
 	}
 	state[0] += a;
 	state[1] += b;
 	state[2] += c;
 	state[3] += d;
 	state[4] += e;
 	state[5] += f;
 	state[6] += g;
 	state[7] += h;
 }
 __device__
 uint64_t cuda_swab32ll(uint64_t x) {
 	return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x)));
 }
 __global__
 /*__launch_bounds__(256,3)*/
 void lbry_sha256_gpu_hash_112(const uint32_t threads, const uint32_t startNonce, const bool swabNonce, uint64_t *outputHash)
 {
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		const uint32_t nonce = startNonce + thread;
 		uint32_t dat[16];
 		#pragma unroll
 		for (int i=0;i<11;i++) dat[i] = c_dataEnd112[i]; // pre "swabed"
 		dat[11] = swabNonce ? cuda_swab32(nonce) : nonce;
 		dat[12] = 0x80000000;
 		dat[13] = 0;
 		dat[14] = 0;
 		dat[15] = 0x380;
 		uint32_t __align__(8) buf[8];
 		#pragma unroll
 		for (int i=0;i<8;i++) buf[i] = c_midstate112[i];
 		sha256_round_body(dat, buf, c_K);
 		// output
 		uint2* output = (uint2*) (&outputHash[thread * 8U]);
 		#pragma unroll
 		for (int i=0;i<4;i++) {
 			//output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
 			output[i] = vectorize(((uint64_t*)buf)[i]); // out without swap, new sha256 after
 		}
 	}
 }
 __global__
 /*__launch_bounds__(256,3)*/
 void lbry_sha256_gpu_hash_32(uint32_t threads, uint64_t *Hash512)
 {
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint32_t __align__(8) buf[8]; // align for vectorize
 		#pragma unroll
 		for (int i=0; i<8; i++) buf[i] = c_H256[i];
 		uint32_t* input = (uint32_t*) (&Hash512[thread * 8U]);
 		uint32_t dat[16];
 		#pragma unroll
 		//for (int i=0;i<8;i++) dat[i] = cuda_swab32(input[i]);
 		for (int i=0; i<8; i++) dat[i] = input[i];
 		dat[8] = 0x80000000;
 		#pragma unroll
 		for (int i=9; i<15; i++) dat[i] = 0;
 		dat[15] = 0x100;
 		sha256_round_body(dat, buf, c_K);
 		// output
 		uint2* output = (uint2*) input;
 		#pragma unroll
 		for (int i=0;i<4;i++) {
 			output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
 		}
 #ifdef PAD_ZEROS
 		#pragma unroll
 		for (int i=4; i<8; i++) output[i] = vectorize(0);
 #endif
 	}
 }
 __global__
 /*__launch_bounds__(256,3)*/
 void lbry_sha256d_gpu_hash_112(const uint32_t threads, const uint32_t startNonce, const bool swabNonce, uint64_t *outputHash)
 {
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	extern __shared__ uint32_t s_K[];
 	//s_K[thread & 63] = c_K[thread & 63];
 	if (threadIdx.x < 64U) s_K[threadIdx.x] = c_K[threadIdx.x];
 	if (thread < threads)
 	{
 		const uint32_t nonce = startNonce + thread;
 		uint32_t dat[16];
 		#pragma unroll
 		for (int i=0; i<11; i++) dat[i] = c_dataEnd112[i];
 		dat[11] = swabNonce ? cuda_swab32(nonce) : nonce;
 		dat[12] = 0x80000000;
 		dat[13] = 0;
 		dat[14] = 0;
 		dat[15] = 0x380;
 		uint32_t __align__(8) buf[8];
 		#pragma unroll
 		for (int i=0;i<8;i++) buf[i] = c_midstate112[i];
 		sha256_round_body(dat, buf, s_K);
 		// second sha256
 		#pragma unroll
 		for (int i=0; i<8; i++) dat[i] = buf[i];
 		dat[8] = 0x80000000;
 		#pragma unroll
 		for (int i=9; i<15; i++) dat[i] = 0;
 		dat[15] = 0x100;
 		#pragma unroll
 		for (int i=0; i<8; i++) buf[i] = c_H256[i];
 		sha256_round_body(dat, buf, s_K);
 		// output
 		uint2* output = (uint2*) (&outputHash[thread * 8U]);
 		#pragma unroll
 		for (int i=0;i<4;i++) {
 			output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
 			//output[i] = vectorize(((uint64_t*)buf)[i]);
 		}
 	}
 }
 __global__
 /*__launch_bounds__(256,3)*/
 void lbry_sha256_gpu_hash_20x2(uint32_t threads, uint64_t *Hash512)
 {
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint32_t __align__(8) buf[8]; // align for vectorize
 		#pragma unroll
 		for (int i=0;i<8;i++) buf[i] = c_H256[i];
 		uint32_t* input = (uint32_t*) (&Hash512[thread * 8U]);
 		uint32_t dat[16];
 		#pragma unroll
 		for (int i=0;i<5;i++) dat[i] = cuda_swab32(input[i]);
 		#pragma unroll
 		for (int i=0;i<5;i++) dat[i+5] = cuda_swab32(input[i+8]);
 		dat[10] = 0x80000000;
 		#pragma unroll
 		for (int i=11;i<15;i++) dat[i] = 0;
 		dat[15] = 0x140;
 		sha256_round_body(dat, buf, c_K);
 		// output
 		uint2* output = (uint2*) input;
 		#pragma unroll
 		for (int i=0;i<4;i++) {
 			//output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
 			output[i] = vectorize(((uint64_t*)buf)[i]);
 		}
 #ifdef PAD_ZEROS
 		#pragma unroll
 		for (int i=4; i<8; i++) output[i] = vectorize(0);
 #endif
 	}
 }
 __global__
 /*__launch_bounds__(256,3)*/
 void lbry_sha256d_gpu_hash_20x2(uint32_t threads, uint64_t *Hash512)
 {
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	extern __shared__ uint32_t s_K[];
 	if (threadIdx.x < 64U) s_K[threadIdx.x] = c_K[threadIdx.x];
 	if (thread < threads)
 	{
 		uint32_t __align__(8) buf[8]; // align for vectorize
 		#pragma unroll
 		for (int i=0;i<8;i++) buf[i] = c_H256[i];
 		uint32_t* input = (uint32_t*) (&Hash512[thread * 8U]);
 		uint32_t dat[16];
 		#pragma unroll
 		for (int i=0; i<5; i++) dat[i] = cuda_swab32(input[i]);
 		#pragma unroll
 		for (int i=0; i<5; i++) dat[i+5] = cuda_swab32(input[i+8]);
 		dat[10] = 0x80000000;
 		#pragma unroll
 		for (int i=11;i<15;i++) dat[i] = 0;
 		dat[15] = 0x140;
 		sha256_round_body(dat, buf, s_K);
 		// second sha256
 		#pragma unroll
 		for (int i=0; i<8; i++) dat[i] = buf[i];
 		dat[8] = 0x80000000;
 		#pragma unroll
 		for (int i=9; i<15; i++) dat[i] = 0;
 		dat[15] = 0x100;
 		#pragma unroll
 		for (int i=0;i<8;i++) buf[i] = c_H256[i];
 		sha256_round_body(dat, buf, s_K);
 		// output
 		uint2* output = (uint2*) input;
 #ifdef FULL_HASH
 		#pragma unroll
 		for (int i=0;i<4;i++) {
 			output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
 			//output[i] = vectorize(((uint64_t*)buf)[i]);
 		}
 #	ifdef PAD_ZEROS
 		#pragma unroll
 		for (int i=4; i<8; i++) output[i] = vectorize(0);
 #	endif
 #else
 		//input[6] = cuda_swab32(buf[6]);
 		//input[7] = cuda_swab32(buf[7]);
 		output[3] = vectorize(cuda_swab32ll(((uint64_t*)buf)[3]));
 #endif
 	}
 }
 __host__
 void lbry_sha256_init(int thr_id)
 {
 	//cudaMemcpyToSymbol(c_H256, cpu_H256, sizeof(cpu_H256), 0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(c_K, cpu_K, sizeof(cpu_K), 0, cudaMemcpyHostToDevice);
 	CUDA_SAFE_CALL(cudaMalloc(&d_resNonces, 4*sizeof(uint32_t)));
 }
 __host__
 void lbry_sha256_free(int thr_id)
 {
 	cudaFree(d_resNonces);
 }
 __host__
 void lbry_sha256_setBlock_112(uint32_t *pdata, uint32_t *ptarget)
 {
 	uint32_t in[16], buf[8], end[11];
 	for (int i=0;i<16;i++) in[i] = cuda_swab32(pdata[i]);
 	for (int i=0; i<8;i++) buf[i] = cpu_H256[i];
 	for (int i=0;i<11;i++) end[i] = cuda_swab32(pdata[16+i]);
 	sha256_round_body_host(in, buf, cpu_K);
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_midstate112, buf, 32, 0, cudaMemcpyHostToDevice));
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_dataEnd112,  end, sizeof(end), 0, cudaMemcpyHostToDevice));
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_target, &ptarget[6], sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
 	CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &ptarget[6], sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
 }
 __host__
 void lbry_sha256_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash, bool swabNonce, cudaStream_t stream)
 {
 	const int threadsperblock = 256;
 	dim3 grid(threads/threadsperblock);
 	dim3 block(threadsperblock);
 	lbry_sha256_gpu_hash_112 <<<grid, block, 0, stream>>> (threads, startNonce, swabNonce, (uint64_t*) d_outputHash);
 	cudaGetLastError();
 }
 __host__
 void lbry_sha256_hash_32(int thr_id, uint32_t threads, uint32_t *d_Hash, cudaStream_t stream)
 {
 	const int threadsperblock = 256;
 	dim3 grid(threads/threadsperblock);
 	dim3 block(threadsperblock);
 	lbry_sha256_gpu_hash_32 <<<grid, block, 0, stream>>> (threads, (uint64_t*) d_Hash);
 }
 __host__
 void lbry_sha256d_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash, bool swabNonce, cudaStream_t stream)
 {
 	const int threadsperblock = 256;
 	dim3 grid(threads/threadsperblock);
 	dim3 block(threadsperblock);
 	lbry_sha256d_gpu_hash_112 <<<grid, block, 64*4, stream>>> (threads, startNonce, swabNonce, (uint64_t*) d_outputHash);
 }
 __host__
 void lbry_sha256_hash_20x2(int thr_id, uint32_t threads, uint32_t *d_Hash, cudaStream_t stream)
 {
 	const int threadsperblock = 256;
 	dim3 grid(threads/threadsperblock);
 	dim3 block(threadsperblock);
 	lbry_sha256_gpu_hash_20x2 <<<grid, block, 0, stream>>> (threads, (uint64_t*) d_Hash);
 }
 __host__
 void lbry_sha256d_hash_20x2(int thr_id, uint32_t threads, uint32_t *d_Hash, cudaStream_t stream)
 {
 	const int threadsperblock = 256;
 	dim3 grid(threads/threadsperblock);
 	dim3 block(threadsperblock);
 	lbry_sha256d_gpu_hash_20x2 <<<grid, block, 64*4, stream>>> (threads, (uint64_t*) d_Hash);
 }
 __global__
 __launch_bounds__(256,3)
 void lbry_sha256d_gpu_hash_final(const uint32_t threads, const uint32_t startNonce, uint64_t *Hash512, uint32_t *resNonces)
 {
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint32_t __align__(8) buf[8]; // align for vectorize
 		#pragma unroll
 		for (int i=0;i<8;i++) buf[i] = c_H256[i];
 		uint32_t* input = (uint32_t*) (&Hash512[thread * 8U]);
 		uint32_t __align__(8) dat[16];
 		#pragma unroll
 		for (int i=0;i<5;i++) dat[i] = cuda_swab32(input[i]);
 		#pragma unroll
 		for (int i=0;i<5;i++) dat[i+5] = cuda_swab32(input[i+8]);
 		dat[10] = 0x80000000;
 		#pragma unroll
 		for (int i=11;i<15;i++) dat[i] = 0;
 		dat[15] = 0x140;
 		sha256_round_body(dat, buf, c_K);
 		// second sha256
 		#pragma unroll
 		for (int i=0;i<8;i++) dat[i] = buf[i];
 		dat[8] = 0x80000000;
 		#pragma unroll
 		for (int i=9;i<15;i++) dat[i] = 0;
 		dat[15] = 0x100;
 		#pragma unroll
 		for (int i=0;i<8;i++) buf[i] = c_H256[i];
 		sha256_round_body(dat, buf, c_K);
 		// valid nonces
 		uint64_t high = cuda_swab32ll(((uint64_t*)buf)[3]);
 		if (high <= d_target[0]) {
 			// printf("%08x %08x - %016llx %016llx - %08x %08x\n", buf[7], buf[6], high, d_target[0], c_target[1], c_target[0]);
 			uint32_t nonce = startNonce + thread;
 			resNonces[1] = atomicExch(resNonces, nonce);
 			d_target[0] = high;
 		}
 	}
 }
 __host__
 void lbry_sha256d_hash_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_inputHash, uint32_t *resNonces, cudaStream_t stream)
 {
 	const int threadsperblock = 256;
 	dim3 grid(threads/threadsperblock);
 	dim3 block(threadsperblock);
 	CUDA_SAFE_CALL(cudaMemset(d_resNonces, 0xFF, 2 * sizeof(uint32_t)));
 	cudaThreadSynchronize();
 	lbry_sha256d_gpu_hash_final <<<grid, block, 0, stream>>> (threads, startNonce, (uint64_t*) d_inputHash, d_resNonces);
 	cudaThreadSynchronize();
 	CUDA_SAFE_CALL(cudaMemcpy(resNonces, d_resNonces, 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost));
 	if (resNonces[0] == resNonces[1]) {
 		resNonces[1] = UINT32_MAX;
 	}
 }
--- a/lbry/cuda_sha512_lbry.cu
+++ b/lbry/cuda_sha512_lbry.cu
@ -0,0 +1,181 @@
 /**
 * sha-512 CUDA implementation.
 */
 #include <stdio.h>
 #include <stdint.h>
 #include <memory.h>
 #include <cuda_helper.h>
 static __constant__ uint64_t K_512[80];
 static const uint64_t K512[80] = {
 	0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
 	0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
 	0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
 	0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694,
 	0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
 	0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
 	0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
 	0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70,
 	0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
 	0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
 	0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30,
 	0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8,
 	0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
 	0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
 	0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
 	0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
 	0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
 	0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
 	0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
 	0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
 };
 //#undef xor3
 //#define xor3(a,b,c) (a^b^c)
 //#undef 
 static __device__ __forceinline__
 uint64_t bsg5_0(const uint64_t x)
 {
 	uint64_t r1 = ROTR64(x,28);
 	uint64_t r2 = ROTR64(x,34);
 	uint64_t r3 = ROTR64(x,39);
 	return xor3(r1,r2,r3);
 }
 static __device__ __forceinline__
 uint64_t bsg5_1(const uint64_t x)
 {
 	uint64_t r1 = ROTR64(x,14);
 	uint64_t r2 = ROTR64(x,18);
 	uint64_t r3 = ROTR64(x,41);
 	return xor3(r1,r2,r3);
 }
 static __device__ __forceinline__
 uint64_t ssg5_0(const uint64_t x)
 {
 	uint64_t r1 = ROTR64(x,1);
 	uint64_t r2 = ROTR64(x,8);
 	uint64_t r3 = shr_t64(x,7);
 	return xor3(r1,r2,r3);
 }
 static __device__ __forceinline__
 uint64_t ssg5_1(const uint64_t x)
 {
 	uint64_t r1 = ROTR64(x,19);
 	uint64_t r2 = ROTR64(x,61);
 	uint64_t r3 = shr_t64(x,6);
 	return xor3(r1,r2,r3);
 }
 static __device__ __forceinline__
 uint64_t xandx64(const uint64_t a, const uint64_t b, const uint64_t c)
 {
 	uint64_t result;
 	asm("{  .reg .u64 m,n; // xandx64\n\t"
 	    "xor.b64 m, %2,%3;\n\t"
 	    "and.b64 n, m,%1;\n\t"
 	    "xor.b64 %0, n,%3;\n\t"
 	    "}" : "=l"(result) : "l"(a), "l"(b), "l"(c));
 	return result;
 }
 static __device__ __forceinline__
 void sha512_step2(uint64_t* r, uint64_t* W, uint64_t* K, const int ord, int i)
 {
 	int u = 8-ord;
 	uint64_t a = r[(0+u) & 7];
 	uint64_t b = r[(1+u) & 7];
 	uint64_t c = r[(2+u) & 7];
 	uint64_t d = r[(3+u) & 7];
 	uint64_t e = r[(4+u) & 7];
 	uint64_t f = r[(5+u) & 7];
 	uint64_t g = r[(6+u) & 7];
 	uint64_t h = r[(7+u) & 7];
 	uint64_t T1 = h + bsg5_1(e) + xandx64(e,f,g) + W[i] + K[i];
 	uint64_t T2 = bsg5_0(a) + andor(a,b,c);
 	r[(3+u)& 7] = d + T1;
 	r[(7+u)& 7] = T1 + T2;
 }
 /**************************************************************************************************/
 __global__
 void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash)
 {
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint64_t *pHash = &g_hash[thread * 8U];
 		uint64_t W[80];
 		uint64_t r[8];
 		uint64_t IV512[8] = {
 		        0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
 		        0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
 		};
 		#pragma unroll
 		for (int i = 0; i < 8; i++)
 			r[i] = IV512[i];
 		#pragma unroll
 		for (int i = 0; i < 4; i++) {
 			// 32 bytes input
 			W[i] = cuda_swab64(pHash[i]);
 		}
 		W[4] = 0x8000000000000000; // end tag
 		#pragma unroll
 		for (int i = 5; i < 15; i++) W[i] = 0;
 		W[15] = 0x100; // 256 bits
 		#pragma unroll
 		for (int i = 16; i < 80; i++) W[i] = 0;
 		#pragma unroll 64
 		for (int i = 16; i < 80; i++)
 			W[i] = ssg5_1(W[i - 2]) + W[i - 7] + ssg5_0(W[i - 15]) + W[i - 16];
 		#pragma unroll 10
 		for (int i = 0; i < 10; i++) {
 			#pragma unroll 8
 			for (int ord=0; ord<8; ord++)
 				sha512_step2(r, W, K_512, ord, 8*i + ord);
 		}
 		#pragma unroll 8
 		for (int i = 0; i < 8; i++)
 			pHash[i] = cuda_swab64(r[i] + IV512[i]);
 	}
 }
 __host__
 void lbry_sha512_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash, cudaStream_t stream)
 {
 	const int threadsperblock = 256;
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 	size_t shared_size = 80*8;
 	lbry_sha512_gpu_hash_32 <<<grid, block, shared_size, stream>>> (threads, (uint64_t*)d_hash);
 }
 /**************************************************************************************************/
 __host__
 void lbry_sha512_init(int thr_id)
 {
 	cudaMemcpyToSymbol(K_512, K512, 80*sizeof(uint64_t), 0, cudaMemcpyHostToDevice);
 }
--- a/lbry/lbry.cu
+++ b/lbry/lbry.cu
@ -0,0 +1,225 @@
 /**
 * Lbry CUDA Implementation
 *
 * by tpruvot@github - July 2016
 *
 */
 #include <string.h>
 #include <stdint.h>
 extern "C" {
 #include <sph/sph_sha2.h>
 #include <sph/sph_ripemd.h>
 }
 #include <cuda_helper.h>
 #include <miner.h>
 #define A 64
 #define debug_cpu 0
 extern "C" void lbry_hash(void* output, const void* input)
 {
 	uint32_t _ALIGN(A) hashA[16];
 	uint32_t _ALIGN(A) hashB[8];
 	uint32_t _ALIGN(A) hashC[8];
 	sph_sha256_context ctx_sha256;
 	sph_sha512_context ctx_sha512;
 	sph_ripemd160_context ctx_ripemd;
 	sph_sha256_init(&ctx_sha256);
 	sph_sha256(&ctx_sha256, input, 112);
 	sph_sha256_close(&ctx_sha256, hashA);
 	sph_sha256(&ctx_sha256, hashA, 32);
 	sph_sha256_close(&ctx_sha256, hashA);
 	sph_sha512_init(&ctx_sha512);
 	sph_sha512(&ctx_sha512, hashA, 32);
 	sph_sha512_close(&ctx_sha512, hashA);
 	sph_ripemd160_init(&ctx_ripemd);
 	sph_ripemd160(&ctx_ripemd, hashA, 32);  // sha512 low
 	sph_ripemd160_close(&ctx_ripemd, hashB);
 	if (debug_cpu) applog_hex(hashB, 20);
 	sph_ripemd160(&ctx_ripemd, &hashA[8], 32); // sha512 high
 	sph_ripemd160_close(&ctx_ripemd, hashC);
 	if (debug_cpu) applog_hex(hashC, 20);
 	sph_sha256(&ctx_sha256, hashB, 20);
 	sph_sha256(&ctx_sha256, hashC, 20);
 	sph_sha256_close(&ctx_sha256, hashA);
 	if (debug_cpu) applog_hex(hashA,32);
 	sph_sha256(&ctx_sha256, hashA, 32);
 	sph_sha256_close(&ctx_sha256, hashA);
 	memcpy(output, hashA, 32);
 }
 /* ############################################################################################################################### */
 extern void lbry_ripemd160_init(int thr_id);
 extern void lbry_sha256_init(int thr_id);
 extern void lbry_sha256_free(int thr_id);
 extern void lbry_sha256_setBlock_112(uint32_t *pdata, uint32_t *ptarget);
 extern void lbry_sha256_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash, bool swabNonce, cudaStream_t stream);
 extern void lbry_sha256d_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash, bool swabNonce, cudaStream_t stream);
 extern void lbry_sha256_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash, cudaStream_t stream);
 extern void lbry_sha512_init(int thr_id);
 extern void lbry_sha512_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash, cudaStream_t stream);
 extern void lbry_ripemd160_hash_32x2(int thr_id, uint32_t threads, uint32_t *g_Hash, cudaStream_t stream);
 extern void lbry_sha256_hash_20x2(int thr_id, uint32_t threads, uint32_t *g_Hash, cudaStream_t stream);
 extern void lbry_sha256d_hash_20x2(int thr_id, uint32_t threads, uint32_t *g_Hash, cudaStream_t stream);
 extern void lbry_sha256d_hash_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_inputHash, uint32_t *resNonces, cudaStream_t stream);
 static __inline uint32_t swab32_if(uint32_t val, bool iftrue) {
 	return iftrue ? swab32(val) : val;
 }
 static bool init[MAX_GPUS] = { 0 };
 static uint32_t *d_hash[MAX_GPUS];
 // nonce position is different
 #define LBC_NONCE_OFT32 27
 extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done)
 {
 	uint32_t _ALIGN(A) vhash[8];
 	uint32_t _ALIGN(A) endiandata[28];
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[LBC_NONCE_OFT32];
 	const int swap = 0; // to toggle nonce endian
 	const int dev_id = device_map[thr_id];
 	int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 22 : 20;
 	if (device_sm[dev_id] >= 600) intensity = 23;
 	if (device_sm[dev_id] < 350) intensity = 18;
 	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
 	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
 	if (opt_benchmark) {
 		ptarget[7] = 0xff;
 	}
 	if (!init[thr_id]){
 		cudaSetDevice(dev_id);
 		if (opt_cudaschedule == -1 && gpu_threads == 1) {
 			cudaDeviceReset();
 			// reduce cpu usage (linux)
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			CUDA_LOG_ERROR();
 		}
 		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));
 		lbry_sha256_init(thr_id);
 		lbry_sha512_init(thr_id);
 		lbry_ripemd160_init(thr_id);
 		cuda_check_cpu_init(thr_id, throughput);
 		CUDA_LOG_ERROR();
 		init[thr_id] = true;
 	}
 	for (int i=0; i < LBC_NONCE_OFT32; i++) {
 		be32enc(&endiandata[i], pdata[i]);
 	}
 	lbry_sha256_setBlock_112(endiandata, ptarget);
 	cuda_check_cpu_setTarget(ptarget);
 	do {
 		// Hash with CUDA
 		#if 0
 		lbry_sha256_hash_112(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id], swap, 0);
 		lbry_sha256_hash_32(thr_id, throughput, d_hash[thr_id], 0);
 		#else
 		lbry_sha256d_hash_112(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id], swap, 0);
 		#endif
 		CUDA_LOG_ERROR();
 		lbry_sha512_hash_32(thr_id, throughput, d_hash[thr_id], 0);
 		lbry_ripemd160_hash_32x2(thr_id, throughput, d_hash[thr_id], 0);
 		#if 0
 		lbry_sha256d_hash_20x2(thr_id, throughput, d_hash[thr_id], 0);
 		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id]);
 		#else
 		uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
 		lbry_sha256d_hash_final(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id], resNonces, 0);
 		uint32_t foundNonce = resNonces[0];
 		#endif
 		*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce + throughput;
 		if (foundNonce != UINT32_MAX)
 		{
 			endiandata[LBC_NONCE_OFT32] = swab32_if(foundNonce, !swap);
 			lbry_hash(vhash, endiandata);
 			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
 				int res = 1;
 				uint32_t secNonce =  resNonces[1];
 				//uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id], 1);
 				work->nonces[0] = swab32_if(foundNonce, swap);
 				work_set_target_ratio(work, vhash);
 				if (secNonce != UINT32_MAX) {
 				//if (secNonce) {
 					if (opt_debug)
 						gpulog(LOG_BLUE, thr_id, "found second nonce %08x", swab32(secNonce));
 					endiandata[LBC_NONCE_OFT32] = swab32_if(secNonce, !swap);
 					lbry_hash(vhash, endiandata);
 					if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio) {
 						work_set_target_ratio(work, vhash);
 						xchg(work->nonces[0], work->nonces[1]);
 					}
 					work->nonces[1] = swab32_if(secNonce, swap);
 					res++;
 				}
 				pdata[LBC_NONCE_OFT32] = work->nonces[0];
 				return res;
 			} else {
 				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU %08x > %08x!", foundNonce, vhash[7], ptarget[7]);
 			}
 		}
 		if ((uint64_t) throughput + pdata[LBC_NONCE_OFT32] >= max_nonce) {
 			pdata[LBC_NONCE_OFT32] = max_nonce;
 			break;
 		}
 		pdata[LBC_NONCE_OFT32] += throughput;
 	} while (!work_restart[thr_id].restart);
 	//*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce;
 	return 0;
 }
 // cleanup
 void free_lbry(int thr_id)
 {
 	if (!init[thr_id])
 		return;
 	cudaThreadSynchronize();
 	cudaFree(d_hash[thr_id]);
 	lbry_sha256_free(thr_id);
 	cuda_check_cpu_free(thr_id);
 	init[thr_id] = false;
 	cudaDeviceSynchronize();
 }
--- a/miner.h
+++ b/miner.h
@ -275,6 +275,7 @@ extern int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce,
 extern int scanhash_groestlcoin(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_heavy(int thr_id,struct work *work, uint32_t max_nonce, unsigned long *hashes_done, uint32_t maxvote, int blocklen);
 extern int scanhash_jackpot(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_lyra2v2(int thr_id,struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@ -584,6 +585,7 @@ struct stratum_job {
 	unsigned char version[4];
 	unsigned char nbits[4];
 	unsigned char ntime[4];
 	unsigned char claim[32]; // lbry
 	bool clean;
 	unsigned char nreward[2];
 	uint32_t height;
@ -797,6 +799,7 @@ void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
 void keccak256_hash(void *state, const void *input);
 unsigned int jackpothash(void *state, const void *input);
 void groestlhash(void *state, const void *input);
 void lbry_hash(void *output, const void *input);
 void lyra2re_hash(void *state, const void *input);
 void lyra2v2_hash(void *state, const void *input);
 void myriadhash(void *state, const void *input);
--- a/sph/ripemd.c
+++ b/sph/ripemd.c
@ -0,0 +1,833 @@
 /* $Id: ripemd.c 216 2010-06-08 09:46:57Z tp $ */
 /*
 * RIPEMD-160 implementation.
 *
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2007-2010  Projet RNRT SAPHIR
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * ===========================(LICENSE END)=============================
 *
 * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
 */
 #include <stddef.h>
 #include <string.h>
 #include "sph_ripemd.h"
 /*
 * Round functions for RIPEMD (original).
 */
 #define F(x, y, z)    ((((y) ^ (z)) & (x)) ^ (z))
 #define G(x, y, z)    (((x) & (y)) | (((x) | (y)) & (z)))
 #define H(x, y, z)    ((x) ^ (y) ^ (z))
 static const sph_u32 oIV[5] = {
 	SPH_C32(0x67452301), SPH_C32(0xEFCDAB89),
 	SPH_C32(0x98BADCFE), SPH_C32(0x10325476)
 };
 /*
 * Round functions for RIPEMD-128 and RIPEMD-160.
 */
 #define F1(x, y, z)   ((x) ^ (y) ^ (z))
 #define F2(x, y, z)   ((((y) ^ (z)) & (x)) ^ (z))
 #define F3(x, y, z)   (((x) | ~(y)) ^ (z))
 #define F4(x, y, z)   ((((x) ^ (y)) & (z)) ^ (y))
 #define F5(x, y, z)   ((x) ^ ((y) | ~(z)))
 static const sph_u32 IV[5] = {
 	SPH_C32(0x67452301), SPH_C32(0xEFCDAB89), SPH_C32(0x98BADCFE),
 	SPH_C32(0x10325476), SPH_C32(0xC3D2E1F0)
 };
 #define ROTL    SPH_ROTL32
 /* ===================================================================== */
 /*
 * RIPEMD (original hash, deprecated).
 */
 #define FF1(A, B, C, D, X, s)   do { \
 		sph_u32 tmp = SPH_T32((A) + F(B, C, D) + (X)); \
 		(A) = ROTL(tmp, (s)); \
 	} while (0)
 #define GG1(A, B, C, D, X, s)   do { \
 		sph_u32 tmp = SPH_T32((A) + G(B, C, D) \
 			+ (X) + SPH_C32(0x5A827999)); \
 		(A) = ROTL(tmp, (s)); \
 	} while (0)
 #define HH1(A, B, C, D, X, s)   do { \
 		sph_u32 tmp = SPH_T32((A) + H(B, C, D) \
 			+ (X) + SPH_C32(0x6ED9EBA1)); \
 		(A) = ROTL(tmp, (s)); \
 	} while (0)
 #define FF2(A, B, C, D, X, s)   do { \
 		sph_u32 tmp = SPH_T32((A) + F(B, C, D) \
 			+ (X) + SPH_C32(0x50A28BE6)); \
 		(A) = ROTL(tmp, (s)); \
 	} while (0)
 #define GG2(A, B, C, D, X, s)   do { \
 		sph_u32 tmp = SPH_T32((A) + G(B, C, D) + (X)); \
 		(A) = ROTL(tmp, (s)); \
 	} while (0)
 #define HH2(A, B, C, D, X, s)   do { \
 		sph_u32 tmp = SPH_T32((A) + H(B, C, D) \
 			+ (X) + SPH_C32(0x5C4DD124)); \
 		(A) = ROTL(tmp, (s)); \
 	} while (0)
 #define RIPEMD_ROUND_BODY(in, h)   do { \
 		sph_u32 A1, B1, C1, D1; \
 		sph_u32 A2, B2, C2, D2; \
 		sph_u32 tmp; \
 \
 		A1 = A2 = (h)[0]; \
 		B1 = B2 = (h)[1]; \
 		C1 = C2 = (h)[2]; \
 		D1 = D2 = (h)[3]; \
 \
 		FF1(A1, B1, C1, D1, in( 0), 11); \
 		FF1(D1, A1, B1, C1, in( 1), 14); \
 		FF1(C1, D1, A1, B1, in( 2), 15); \
 		FF1(B1, C1, D1, A1, in( 3), 12); \
 		FF1(A1, B1, C1, D1, in( 4),  5); \
 		FF1(D1, A1, B1, C1, in( 5),  8); \
 		FF1(C1, D1, A1, B1, in( 6),  7); \
 		FF1(B1, C1, D1, A1, in( 7),  9); \
 		FF1(A1, B1, C1, D1, in( 8), 11); \
 		FF1(D1, A1, B1, C1, in( 9), 13); \
 		FF1(C1, D1, A1, B1, in(10), 14); \
 		FF1(B1, C1, D1, A1, in(11), 15); \
 		FF1(A1, B1, C1, D1, in(12),  6); \
 		FF1(D1, A1, B1, C1, in(13),  7); \
 		FF1(C1, D1, A1, B1, in(14),  9); \
 		FF1(B1, C1, D1, A1, in(15),  8); \
 \
 		GG1(A1, B1, C1, D1, in( 7),  7); \
 		GG1(D1, A1, B1, C1, in( 4),  6); \
 		GG1(C1, D1, A1, B1, in(13),  8); \
 		GG1(B1, C1, D1, A1, in( 1), 13); \
 		GG1(A1, B1, C1, D1, in(10), 11); \
 		GG1(D1, A1, B1, C1, in( 6),  9); \
 		GG1(C1, D1, A1, B1, in(15),  7); \
 		GG1(B1, C1, D1, A1, in( 3), 15); \
 		GG1(A1, B1, C1, D1, in(12),  7); \
 		GG1(D1, A1, B1, C1, in( 0), 12); \
 		GG1(C1, D1, A1, B1, in( 9), 15); \
 		GG1(B1, C1, D1, A1, in( 5),  9); \
 		GG1(A1, B1, C1, D1, in(14),  7); \
 		GG1(D1, A1, B1, C1, in( 2), 11); \
 		GG1(C1, D1, A1, B1, in(11), 13); \
 		GG1(B1, C1, D1, A1, in( 8), 12); \
 \
 		HH1(A1, B1, C1, D1, in( 3), 11); \
 		HH1(D1, A1, B1, C1, in(10), 13); \
 		HH1(C1, D1, A1, B1, in( 2), 14); \
 		HH1(B1, C1, D1, A1, in( 4),  7); \
 		HH1(A1, B1, C1, D1, in( 9), 14); \
 		HH1(D1, A1, B1, C1, in(15),  9); \
 		HH1(C1, D1, A1, B1, in( 8), 13); \
 		HH1(B1, C1, D1, A1, in( 1), 15); \
 		HH1(A1, B1, C1, D1, in(14),  6); \
 		HH1(D1, A1, B1, C1, in( 7),  8); \
 		HH1(C1, D1, A1, B1, in( 0), 13); \
 		HH1(B1, C1, D1, A1, in( 6),  6); \
 		HH1(A1, B1, C1, D1, in(11), 12); \
 		HH1(D1, A1, B1, C1, in(13),  5); \
 		HH1(C1, D1, A1, B1, in( 5),  7); \
 		HH1(B1, C1, D1, A1, in(12),  5); \
 \
 		FF2(A2, B2, C2, D2, in( 0), 11); \
 		FF2(D2, A2, B2, C2, in( 1), 14); \
 		FF2(C2, D2, A2, B2, in( 2), 15); \
 		FF2(B2, C2, D2, A2, in( 3), 12); \
 		FF2(A2, B2, C2, D2, in( 4),  5); \
 		FF2(D2, A2, B2, C2, in( 5),  8); \
 		FF2(C2, D2, A2, B2, in( 6),  7); \
 		FF2(B2, C2, D2, A2, in( 7),  9); \
 		FF2(A2, B2, C2, D2, in( 8), 11); \
 		FF2(D2, A2, B2, C2, in( 9), 13); \
 		FF2(C2, D2, A2, B2, in(10), 14); \
 		FF2(B2, C2, D2, A2, in(11), 15); \
 		FF2(A2, B2, C2, D2, in(12),  6); \
 		FF2(D2, A2, B2, C2, in(13),  7); \
 		FF2(C2, D2, A2, B2, in(14),  9); \
 		FF2(B2, C2, D2, A2, in(15),  8); \
 \
 		GG2(A2, B2, C2, D2, in( 7),  7); \
 		GG2(D2, A2, B2, C2, in( 4),  6); \
 		GG2(C2, D2, A2, B2, in(13),  8); \
 		GG2(B2, C2, D2, A2, in( 1), 13); \
 		GG2(A2, B2, C2, D2, in(10), 11); \
 		GG2(D2, A2, B2, C2, in( 6),  9); \
 		GG2(C2, D2, A2, B2, in(15),  7); \
 		GG2(B2, C2, D2, A2, in( 3), 15); \
 		GG2(A2, B2, C2, D2, in(12),  7); \
 		GG2(D2, A2, B2, C2, in( 0), 12); \
 		GG2(C2, D2, A2, B2, in( 9), 15); \
 		GG2(B2, C2, D2, A2, in( 5),  9); \
 		GG2(A2, B2, C2, D2, in(14),  7); \
 		GG2(D2, A2, B2, C2, in( 2), 11); \
 		GG2(C2, D2, A2, B2, in(11), 13); \
 		GG2(B2, C2, D2, A2, in( 8), 12); \
 \
 		HH2(A2, B2, C2, D2, in( 3), 11); \
 		HH2(D2, A2, B2, C2, in(10), 13); \
 		HH2(C2, D2, A2, B2, in( 2), 14); \
 		HH2(B2, C2, D2, A2, in( 4),  7); \
 		HH2(A2, B2, C2, D2, in( 9), 14); \
 		HH2(D2, A2, B2, C2, in(15),  9); \
 		HH2(C2, D2, A2, B2, in( 8), 13); \
 		HH2(B2, C2, D2, A2, in( 1), 15); \
 		HH2(A2, B2, C2, D2, in(14),  6); \
 		HH2(D2, A2, B2, C2, in( 7),  8); \
 		HH2(C2, D2, A2, B2, in( 0), 13); \
 		HH2(B2, C2, D2, A2, in( 6),  6); \
 		HH2(A2, B2, C2, D2, in(11), 12); \
 		HH2(D2, A2, B2, C2, in(13),  5); \
 		HH2(C2, D2, A2, B2, in( 5),  7); \
 		HH2(B2, C2, D2, A2, in(12),  5); \
 \
 		tmp = SPH_T32((h)[1] + C1 + D2); \
 		(h)[1] = SPH_T32((h)[2] + D1 + A2); \
 		(h)[2] = SPH_T32((h)[3] + A1 + B2); \
 		(h)[3] = SPH_T32((h)[0] + B1 + C2); \
 		(h)[0] = tmp; \
 	} while (0)
 /*
 * One round of RIPEMD. The data must be aligned for 32-bit access.
 */
 static void
 ripemd_round(const unsigned char *data, sph_u32 r[5])
 {
 #if SPH_LITTLE_FAST
 #define RIPEMD_IN(x)   sph_dec32le_aligned(data + (4 * (x)))
 #else
 	sph_u32 X_var[16];
 	int i;
 	for (i = 0; i < 16; i ++)
 		X_var[i] = sph_dec32le_aligned(data + 4 * i);
 #define RIPEMD_IN(x)   X_var[x]
 #endif
 	RIPEMD_ROUND_BODY(RIPEMD_IN, r);
 #undef RIPEMD_IN
 }
 /* see sph_ripemd.h */
 void
 sph_ripemd_init(void *cc)
 {
 	sph_ripemd_context *sc;
 	sc = cc;
 	memcpy(sc->val, oIV, sizeof sc->val);
 #if SPH_64
 	sc->count = 0;
 #else
 	sc->count_high = sc->count_low = 0;
 #endif
 }
 #define RFUN   ripemd_round
 #define HASH   ripemd
 #define LE32   1
 #include "md_helper.c"
 #undef RFUN
 #undef HASH
 #undef LE32
 /* see sph_ripemd.h */
 void
 sph_ripemd_close(void *cc, void *dst)
 {
 	ripemd_close(cc, dst, 4);
 	sph_ripemd_init(cc);
 }
 /* see sph_ripemd.h */
 void
 sph_ripemd_comp(const sph_u32 msg[16], sph_u32 val[4])
 {
 #define RIPEMD_IN(x)   msg[x]
 	RIPEMD_ROUND_BODY(RIPEMD_IN, val);
 #undef RIPEMD_IN
 }
 /* ===================================================================== */
 /*
 * RIPEMD-128.
 */
 /*
 * Round constants for RIPEMD-128.
 */
 #define sK11   SPH_C32(0x00000000)
 #define sK12   SPH_C32(0x5A827999)
 #define sK13   SPH_C32(0x6ED9EBA1)
 #define sK14   SPH_C32(0x8F1BBCDC)
 #define sK21   SPH_C32(0x50A28BE6)
 #define sK22   SPH_C32(0x5C4DD124)
 #define sK23   SPH_C32(0x6D703EF3)
 #define sK24   SPH_C32(0x00000000)
 #define sRR(a, b, c, d, f, s, r, k)   do { \
 		a = ROTL(SPH_T32(a + f(b, c, d) + r + k), s); \
 	} while (0)
 #define sROUND1(a, b, c, d, f, s, r, k)  \
 	sRR(a ## 1, b ## 1, c ## 1, d ## 1, f, s, r, sK1 ## k)
 #define sROUND2(a, b, c, d, f, s, r, k)  \
 	sRR(a ## 2, b ## 2, c ## 2, d ## 2, f, s, r, sK2 ## k)
 /*
 * This macro defines the body for a RIPEMD-128 compression function
 * implementation. The "in" parameter should evaluate, when applied to a
 * numerical input parameter from 0 to 15, to an expression which yields
 * the corresponding input block. The "h" parameter should evaluate to
 * an array or pointer expression designating the array of 4 words which
 * contains the input and output of the compression function.
 */
 #define RIPEMD128_ROUND_BODY(in, h)   do { \
 		sph_u32 A1, B1, C1, D1; \
 		sph_u32 A2, B2, C2, D2; \
 		sph_u32 tmp; \
 \
 		A1 = A2 = (h)[0]; \
 		B1 = B2 = (h)[1]; \
 		C1 = C2 = (h)[2]; \
 		D1 = D2 = (h)[3]; \
 \
 		sROUND1(A, B, C, D, F1, 11, in( 0),  1); \
 		sROUND1(D, A, B, C, F1, 14, in( 1),  1); \
 		sROUND1(C, D, A, B, F1, 15, in( 2),  1); \
 		sROUND1(B, C, D, A, F1, 12, in( 3),  1); \
 		sROUND1(A, B, C, D, F1,  5, in( 4),  1); \
 		sROUND1(D, A, B, C, F1,  8, in( 5),  1); \
 		sROUND1(C, D, A, B, F1,  7, in( 6),  1); \
 		sROUND1(B, C, D, A, F1,  9, in( 7),  1); \
 		sROUND1(A, B, C, D, F1, 11, in( 8),  1); \
 		sROUND1(D, A, B, C, F1, 13, in( 9),  1); \
 		sROUND1(C, D, A, B, F1, 14, in(10),  1); \
 		sROUND1(B, C, D, A, F1, 15, in(11),  1); \
 		sROUND1(A, B, C, D, F1,  6, in(12),  1); \
 		sROUND1(D, A, B, C, F1,  7, in(13),  1); \
 		sROUND1(C, D, A, B, F1,  9, in(14),  1); \
 		sROUND1(B, C, D, A, F1,  8, in(15),  1); \
 \
 		sROUND1(A, B, C, D, F2,  7, in( 7),  2); \
 		sROUND1(D, A, B, C, F2,  6, in( 4),  2); \
 		sROUND1(C, D, A, B, F2,  8, in(13),  2); \
 		sROUND1(B, C, D, A, F2, 13, in( 1),  2); \
 		sROUND1(A, B, C, D, F2, 11, in(10),  2); \
 		sROUND1(D, A, B, C, F2,  9, in( 6),  2); \
 		sROUND1(C, D, A, B, F2,  7, in(15),  2); \
 		sROUND1(B, C, D, A, F2, 15, in( 3),  2); \
 		sROUND1(A, B, C, D, F2,  7, in(12),  2); \
 		sROUND1(D, A, B, C, F2, 12, in( 0),  2); \
 		sROUND1(C, D, A, B, F2, 15, in( 9),  2); \
 		sROUND1(B, C, D, A, F2,  9, in( 5),  2); \
 		sROUND1(A, B, C, D, F2, 11, in( 2),  2); \
 		sROUND1(D, A, B, C, F2,  7, in(14),  2); \
 		sROUND1(C, D, A, B, F2, 13, in(11),  2); \
 		sROUND1(B, C, D, A, F2, 12, in( 8),  2); \
 \
 		sROUND1(A, B, C, D, F3, 11, in( 3),  3); \
 		sROUND1(D, A, B, C, F3, 13, in(10),  3); \
 		sROUND1(C, D, A, B, F3,  6, in(14),  3); \
 		sROUND1(B, C, D, A, F3,  7, in( 4),  3); \
 		sROUND1(A, B, C, D, F3, 14, in( 9),  3); \
 		sROUND1(D, A, B, C, F3,  9, in(15),  3); \
 		sROUND1(C, D, A, B, F3, 13, in( 8),  3); \
 		sROUND1(B, C, D, A, F3, 15, in( 1),  3); \
 		sROUND1(A, B, C, D, F3, 14, in( 2),  3); \
 		sROUND1(D, A, B, C, F3,  8, in( 7),  3); \
 		sROUND1(C, D, A, B, F3, 13, in( 0),  3); \
 		sROUND1(B, C, D, A, F3,  6, in( 6),  3); \
 		sROUND1(A, B, C, D, F3,  5, in(13),  3); \
 		sROUND1(D, A, B, C, F3, 12, in(11),  3); \
 		sROUND1(C, D, A, B, F3,  7, in( 5),  3); \
 		sROUND1(B, C, D, A, F3,  5, in(12),  3); \
 \
 		sROUND1(A, B, C, D, F4, 11, in( 1),  4); \
 		sROUND1(D, A, B, C, F4, 12, in( 9),  4); \
 		sROUND1(C, D, A, B, F4, 14, in(11),  4); \
 		sROUND1(B, C, D, A, F4, 15, in(10),  4); \
 		sROUND1(A, B, C, D, F4, 14, in( 0),  4); \
 		sROUND1(D, A, B, C, F4, 15, in( 8),  4); \
 		sROUND1(C, D, A, B, F4,  9, in(12),  4); \
 		sROUND1(B, C, D, A, F4,  8, in( 4),  4); \
 		sROUND1(A, B, C, D, F4,  9, in(13),  4); \
 		sROUND1(D, A, B, C, F4, 14, in( 3),  4); \
 		sROUND1(C, D, A, B, F4,  5, in( 7),  4); \
 		sROUND1(B, C, D, A, F4,  6, in(15),  4); \
 		sROUND1(A, B, C, D, F4,  8, in(14),  4); \
 		sROUND1(D, A, B, C, F4,  6, in( 5),  4); \
 		sROUND1(C, D, A, B, F4,  5, in( 6),  4); \
 		sROUND1(B, C, D, A, F4, 12, in( 2),  4); \
 \
 		sROUND2(A, B, C, D, F4,  8, in( 5),  1); \
 		sROUND2(D, A, B, C, F4,  9, in(14),  1); \
 		sROUND2(C, D, A, B, F4,  9, in( 7),  1); \
 		sROUND2(B, C, D, A, F4, 11, in( 0),  1); \
 		sROUND2(A, B, C, D, F4, 13, in( 9),  1); \
 		sROUND2(D, A, B, C, F4, 15, in( 2),  1); \
 		sROUND2(C, D, A, B, F4, 15, in(11),  1); \
 		sROUND2(B, C, D, A, F4,  5, in( 4),  1); \
 		sROUND2(A, B, C, D, F4,  7, in(13),  1); \
 		sROUND2(D, A, B, C, F4,  7, in( 6),  1); \
 		sROUND2(C, D, A, B, F4,  8, in(15),  1); \
 		sROUND2(B, C, D, A, F4, 11, in( 8),  1); \
 		sROUND2(A, B, C, D, F4, 14, in( 1),  1); \
 		sROUND2(D, A, B, C, F4, 14, in(10),  1); \
 		sROUND2(C, D, A, B, F4, 12, in( 3),  1); \
 		sROUND2(B, C, D, A, F4,  6, in(12),  1); \
 \
 		sROUND2(A, B, C, D, F3,  9, in( 6),  2); \
 		sROUND2(D, A, B, C, F3, 13, in(11),  2); \
 		sROUND2(C, D, A, B, F3, 15, in( 3),  2); \
 		sROUND2(B, C, D, A, F3,  7, in( 7),  2); \
 		sROUND2(A, B, C, D, F3, 12, in( 0),  2); \
 		sROUND2(D, A, B, C, F3,  8, in(13),  2); \
 		sROUND2(C, D, A, B, F3,  9, in( 5),  2); \
 		sROUND2(B, C, D, A, F3, 11, in(10),  2); \
 		sROUND2(A, B, C, D, F3,  7, in(14),  2); \
 		sROUND2(D, A, B, C, F3,  7, in(15),  2); \
 		sROUND2(C, D, A, B, F3, 12, in( 8),  2); \
 		sROUND2(B, C, D, A, F3,  7, in(12),  2); \
 		sROUND2(A, B, C, D, F3,  6, in( 4),  2); \
 		sROUND2(D, A, B, C, F3, 15, in( 9),  2); \
 		sROUND2(C, D, A, B, F3, 13, in( 1),  2); \
 		sROUND2(B, C, D, A, F3, 11, in( 2),  2); \
 \
 		sROUND2(A, B, C, D, F2,  9, in(15),  3); \
 		sROUND2(D, A, B, C, F2,  7, in( 5),  3); \
 		sROUND2(C, D, A, B, F2, 15, in( 1),  3); \
 		sROUND2(B, C, D, A, F2, 11, in( 3),  3); \
 		sROUND2(A, B, C, D, F2,  8, in( 7),  3); \
 		sROUND2(D, A, B, C, F2,  6, in(14),  3); \
 		sROUND2(C, D, A, B, F2,  6, in( 6),  3); \
 		sROUND2(B, C, D, A, F2, 14, in( 9),  3); \
 		sROUND2(A, B, C, D, F2, 12, in(11),  3); \
 		sROUND2(D, A, B, C, F2, 13, in( 8),  3); \
 		sROUND2(C, D, A, B, F2,  5, in(12),  3); \
 		sROUND2(B, C, D, A, F2, 14, in( 2),  3); \
 		sROUND2(A, B, C, D, F2, 13, in(10),  3); \
 		sROUND2(D, A, B, C, F2, 13, in( 0),  3); \
 		sROUND2(C, D, A, B, F2,  7, in( 4),  3); \
 		sROUND2(B, C, D, A, F2,  5, in(13),  3); \
 \
 		sROUND2(A, B, C, D, F1, 15, in( 8),  4); \
 		sROUND2(D, A, B, C, F1,  5, in( 6),  4); \
 		sROUND2(C, D, A, B, F1,  8, in( 4),  4); \
 		sROUND2(B, C, D, A, F1, 11, in( 1),  4); \
 		sROUND2(A, B, C, D, F1, 14, in( 3),  4); \
 		sROUND2(D, A, B, C, F1, 14, in(11),  4); \
 		sROUND2(C, D, A, B, F1,  6, in(15),  4); \
 		sROUND2(B, C, D, A, F1, 14, in( 0),  4); \
 		sROUND2(A, B, C, D, F1,  6, in( 5),  4); \
 		sROUND2(D, A, B, C, F1,  9, in(12),  4); \
 		sROUND2(C, D, A, B, F1, 12, in( 2),  4); \
 		sROUND2(B, C, D, A, F1,  9, in(13),  4); \
 		sROUND2(A, B, C, D, F1, 12, in( 9),  4); \
 		sROUND2(D, A, B, C, F1,  5, in( 7),  4); \
 		sROUND2(C, D, A, B, F1, 15, in(10),  4); \
 		sROUND2(B, C, D, A, F1,  8, in(14),  4); \
 \
 		tmp = SPH_T32((h)[1] + C1 + D2); \
 		(h)[1] = SPH_T32((h)[2] + D1 + A2); \
 		(h)[2] = SPH_T32((h)[3] + A1 + B2); \
 		(h)[3] = SPH_T32((h)[0] + B1 + C2); \
 		(h)[0] = tmp; \
 	} while (0)
 /*
 * One round of RIPEMD-128. The data must be aligned for 32-bit access.
 */
 static void
 ripemd128_round(const unsigned char *data, sph_u32 r[5])
 {
 #if SPH_LITTLE_FAST
 #define RIPEMD128_IN(x)   sph_dec32le_aligned(data + (4 * (x)))
 #else
 	sph_u32 X_var[16];
 	int i;
 	for (i = 0; i < 16; i ++)
 		X_var[i] = sph_dec32le_aligned(data + 4 * i);
 #define RIPEMD128_IN(x)   X_var[x]
 #endif
 	RIPEMD128_ROUND_BODY(RIPEMD128_IN, r);
 #undef RIPEMD128_IN
 }
 /* see sph_ripemd.h */
 void
 sph_ripemd128_init(void *cc)
 {
 	sph_ripemd128_context *sc;
 	sc = cc;
 	memcpy(sc->val, IV, sizeof sc->val);
 #if SPH_64
 	sc->count = 0;
 #else
 	sc->count_high = sc->count_low = 0;
 #endif
 }
 #define RFUN   ripemd128_round
 #define HASH   ripemd128
 #define LE32   1
 #include "md_helper.c"
 #undef RFUN
 #undef HASH
 #undef LE32
 /* see sph_ripemd.h */
 void
 sph_ripemd128_close(void *cc, void *dst)
 {
 	ripemd128_close(cc, dst, 4);
 	sph_ripemd128_init(cc);
 }
 /* see sph_ripemd.h */
 void
 sph_ripemd128_comp(const sph_u32 msg[16], sph_u32 val[4])
 {
 #define RIPEMD128_IN(x)   msg[x]
 	RIPEMD128_ROUND_BODY(RIPEMD128_IN, val);
 #undef RIPEMD128_IN
 }
 /* ===================================================================== */
 /*
 * RIPEMD-160.
 */
 /*
 * Round constants for RIPEMD-160.
 */
 #define K11    SPH_C32(0x00000000)
 #define K12    SPH_C32(0x5A827999)
 #define K13    SPH_C32(0x6ED9EBA1)
 #define K14    SPH_C32(0x8F1BBCDC)
 #define K15    SPH_C32(0xA953FD4E)
 #define K21    SPH_C32(0x50A28BE6)
 #define K22    SPH_C32(0x5C4DD124)
 #define K23    SPH_C32(0x6D703EF3)
 #define K24    SPH_C32(0x7A6D76E9)
 #define K25    SPH_C32(0x00000000)
 #define RR(a, b, c, d, e, f, s, r, k)   do { \
 		a = SPH_T32(ROTL(SPH_T32(a + f(b, c, d) + r + k), s) + e); \
 		c = ROTL(c, 10); \
 	} while (0)
 #define ROUND1(a, b, c, d, e, f, s, r, k)  \
 	RR(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k)
 #define ROUND2(a, b, c, d, e, f, s, r, k)  \
 	RR(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
 /*
 * This macro defines the body for a RIPEMD-160 compression function
 * implementation. The "in" parameter should evaluate, when applied to a
 * numerical input parameter from 0 to 15, to an expression which yields
 * the corresponding input block. The "h" parameter should evaluate to
 * an array or pointer expression designating the array of 5 words which
 * contains the input and output of the compression function.
 */
 #define RIPEMD160_ROUND_BODY(in, h)   do { \
 		sph_u32 A1, B1, C1, D1, E1; \
 		sph_u32 A2, B2, C2, D2, E2; \
 		sph_u32 tmp; \
 \
 		A1 = A2 = (h)[0]; \
 		B1 = B2 = (h)[1]; \
 		C1 = C2 = (h)[2]; \
 		D1 = D2 = (h)[3]; \
 		E1 = E2 = (h)[4]; \
 \
 		ROUND1(A, B, C, D, E, F1, 11, in( 0),  1); \
 		ROUND1(E, A, B, C, D, F1, 14, in( 1),  1); \
 		ROUND1(D, E, A, B, C, F1, 15, in( 2),  1); \
 		ROUND1(C, D, E, A, B, F1, 12, in( 3),  1); \
 		ROUND1(B, C, D, E, A, F1,  5, in( 4),  1); \
 		ROUND1(A, B, C, D, E, F1,  8, in( 5),  1); \
 		ROUND1(E, A, B, C, D, F1,  7, in( 6),  1); \
 		ROUND1(D, E, A, B, C, F1,  9, in( 7),  1); \
 		ROUND1(C, D, E, A, B, F1, 11, in( 8),  1); \
 		ROUND1(B, C, D, E, A, F1, 13, in( 9),  1); \
 		ROUND1(A, B, C, D, E, F1, 14, in(10),  1); \
 		ROUND1(E, A, B, C, D, F1, 15, in(11),  1); \
 		ROUND1(D, E, A, B, C, F1,  6, in(12),  1); \
 		ROUND1(C, D, E, A, B, F1,  7, in(13),  1); \
 		ROUND1(B, C, D, E, A, F1,  9, in(14),  1); \
 		ROUND1(A, B, C, D, E, F1,  8, in(15),  1); \
 \
 		ROUND1(E, A, B, C, D, F2,  7, in( 7),  2); \
 		ROUND1(D, E, A, B, C, F2,  6, in( 4),  2); \
 		ROUND1(C, D, E, A, B, F2,  8, in(13),  2); \
 		ROUND1(B, C, D, E, A, F2, 13, in( 1),  2); \
 		ROUND1(A, B, C, D, E, F2, 11, in(10),  2); \
 		ROUND1(E, A, B, C, D, F2,  9, in( 6),  2); \
 		ROUND1(D, E, A, B, C, F2,  7, in(15),  2); \
 		ROUND1(C, D, E, A, B, F2, 15, in( 3),  2); \
 		ROUND1(B, C, D, E, A, F2,  7, in(12),  2); \
 		ROUND1(A, B, C, D, E, F2, 12, in( 0),  2); \
 		ROUND1(E, A, B, C, D, F2, 15, in( 9),  2); \
 		ROUND1(D, E, A, B, C, F2,  9, in( 5),  2); \
 		ROUND1(C, D, E, A, B, F2, 11, in( 2),  2); \
 		ROUND1(B, C, D, E, A, F2,  7, in(14),  2); \
 		ROUND1(A, B, C, D, E, F2, 13, in(11),  2); \
 		ROUND1(E, A, B, C, D, F2, 12, in( 8),  2); \
 \
 		ROUND1(D, E, A, B, C, F3, 11, in( 3),  3); \
 		ROUND1(C, D, E, A, B, F3, 13, in(10),  3); \
 		ROUND1(B, C, D, E, A, F3,  6, in(14),  3); \
 		ROUND1(A, B, C, D, E, F3,  7, in( 4),  3); \
 		ROUND1(E, A, B, C, D, F3, 14, in( 9),  3); \
 		ROUND1(D, E, A, B, C, F3,  9, in(15),  3); \
 		ROUND1(C, D, E, A, B, F3, 13, in( 8),  3); \
 		ROUND1(B, C, D, E, A, F3, 15, in( 1),  3); \
 		ROUND1(A, B, C, D, E, F3, 14, in( 2),  3); \
 		ROUND1(E, A, B, C, D, F3,  8, in( 7),  3); \
 		ROUND1(D, E, A, B, C, F3, 13, in( 0),  3); \
 		ROUND1(C, D, E, A, B, F3,  6, in( 6),  3); \
 		ROUND1(B, C, D, E, A, F3,  5, in(13),  3); \
 		ROUND1(A, B, C, D, E, F3, 12, in(11),  3); \
 		ROUND1(E, A, B, C, D, F3,  7, in( 5),  3); \
 		ROUND1(D, E, A, B, C, F3,  5, in(12),  3); \
 \
 		ROUND1(C, D, E, A, B, F4, 11, in( 1),  4); \
 		ROUND1(B, C, D, E, A, F4, 12, in( 9),  4); \
 		ROUND1(A, B, C, D, E, F4, 14, in(11),  4); \
 		ROUND1(E, A, B, C, D, F4, 15, in(10),  4); \
 		ROUND1(D, E, A, B, C, F4, 14, in( 0),  4); \
 		ROUND1(C, D, E, A, B, F4, 15, in( 8),  4); \
 		ROUND1(B, C, D, E, A, F4,  9, in(12),  4); \
 		ROUND1(A, B, C, D, E, F4,  8, in( 4),  4); \
 		ROUND1(E, A, B, C, D, F4,  9, in(13),  4); \
 		ROUND1(D, E, A, B, C, F4, 14, in( 3),  4); \
 		ROUND1(C, D, E, A, B, F4,  5, in( 7),  4); \
 		ROUND1(B, C, D, E, A, F4,  6, in(15),  4); \
 		ROUND1(A, B, C, D, E, F4,  8, in(14),  4); \
 		ROUND1(E, A, B, C, D, F4,  6, in( 5),  4); \
 		ROUND1(D, E, A, B, C, F4,  5, in( 6),  4); \
 		ROUND1(C, D, E, A, B, F4, 12, in( 2),  4); \
 \
 		ROUND1(B, C, D, E, A, F5,  9, in( 4),  5); \
 		ROUND1(A, B, C, D, E, F5, 15, in( 0),  5); \
 		ROUND1(E, A, B, C, D, F5,  5, in( 5),  5); \
 		ROUND1(D, E, A, B, C, F5, 11, in( 9),  5); \
 		ROUND1(C, D, E, A, B, F5,  6, in( 7),  5); \
 		ROUND1(B, C, D, E, A, F5,  8, in(12),  5); \
 		ROUND1(A, B, C, D, E, F5, 13, in( 2),  5); \
 		ROUND1(E, A, B, C, D, F5, 12, in(10),  5); \
 		ROUND1(D, E, A, B, C, F5,  5, in(14),  5); \
 		ROUND1(C, D, E, A, B, F5, 12, in( 1),  5); \
 		ROUND1(B, C, D, E, A, F5, 13, in( 3),  5); \
 		ROUND1(A, B, C, D, E, F5, 14, in( 8),  5); \
 		ROUND1(E, A, B, C, D, F5, 11, in(11),  5); \
 		ROUND1(D, E, A, B, C, F5,  8, in( 6),  5); \
 		ROUND1(C, D, E, A, B, F5,  5, in(15),  5); \
 		ROUND1(B, C, D, E, A, F5,  6, in(13),  5); \
 \
 		ROUND2(A, B, C, D, E, F5,  8, in( 5),  1); \
 		ROUND2(E, A, B, C, D, F5,  9, in(14),  1); \
 		ROUND2(D, E, A, B, C, F5,  9, in( 7),  1); \
 		ROUND2(C, D, E, A, B, F5, 11, in( 0),  1); \
 		ROUND2(B, C, D, E, A, F5, 13, in( 9),  1); \
 		ROUND2(A, B, C, D, E, F5, 15, in( 2),  1); \
 		ROUND2(E, A, B, C, D, F5, 15, in(11),  1); \
 		ROUND2(D, E, A, B, C, F5,  5, in( 4),  1); \
 		ROUND2(C, D, E, A, B, F5,  7, in(13),  1); \
 		ROUND2(B, C, D, E, A, F5,  7, in( 6),  1); \
 		ROUND2(A, B, C, D, E, F5,  8, in(15),  1); \
 		ROUND2(E, A, B, C, D, F5, 11, in( 8),  1); \
 		ROUND2(D, E, A, B, C, F5, 14, in( 1),  1); \
 		ROUND2(C, D, E, A, B, F5, 14, in(10),  1); \
 		ROUND2(B, C, D, E, A, F5, 12, in( 3),  1); \
 		ROUND2(A, B, C, D, E, F5,  6, in(12),  1); \
 \
 		ROUND2(E, A, B, C, D, F4,  9, in( 6),  2); \
 		ROUND2(D, E, A, B, C, F4, 13, in(11),  2); \
 		ROUND2(C, D, E, A, B, F4, 15, in( 3),  2); \
 		ROUND2(B, C, D, E, A, F4,  7, in( 7),  2); \
 		ROUND2(A, B, C, D, E, F4, 12, in( 0),  2); \
 		ROUND2(E, A, B, C, D, F4,  8, in(13),  2); \
 		ROUND2(D, E, A, B, C, F4,  9, in( 5),  2); \
 		ROUND2(C, D, E, A, B, F4, 11, in(10),  2); \
 		ROUND2(B, C, D, E, A, F4,  7, in(14),  2); \
 		ROUND2(A, B, C, D, E, F4,  7, in(15),  2); \
 		ROUND2(E, A, B, C, D, F4, 12, in( 8),  2); \
 		ROUND2(D, E, A, B, C, F4,  7, in(12),  2); \
 		ROUND2(C, D, E, A, B, F4,  6, in( 4),  2); \
 		ROUND2(B, C, D, E, A, F4, 15, in( 9),  2); \
 		ROUND2(A, B, C, D, E, F4, 13, in( 1),  2); \
 		ROUND2(E, A, B, C, D, F4, 11, in( 2),  2); \
 \
 		ROUND2(D, E, A, B, C, F3,  9, in(15),  3); \
 		ROUND2(C, D, E, A, B, F3,  7, in( 5),  3); \
 		ROUND2(B, C, D, E, A, F3, 15, in( 1),  3); \
 		ROUND2(A, B, C, D, E, F3, 11, in( 3),  3); \
 		ROUND2(E, A, B, C, D, F3,  8, in( 7),  3); \
 		ROUND2(D, E, A, B, C, F3,  6, in(14),  3); \
 		ROUND2(C, D, E, A, B, F3,  6, in( 6),  3); \
 		ROUND2(B, C, D, E, A, F3, 14, in( 9),  3); \
 		ROUND2(A, B, C, D, E, F3, 12, in(11),  3); \
 		ROUND2(E, A, B, C, D, F3, 13, in( 8),  3); \
 		ROUND2(D, E, A, B, C, F3,  5, in(12),  3); \
 		ROUND2(C, D, E, A, B, F3, 14, in( 2),  3); \
 		ROUND2(B, C, D, E, A, F3, 13, in(10),  3); \
 		ROUND2(A, B, C, D, E, F3, 13, in( 0),  3); \
 		ROUND2(E, A, B, C, D, F3,  7, in( 4),  3); \
 		ROUND2(D, E, A, B, C, F3,  5, in(13),  3); \
 \
 		ROUND2(C, D, E, A, B, F2, 15, in( 8),  4); \
 		ROUND2(B, C, D, E, A, F2,  5, in( 6),  4); \
 		ROUND2(A, B, C, D, E, F2,  8, in( 4),  4); \
 		ROUND2(E, A, B, C, D, F2, 11, in( 1),  4); \
 		ROUND2(D, E, A, B, C, F2, 14, in( 3),  4); \
 		ROUND2(C, D, E, A, B, F2, 14, in(11),  4); \
 		ROUND2(B, C, D, E, A, F2,  6, in(15),  4); \
 		ROUND2(A, B, C, D, E, F2, 14, in( 0),  4); \
 		ROUND2(E, A, B, C, D, F2,  6, in( 5),  4); \
 		ROUND2(D, E, A, B, C, F2,  9, in(12),  4); \
 		ROUND2(C, D, E, A, B, F2, 12, in( 2),  4); \
 		ROUND2(B, C, D, E, A, F2,  9, in(13),  4); \
 		ROUND2(A, B, C, D, E, F2, 12, in( 9),  4); \
 		ROUND2(E, A, B, C, D, F2,  5, in( 7),  4); \
 		ROUND2(D, E, A, B, C, F2, 15, in(10),  4); \
 		ROUND2(C, D, E, A, B, F2,  8, in(14),  4); \
 \
 		ROUND2(B, C, D, E, A, F1,  8, in(12),  5); \
 		ROUND2(A, B, C, D, E, F1,  5, in(15),  5); \
 		ROUND2(E, A, B, C, D, F1, 12, in(10),  5); \
 		ROUND2(D, E, A, B, C, F1,  9, in( 4),  5); \
 		ROUND2(C, D, E, A, B, F1, 12, in( 1),  5); \
 		ROUND2(B, C, D, E, A, F1,  5, in( 5),  5); \
 		ROUND2(A, B, C, D, E, F1, 14, in( 8),  5); \
 		ROUND2(E, A, B, C, D, F1,  6, in( 7),  5); \
 		ROUND2(D, E, A, B, C, F1,  8, in( 6),  5); \
 		ROUND2(C, D, E, A, B, F1, 13, in( 2),  5); \
 		ROUND2(B, C, D, E, A, F1,  6, in(13),  5); \
 		ROUND2(A, B, C, D, E, F1,  5, in(14),  5); \
 		ROUND2(E, A, B, C, D, F1, 15, in( 0),  5); \
 		ROUND2(D, E, A, B, C, F1, 13, in( 3),  5); \
 		ROUND2(C, D, E, A, B, F1, 11, in( 9),  5); \
 		ROUND2(B, C, D, E, A, F1, 11, in(11),  5); \
 \
 		tmp = SPH_T32((h)[1] + C1 + D2); \
 		(h)[1] = SPH_T32((h)[2] + D1 + E2); \
 		(h)[2] = SPH_T32((h)[3] + E1 + A2); \
 		(h)[3] = SPH_T32((h)[4] + A1 + B2); \
 		(h)[4] = SPH_T32((h)[0] + B1 + C2); \
 		(h)[0] = tmp; \
 	} while (0)
 /*
 * One round of RIPEMD-160. The data must be aligned for 32-bit access.
 */
 static void
 ripemd160_round(const unsigned char *data, sph_u32 r[5])
 {
 #if SPH_LITTLE_FAST
 #define RIPEMD160_IN(x)   sph_dec32le_aligned(data + (4 * (x)))
 #else
 	sph_u32 X_var[16];
 	int i;
 	for (i = 0; i < 16; i ++)
 		X_var[i] = sph_dec32le_aligned(data + 4 * i);
 #define RIPEMD160_IN(x)   X_var[x]
 #endif
 	RIPEMD160_ROUND_BODY(RIPEMD160_IN, r);
 #undef RIPEMD160_IN
 }
 /* see sph_ripemd.h */
 void
 sph_ripemd160_init(void *cc)
 {
 	sph_ripemd160_context *sc;
 	sc = cc;
 	memcpy(sc->val, IV, sizeof sc->val);
 #if SPH_64
 	sc->count = 0;
 #else
 	sc->count_high = sc->count_low = 0;
 #endif
 }
 #define RFUN   ripemd160_round
 #define HASH   ripemd160
 #define LE32   1
 #include "md_helper.c"
 #undef RFUN
 #undef HASH
 #undef LE32
 /* see sph_ripemd.h */
 void
 sph_ripemd160_close(void *cc, void *dst)
 {
 	ripemd160_close(cc, dst, 5);
 	sph_ripemd160_init(cc);
 }
 /* see sph_ripemd.h */
 void
 sph_ripemd160_comp(const sph_u32 msg[16], sph_u32 val[5])
 {
 #define RIPEMD160_IN(x)   msg[x]
 	RIPEMD160_ROUND_BODY(RIPEMD160_IN, val);
 #undef RIPEMD160_IN
 }
--- a/sph/sph_ripemd.h
+++ b/sph/sph_ripemd.h
@ -0,0 +1,273 @@
 /* $Id: sph_ripemd.h 216 2010-06-08 09:46:57Z tp $ */
 /**
 * RIPEMD, RIPEMD-128 and RIPEMD-160 interface.
 *
 * RIPEMD was first described in: Research and Development in Advanced
 * Communication Technologies in Europe, "RIPE Integrity Primitives:
 * Final Report of RACE Integrity Primitives Evaluation (R1040)", RACE,
 * June 1992.
 *
 * A new, strengthened version, dubbed RIPEMD-160, was published in: H.
 * Dobbertin, A. Bosselaers, and B. Preneel, "RIPEMD-160, a strengthened
 * version of RIPEMD", Fast Software Encryption - FSE'96, LNCS 1039,
 * Springer (1996), pp. 71--82.
 *
 * This article describes both RIPEMD-160, with a 160-bit output, and a
 * reduced version called RIPEMD-128, which has a 128-bit output. RIPEMD-128
 * was meant as a "drop-in" replacement for any hash function with 128-bit
 * output, especially the original RIPEMD.
 *
 * @warning   Collisions, and an efficient method to build other collisions,
 * have been published for the original RIPEMD, which is thus considered as
 * cryptographically broken. It is also very rarely encountered, and there
 * seems to exist no free description or implementation of RIPEMD (except
 * the sphlib code, of course). As of january 2007, RIPEMD-128 and RIPEMD-160
 * seem as secure as their output length allows.
 *
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2007-2010  Projet RNRT SAPHIR
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * ===========================(LICENSE END)=============================
 *
 * @file     sph_ripemd.h
 * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
 */
 #ifndef SPH_RIPEMD_H__
 #define SPH_RIPEMD_H__
 #include <stddef.h>
 #include "sph_types.h"
 /**
 * Output size (in bits) for RIPEMD.
 */
 #define SPH_SIZE_ripemd   128
 /**
 * Output size (in bits) for RIPEMD-128.
 */
 #define SPH_SIZE_ripemd128   128
 /**
 * Output size (in bits) for RIPEMD-160.
 */
 #define SPH_SIZE_ripemd160   160
 /**
 * This structure is a context for RIPEMD computations: it contains the
 * intermediate values and some data from the last entered block. Once
 * a RIPEMD computation has been performed, the context can be reused for
 * another computation.
 *
 * The contents of this structure are private. A running RIPEMD computation
 * can be cloned by copying the context (e.g. with a simple
 * <code>memcpy()</code>).
 */
 typedef struct {
 #ifndef DOXYGEN_IGNORE
 	unsigned char buf[64];    /* first field, for alignment */
 	sph_u32 val[4];
 #if SPH_64
 	sph_u64 count;
 #else
 	sph_u32 count_high, count_low;
 #endif
 #endif
 } sph_ripemd_context;
 /**
 * Initialize a RIPEMD context. This process performs no memory allocation.
 *
 * @param cc   the RIPEMD context (pointer to
 *             a <code>sph_ripemd_context</code>)
 */
 void sph_ripemd_init(void *cc);
 /**
 * Process some data bytes. It is acceptable that <code>len</code> is zero
 * (in which case this function does nothing).
 *
 * @param cc     the RIPEMD context
 * @param data   the input data
 * @param len    the input data length (in bytes)
 */
 void sph_ripemd(void *cc, const void *data, size_t len);
 /**
 * Terminate the current RIPEMD computation and output the result into the
 * provided buffer. The destination buffer must be wide enough to
 * accomodate the result (16 bytes). The context is automatically
 * reinitialized.
 *
 * @param cc    the RIPEMD context
 * @param dst   the destination buffer
 */
 void sph_ripemd_close(void *cc, void *dst);
 /**
 * Apply the RIPEMD compression function on the provided data. The
 * <code>msg</code> parameter contains the 16 32-bit input blocks,
 * as numerical values (hence after the little-endian decoding). The
 * <code>val</code> parameter contains the 5 32-bit input blocks for
 * the compression function; the output is written in place in this
 * array.
 *
 * @param msg   the message block (16 values)
 * @param val   the function 128-bit input and output
 */
 void sph_ripemd_comp(const sph_u32 msg[16], sph_u32 val[4]);
 /* ===================================================================== */
 /**
 * This structure is a context for RIPEMD-128 computations: it contains the
 * intermediate values and some data from the last entered block. Once
 * a RIPEMD-128 computation has been performed, the context can be reused for
 * another computation.
 *
 * The contents of this structure are private. A running RIPEMD-128 computation
 * can be cloned by copying the context (e.g. with a simple
 * <code>memcpy()</code>).
 */
 typedef struct {
 #ifndef DOXYGEN_IGNORE
 	unsigned char buf[64];    /* first field, for alignment */
 	sph_u32 val[4];
 #if SPH_64
 	sph_u64 count;
 #else
 	sph_u32 count_high, count_low;
 #endif
 #endif
 } sph_ripemd128_context;
 /**
 * Initialize a RIPEMD-128 context. This process performs no memory allocation.
 *
 * @param cc   the RIPEMD-128 context (pointer to
 *             a <code>sph_ripemd128_context</code>)
 */
 void sph_ripemd128_init(void *cc);
 /**
 * Process some data bytes. It is acceptable that <code>len</code> is zero
 * (in which case this function does nothing).
 *
 * @param cc     the RIPEMD-128 context
 * @param data   the input data
 * @param len    the input data length (in bytes)
 */
 void sph_ripemd128(void *cc, const void *data, size_t len);
 /**
 * Terminate the current RIPEMD-128 computation and output the result into the
 * provided buffer. The destination buffer must be wide enough to
 * accomodate the result (16 bytes). The context is automatically
 * reinitialized.
 *
 * @param cc    the RIPEMD-128 context
 * @param dst   the destination buffer
 */
 void sph_ripemd128_close(void *cc, void *dst);
 /**
 * Apply the RIPEMD-128 compression function on the provided data. The
 * <code>msg</code> parameter contains the 16 32-bit input blocks,
 * as numerical values (hence after the little-endian decoding). The
 * <code>val</code> parameter contains the 5 32-bit input blocks for
 * the compression function; the output is written in place in this
 * array.
 *
 * @param msg   the message block (16 values)
 * @param val   the function 128-bit input and output
 */
 void sph_ripemd128_comp(const sph_u32 msg[16], sph_u32 val[4]);
 /* ===================================================================== */
 /**
 * This structure is a context for RIPEMD-160 computations: it contains the
 * intermediate values and some data from the last entered block. Once
 * a RIPEMD-160 computation has been performed, the context can be reused for
 * another computation.
 *
 * The contents of this structure are private. A running RIPEMD-160 computation
 * can be cloned by copying the context (e.g. with a simple
 * <code>memcpy()</code>).
 */
 typedef struct {
 #ifndef DOXYGEN_IGNORE
 	unsigned char buf[64];    /* first field, for alignment */
 	sph_u32 val[5];
 #if SPH_64
 	sph_u64 count;
 #else
 	sph_u32 count_high, count_low;
 #endif
 #endif
 } sph_ripemd160_context;
 /**
 * Initialize a RIPEMD-160 context. This process performs no memory allocation.
 *
 * @param cc   the RIPEMD-160 context (pointer to
 *             a <code>sph_ripemd160_context</code>)
 */
 void sph_ripemd160_init(void *cc);
 /**
 * Process some data bytes. It is acceptable that <code>len</code> is zero
 * (in which case this function does nothing).
 *
 * @param cc     the RIPEMD-160 context
 * @param data   the input data
 * @param len    the input data length (in bytes)
 */
 void sph_ripemd160(void *cc, const void *data, size_t len);
 /**
 * Terminate the current RIPEMD-160 computation and output the result into the
 * provided buffer. The destination buffer must be wide enough to
 * accomodate the result (20 bytes). The context is automatically
 * reinitialized.
 *
 * @param cc    the RIPEMD-160 context
 * @param dst   the destination buffer
 */
 void sph_ripemd160_close(void *cc, void *dst);
 /**
 * Apply the RIPEMD-160 compression function on the provided data. The
 * <code>msg</code> parameter contains the 16 32-bit input blocks,
 * as numerical values (hence after the little-endian decoding). The
 * <code>val</code> parameter contains the 5 32-bit input blocks for
 * the compression function; the output is written in place in this
 * array.
 *
 * @param msg   the message block (16 values)
 * @param val   the function 160-bit input and output
 */
 void sph_ripemd160_comp(const sph_u32 msg[16], sph_u32 val[5]);
 #endif
--- a/sph/sph_sha2.c
+++ b/sph/sph_sha2.c
@ -0,0 +1,691 @@
 /* $Id: sha2.c 227 2010-06-16 17:28:38Z tp $ */
 /*
 * SHA-224 / SHA-256 implementation.
 *
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2007-2010  Projet RNRT SAPHIR
 * 
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 * 
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 * 
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * ===========================(LICENSE END)=============================
 *
 * @author   Thomas Pornin <thomas.pornin@cryptolog.com>
 */
 #include <stddef.h>
 #include <string.h>
 #include "sph_sha2.h"
 #if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SHA2
 #define SPH_SMALL_FOOTPRINT_SHA2   1
 #endif
 #define CH(X, Y, Z)    ((((Y) ^ (Z)) & (X)) ^ (Z))
 #define MAJ(X, Y, Z)   (((Y) & (Z)) | (((Y) | (Z)) & (X)))
 #define ROTR    SPH_ROTR32
 #define BSG2_0(x)      (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
 #define BSG2_1(x)      (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
 #define SSG2_0(x)      (ROTR(x, 7) ^ ROTR(x, 18) ^ SPH_T32((x) >> 3))
 #define SSG2_1(x)      (ROTR(x, 17) ^ ROTR(x, 19) ^ SPH_T32((x) >> 10))
 static const sph_u32 H224[8] = {
 	SPH_C32(0xC1059ED8), SPH_C32(0x367CD507), SPH_C32(0x3070DD17),
 	SPH_C32(0xF70E5939), SPH_C32(0xFFC00B31), SPH_C32(0x68581511),
 	SPH_C32(0x64F98FA7), SPH_C32(0xBEFA4FA4)
 };
 static const sph_u32 H256[8] = {
 	SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85), SPH_C32(0x3C6EF372),
 	SPH_C32(0xA54FF53A), SPH_C32(0x510E527F), SPH_C32(0x9B05688C),
 	SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19)
 };
 /*
 * The SHA2_ROUND_BODY defines the body for a SHA-224 / SHA-256
 * compression function implementation. The "in" parameter should
 * evaluate, when applied to a numerical input parameter from 0 to 15,
 * to an expression which yields the corresponding input block. The "r"
 * parameter should evaluate to an array or pointer expression
 * designating the array of 8 words which contains the input and output
 * of the compression function.
 */
 #if SPH_SMALL_FOOTPRINT_SHA2
 static const sph_u32 K[64] = {
 	SPH_C32(0x428A2F98), SPH_C32(0x71374491),
 	SPH_C32(0xB5C0FBCF), SPH_C32(0xE9B5DBA5),
 	SPH_C32(0x3956C25B), SPH_C32(0x59F111F1),
 	SPH_C32(0x923F82A4), SPH_C32(0xAB1C5ED5),
 	SPH_C32(0xD807AA98), SPH_C32(0x12835B01),
 	SPH_C32(0x243185BE), SPH_C32(0x550C7DC3),
 	SPH_C32(0x72BE5D74), SPH_C32(0x80DEB1FE),
 	SPH_C32(0x9BDC06A7), SPH_C32(0xC19BF174),
 	SPH_C32(0xE49B69C1), SPH_C32(0xEFBE4786),
 	SPH_C32(0x0FC19DC6), SPH_C32(0x240CA1CC),
 	SPH_C32(0x2DE92C6F), SPH_C32(0x4A7484AA),
 	SPH_C32(0x5CB0A9DC), SPH_C32(0x76F988DA),
 	SPH_C32(0x983E5152), SPH_C32(0xA831C66D),
 	SPH_C32(0xB00327C8), SPH_C32(0xBF597FC7),
 	SPH_C32(0xC6E00BF3), SPH_C32(0xD5A79147),
 	SPH_C32(0x06CA6351), SPH_C32(0x14292967),
 	SPH_C32(0x27B70A85), SPH_C32(0x2E1B2138),
 	SPH_C32(0x4D2C6DFC), SPH_C32(0x53380D13),
 	SPH_C32(0x650A7354), SPH_C32(0x766A0ABB),
 	SPH_C32(0x81C2C92E), SPH_C32(0x92722C85),
 	SPH_C32(0xA2BFE8A1), SPH_C32(0xA81A664B),
 	SPH_C32(0xC24B8B70), SPH_C32(0xC76C51A3),
 	SPH_C32(0xD192E819), SPH_C32(0xD6990624),
 	SPH_C32(0xF40E3585), SPH_C32(0x106AA070),
 	SPH_C32(0x19A4C116), SPH_C32(0x1E376C08),
 	SPH_C32(0x2748774C), SPH_C32(0x34B0BCB5),
 	SPH_C32(0x391C0CB3), SPH_C32(0x4ED8AA4A),
 	SPH_C32(0x5B9CCA4F), SPH_C32(0x682E6FF3),
 	SPH_C32(0x748F82EE), SPH_C32(0x78A5636F),
 	SPH_C32(0x84C87814), SPH_C32(0x8CC70208),
 	SPH_C32(0x90BEFFFA), SPH_C32(0xA4506CEB),
 	SPH_C32(0xBEF9A3F7), SPH_C32(0xC67178F2)
 };
 #define SHA2_MEXP1(in, pc)   do { \
 		W[pc] = in(pc); \
 	} while (0)
 #define SHA2_MEXP2(in, pc)   do { \
 		W[(pc) & 0x0F] = SPH_T32(SSG2_1(W[((pc) - 2) & 0x0F]) \
 			+ W[((pc) - 7) & 0x0F] \
 			+ SSG2_0(W[((pc) - 15) & 0x0F]) + W[(pc) & 0x0F]); \
 	} while (0)
 #define SHA2_STEPn(n, a, b, c, d, e, f, g, h, in, pc)   do { \
 		sph_u32 t1, t2; \
 		SHA2_MEXP ## n(in, pc); \
 		t1 = SPH_T32(h + BSG2_1(e) + CH(e, f, g) \
 			+ K[pcount + (pc)] + W[(pc) & 0x0F]); \
 		t2 = SPH_T32(BSG2_0(a) + MAJ(a, b, c)); \
 		d = SPH_T32(d + t1); \
 		h = SPH_T32(t1 + t2); \
 	} while (0)
 #define SHA2_STEP1(a, b, c, d, e, f, g, h, in, pc) \
 	SHA2_STEPn(1, a, b, c, d, e, f, g, h, in, pc)
 #define SHA2_STEP2(a, b, c, d, e, f, g, h, in, pc) \
 	SHA2_STEPn(2, a, b, c, d, e, f, g, h, in, pc)
 #define SHA2_ROUND_BODY(in, r)   do { \
 		sph_u32 A, B, C, D, E, F, G, H; \
 		sph_u32 W[16]; \
 		unsigned pcount; \
 \
 		A = (r)[0]; \
 		B = (r)[1]; \
 		C = (r)[2]; \
 		D = (r)[3]; \
 		E = (r)[4]; \
 		F = (r)[5]; \
 		G = (r)[6]; \
 		H = (r)[7]; \
 		pcount = 0; \
 		SHA2_STEP1(A, B, C, D, E, F, G, H, in,  0); \
 		SHA2_STEP1(H, A, B, C, D, E, F, G, in,  1); \
 		SHA2_STEP1(G, H, A, B, C, D, E, F, in,  2); \
 		SHA2_STEP1(F, G, H, A, B, C, D, E, in,  3); \
 		SHA2_STEP1(E, F, G, H, A, B, C, D, in,  4); \
 		SHA2_STEP1(D, E, F, G, H, A, B, C, in,  5); \
 		SHA2_STEP1(C, D, E, F, G, H, A, B, in,  6); \
 		SHA2_STEP1(B, C, D, E, F, G, H, A, in,  7); \
 		SHA2_STEP1(A, B, C, D, E, F, G, H, in,  8); \
 		SHA2_STEP1(H, A, B, C, D, E, F, G, in,  9); \
 		SHA2_STEP1(G, H, A, B, C, D, E, F, in, 10); \
 		SHA2_STEP1(F, G, H, A, B, C, D, E, in, 11); \
 		SHA2_STEP1(E, F, G, H, A, B, C, D, in, 12); \
 		SHA2_STEP1(D, E, F, G, H, A, B, C, in, 13); \
 		SHA2_STEP1(C, D, E, F, G, H, A, B, in, 14); \
 		SHA2_STEP1(B, C, D, E, F, G, H, A, in, 15); \
 		for (pcount = 16; pcount < 64; pcount += 16) { \
 			SHA2_STEP2(A, B, C, D, E, F, G, H, in,  0); \
 			SHA2_STEP2(H, A, B, C, D, E, F, G, in,  1); \
 			SHA2_STEP2(G, H, A, B, C, D, E, F, in,  2); \
 			SHA2_STEP2(F, G, H, A, B, C, D, E, in,  3); \
 			SHA2_STEP2(E, F, G, H, A, B, C, D, in,  4); \
 			SHA2_STEP2(D, E, F, G, H, A, B, C, in,  5); \
 			SHA2_STEP2(C, D, E, F, G, H, A, B, in,  6); \
 			SHA2_STEP2(B, C, D, E, F, G, H, A, in,  7); \
 			SHA2_STEP2(A, B, C, D, E, F, G, H, in,  8); \
 			SHA2_STEP2(H, A, B, C, D, E, F, G, in,  9); \
 			SHA2_STEP2(G, H, A, B, C, D, E, F, in, 10); \
 			SHA2_STEP2(F, G, H, A, B, C, D, E, in, 11); \
 			SHA2_STEP2(E, F, G, H, A, B, C, D, in, 12); \
 			SHA2_STEP2(D, E, F, G, H, A, B, C, in, 13); \
 			SHA2_STEP2(C, D, E, F, G, H, A, B, in, 14); \
 			SHA2_STEP2(B, C, D, E, F, G, H, A, in, 15); \
 		} \
 		(r)[0] = SPH_T32((r)[0] + A); \
 		(r)[1] = SPH_T32((r)[1] + B); \
 		(r)[2] = SPH_T32((r)[2] + C); \
 		(r)[3] = SPH_T32((r)[3] + D); \
 		(r)[4] = SPH_T32((r)[4] + E); \
 		(r)[5] = SPH_T32((r)[5] + F); \
 		(r)[6] = SPH_T32((r)[6] + G); \
 		(r)[7] = SPH_T32((r)[7] + H); \
 	} while (0)
 #else
 #define SHA2_ROUND_BODY(in, r)   do { \
 		sph_u32 A, B, C, D, E, F, G, H, T1, T2; \
 		sph_u32 W00, W01, W02, W03, W04, W05, W06, W07; \
 		sph_u32 W08, W09, W10, W11, W12, W13, W14, W15; \
 		int i; \
 \
 		A = (r)[0]; \
 		B = (r)[1]; \
 		C = (r)[2]; \
 		D = (r)[3]; \
 		E = (r)[4]; \
 		F = (r)[5]; \
 		G = (r)[6]; \
 		H = (r)[7]; \
 		W00 = in(0); \
 		T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
 			+ SPH_C32(0x428A2F98) + W00); \
 		T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
 		D = SPH_T32(D + T1); \
 		H = SPH_T32(T1 + T2); \
 		W01 = in(1); \
 		T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
 			+ SPH_C32(0x71374491) + W01); \
 		T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
 		C = SPH_T32(C + T1); \
 		G = SPH_T32(T1 + T2); \
 		W02 = in(2); \
 		T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
 			+ SPH_C32(0xB5C0FBCF) + W02); \
 		T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
 		B = SPH_T32(B + T1); \
 		F = SPH_T32(T1 + T2); \
 		W03 = in(3); \
 		T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
 			+ SPH_C32(0xE9B5DBA5) + W03); \
 		T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
 		A = SPH_T32(A + T1); \
 		E = SPH_T32(T1 + T2); \
 		W04 = in(4); \
 		T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
 			+ SPH_C32(0x3956C25B) + W04); \
 		T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
 		H = SPH_T32(H + T1); \
 		D = SPH_T32(T1 + T2); \
 		W05 = in(5); \
 		T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
 			+ SPH_C32(0x59F111F1) + W05); \
 		T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
 		G = SPH_T32(G + T1); \
 		C = SPH_T32(T1 + T2); \
 		W06 = in(6); \
 		T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
 			+ SPH_C32(0x923F82A4) + W06); \
 		T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
 		F = SPH_T32(F + T1); \
 		B = SPH_T32(T1 + T2); \
 		W07 = in(7); \
 		T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
 			+ SPH_C32(0xAB1C5ED5) + W07); \
 		T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
 		E = SPH_T32(E + T1); \
 		A = SPH_T32(T1 + T2); \
 		W08 = in(8); \
 		T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
 			+ SPH_C32(0xD807AA98) + W08); \
 		T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
 		D = SPH_T32(D + T1); \
 		H = SPH_T32(T1 + T2); \
 		W09 = in(9); \
 		T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
 			+ SPH_C32(0x12835B01) + W09); \
 		T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
 		C = SPH_T32(C + T1); \
 		G = SPH_T32(T1 + T2); \
 		W10 = in(10); \
 		T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
 			+ SPH_C32(0x243185BE) + W10); \
 		T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
 		B = SPH_T32(B + T1); \
 		F = SPH_T32(T1 + T2); \
 		W11 = in(11); \
 		T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
 			+ SPH_C32(0x550C7DC3) + W11); \
 		T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
 		A = SPH_T32(A + T1); \
 		E = SPH_T32(T1 + T2); \
 		W12 = in(12); \
 		T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
 			+ SPH_C32(0x72BE5D74) + W12); \
 		T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
 		H = SPH_T32(H + T1); \
 		D = SPH_T32(T1 + T2); \
 		W13 = in(13); \
 		T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
 			+ SPH_C32(0x80DEB1FE) + W13); \
 		T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
 		G = SPH_T32(G + T1); \
 		C = SPH_T32(T1 + T2); \
 		W14 = in(14); \
 		T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
 			+ SPH_C32(0x9BDC06A7) + W14); \
 		T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
 		F = SPH_T32(F + T1); \
 		B = SPH_T32(T1 + T2); \
 		W15 = in(15); \
 		T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
 			+ SPH_C32(0xC19BF174) + W15); \
 		T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
 		E = SPH_T32(E + T1); \
 		A = SPH_T32(T1 + T2); \
 		W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
 		T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
 			+ SPH_C32(0xE49B69C1) + W00); \
 		T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
 		D = SPH_T32(D + T1); \
 		H = SPH_T32(T1 + T2); \
 		W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
 		T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
 			+ SPH_C32(0xEFBE4786) + W01); \
 		T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
 		C = SPH_T32(C + T1); \
 		G = SPH_T32(T1 + T2); \
 		W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
 		T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
 			+ SPH_C32(0x0FC19DC6) + W02); \
 		T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
 		B = SPH_T32(B + T1); \
 		F = SPH_T32(T1 + T2); \
 		W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
 		T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
 			+ SPH_C32(0x240CA1CC) + W03); \
 		T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
 		A = SPH_T32(A + T1); \
 		E = SPH_T32(T1 + T2); \
 		W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
 		T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
 			+ SPH_C32(0x2DE92C6F) + W04); \
 		T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
 		H = SPH_T32(H + T1); \
 		D = SPH_T32(T1 + T2); \
 		W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
 		T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
 			+ SPH_C32(0x4A7484AA) + W05); \
 		T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
 		G = SPH_T32(G + T1); \
 		C = SPH_T32(T1 + T2); \
 		W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
 		T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
 			+ SPH_C32(0x5CB0A9DC) + W06); \
 		T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
 		F = SPH_T32(F + T1); \
 		B = SPH_T32(T1 + T2); \
 		W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
 		T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
 			+ SPH_C32(0x76F988DA) + W07); \
 		T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
 		E = SPH_T32(E + T1); \
 		A = SPH_T32(T1 + T2); \
 		W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
 		T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
 			+ SPH_C32(0x983E5152) + W08); \
 		T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
 		D = SPH_T32(D + T1); \
 		H = SPH_T32(T1 + T2); \
 		W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
 		T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
 			+ SPH_C32(0xA831C66D) + W09); \
 		T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
 		C = SPH_T32(C + T1); \
 		G = SPH_T32(T1 + T2); \
 		W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
 		T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
 			+ SPH_C32(0xB00327C8) + W10); \
 		T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
 		B = SPH_T32(B + T1); \
 		F = SPH_T32(T1 + T2); \
 		W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
 		T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
 			+ SPH_C32(0xBF597FC7) + W11); \
 		T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
 		A = SPH_T32(A + T1); \
 		E = SPH_T32(T1 + T2); \
 		W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
 		T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
 			+ SPH_C32(0xC6E00BF3) + W12); \
 		T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
 		H = SPH_T32(H + T1); \
 		D = SPH_T32(T1 + T2); \
 		W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
 		T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
 			+ SPH_C32(0xD5A79147) + W13); \
 		T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
 		G = SPH_T32(G + T1); \
 		C = SPH_T32(T1 + T2); \
 		W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
 		T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
 			+ SPH_C32(0x06CA6351) + W14); \
 		T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
 		F = SPH_T32(F + T1); \
 		B = SPH_T32(T1 + T2); \
 		W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
 		T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
 			+ SPH_C32(0x14292967) + W15); \
 		T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
 		E = SPH_T32(E + T1); \
 		A = SPH_T32(T1 + T2); \
 		W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
 		T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
 			+ SPH_C32(0x27B70A85) + W00); \
 		T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
 		D = SPH_T32(D + T1); \
 		H = SPH_T32(T1 + T2); \
 		W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
 		T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
 			+ SPH_C32(0x2E1B2138) + W01); \
 		T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
 		C = SPH_T32(C + T1); \
 		G = SPH_T32(T1 + T2); \
 		W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
 		T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
 			+ SPH_C32(0x4D2C6DFC) + W02); \
 		T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
 		B = SPH_T32(B + T1); \
 		F = SPH_T32(T1 + T2); \
 		W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
 		T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
 			+ SPH_C32(0x53380D13) + W03); \
 		T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
 		A = SPH_T32(A + T1); \
 		E = SPH_T32(T1 + T2); \
 		W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
 		T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
 			+ SPH_C32(0x650A7354) + W04); \
 		T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
 		H = SPH_T32(H + T1); \
 		D = SPH_T32(T1 + T2); \
 		W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
 		T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
 			+ SPH_C32(0x766A0ABB) + W05); \
 		T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
 		G = SPH_T32(G + T1); \
 		C = SPH_T32(T1 + T2); \
 		W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
 		T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
 			+ SPH_C32(0x81C2C92E) + W06); \
 		T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
 		F = SPH_T32(F + T1); \
 		B = SPH_T32(T1 + T2); \
 		W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
 		T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
 			+ SPH_C32(0x92722C85) + W07); \
 		T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
 		E = SPH_T32(E + T1); \
 		A = SPH_T32(T1 + T2); \
 		W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
 		T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
 			+ SPH_C32(0xA2BFE8A1) + W08); \
 		T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
 		D = SPH_T32(D + T1); \
 		H = SPH_T32(T1 + T2); \
 		W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
 		T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
 			+ SPH_C32(0xA81A664B) + W09); \
 		T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
 		C = SPH_T32(C + T1); \
 		G = SPH_T32(T1 + T2); \
 		W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
 		T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
 			+ SPH_C32(0xC24B8B70) + W10); \
 		T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
 		B = SPH_T32(B + T1); \
 		F = SPH_T32(T1 + T2); \
 		W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
 		T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
 			+ SPH_C32(0xC76C51A3) + W11); \
 		T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
 		A = SPH_T32(A + T1); \
 		E = SPH_T32(T1 + T2); \
 		W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
 		T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
 			+ SPH_C32(0xD192E819) + W12); \
 		T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
 		H = SPH_T32(H + T1); \
 		D = SPH_T32(T1 + T2); \
 		W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
 		T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
 			+ SPH_C32(0xD6990624) + W13); \
 		T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
 		G = SPH_T32(G + T1); \
 		C = SPH_T32(T1 + T2); \
 		W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
 		T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
 			+ SPH_C32(0xF40E3585) + W14); \
 		T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
 		F = SPH_T32(F + T1); \
 		B = SPH_T32(T1 + T2); \
 		W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
 		T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
 			+ SPH_C32(0x106AA070) + W15); \
 		T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
 		E = SPH_T32(E + T1); \
 		A = SPH_T32(T1 + T2); \
 		W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
 		T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
 			+ SPH_C32(0x19A4C116) + W00); \
 		T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
 		D = SPH_T32(D + T1); \
 		H = SPH_T32(T1 + T2); \
 		W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
 		T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
 			+ SPH_C32(0x1E376C08) + W01); \
 		T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
 		C = SPH_T32(C + T1); \
 		G = SPH_T32(T1 + T2); \
 		W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
 		T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
 			+ SPH_C32(0x2748774C) + W02); \
 		T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
 		B = SPH_T32(B + T1); \
 		F = SPH_T32(T1 + T2); \
 		W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
 		T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
 			+ SPH_C32(0x34B0BCB5) + W03); \
 		T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
 		A = SPH_T32(A + T1); \
 		E = SPH_T32(T1 + T2); \
 		W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
 		T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
 			+ SPH_C32(0x391C0CB3) + W04); \
 		T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
 		H = SPH_T32(H + T1); \
 		D = SPH_T32(T1 + T2); \
 		W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
 		T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
 			+ SPH_C32(0x4ED8AA4A) + W05); \
 		T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
 		G = SPH_T32(G + T1); \
 		C = SPH_T32(T1 + T2); \
 		W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
 		T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
 			+ SPH_C32(0x5B9CCA4F) + W06); \
 		T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
 		F = SPH_T32(F + T1); \
 		B = SPH_T32(T1 + T2); \
 		W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
 		T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
 			+ SPH_C32(0x682E6FF3) + W07); \
 		T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
 		E = SPH_T32(E + T1); \
 		A = SPH_T32(T1 + T2); \
 		W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
 		T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
 			+ SPH_C32(0x748F82EE) + W08); \
 		T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
 		D = SPH_T32(D + T1); \
 		H = SPH_T32(T1 + T2); \
 		W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
 		T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
 			+ SPH_C32(0x78A5636F) + W09); \
 		T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
 		C = SPH_T32(C + T1); \
 		G = SPH_T32(T1 + T2); \
 		W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
 		T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
 			+ SPH_C32(0x84C87814) + W10); \
 		T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
 		B = SPH_T32(B + T1); \
 		F = SPH_T32(T1 + T2); \
 		W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
 		T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
 			+ SPH_C32(0x8CC70208) + W11); \
 		T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
 		A = SPH_T32(A + T1); \
 		E = SPH_T32(T1 + T2); \
 		W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
 		T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
 			+ SPH_C32(0x90BEFFFA) + W12); \
 		T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
 		H = SPH_T32(H + T1); \
 		D = SPH_T32(T1 + T2); \
 		W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
 		T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
 			+ SPH_C32(0xA4506CEB) + W13); \
 		T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
 		G = SPH_T32(G + T1); \
 		C = SPH_T32(T1 + T2); \
 		W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
 		T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
 			+ SPH_C32(0xBEF9A3F7) + W14); \
 		T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
 		F = SPH_T32(F + T1); \
 		B = SPH_T32(T1 + T2); \
 		W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
 		T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
 			+ SPH_C32(0xC67178F2) + W15); \
 		T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
 		E = SPH_T32(E + T1); \
 		A = SPH_T32(T1 + T2); \
 		(r)[0] = SPH_T32((r)[0] + A); \
 		(r)[1] = SPH_T32((r)[1] + B); \
 		(r)[2] = SPH_T32((r)[2] + C); \
 		(r)[3] = SPH_T32((r)[3] + D); \
 		(r)[4] = SPH_T32((r)[4] + E); \
 		(r)[5] = SPH_T32((r)[5] + F); \
 		(r)[6] = SPH_T32((r)[6] + G); \
 		(r)[7] = SPH_T32((r)[7] + H); \
 	} while (0)
 #endif
 /*
 * One round of SHA-224 / SHA-256. The data must be aligned for 32-bit access.
 */
 static void
 sha2_round(const unsigned char *data, sph_u32 r[8])
 {
 #define SHA2_IN(x)   sph_dec32be_aligned(data + (4 * (x)))
 	SHA2_ROUND_BODY(SHA2_IN, r);
 #undef SHA2_IN
 }
 /* see sph_sha2.h */
 void
 sph_sha224_init(void *cc)
 {
 	sph_sha224_context *sc;
 	sc = cc;
 	memcpy(sc->val, H224, sizeof H224);
 #if SPH_64
 	sc->count = 0;
 #else
 	sc->count_high = sc->count_low = 0;
 #endif
 }
 /* see sph_sha2.h */
 void
 sph_sha256_init(void *cc)
 {
 	sph_sha256_context *sc;
 	sc = cc;
 	memcpy(sc->val, H256, sizeof H256);
 #if SPH_64
 	sc->count = 0;
 #else
 	sc->count_high = sc->count_low = 0;
 #endif
 }
 #define RFUN   sha2_round
 #define HASH   sha224
 #define BE32   1
 #include "md_helper.c"
 /* see sph_sha2.h */
 void
 sph_sha224_close(void *cc, void *dst)
 {
 	sha224_close(cc, dst, 7);
 	sph_sha224_init(cc);
 }
 /* see sph_sha2.h */
 void
 sph_sha224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 {
 	sha224_addbits_and_close(cc, ub, n, dst, 7);
 	sph_sha224_init(cc);
 }
 /* see sph_sha2.h */
 void
 sph_sha256_close(void *cc, void *dst)
 {
 	sha224_close(cc, dst, 8);
 	sph_sha256_init(cc);
 }
 /* see sph_sha2.h */
 void
 sph_sha256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
 {
 	sha224_addbits_and_close(cc, ub, n, dst, 8);
 	sph_sha256_init(cc);
 }
 /* see sph_sha2.h */
 void
 sph_sha224_comp(const sph_u32 msg[16], sph_u32 val[8])
 {
 #define SHA2_IN(x)   msg[x]
 	SHA2_ROUND_BODY(SHA2_IN, val);
 #undef SHA2_IN
 }
--- a/util.cpp
+++ b/util.cpp
@ -1421,28 +1421,39 @@ static uint32_t getblocheight(struct stratum_ctx *sctx)
 static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
 {
-	const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *stime, *nreward;
+	const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *stime;
 	const char *claim = NULL, *nreward = NULL;
 	size_t coinb1_size, coinb2_size;
 	bool clean, ret = false;
-	int merkle_count, i;
+	int merkle_count, i, p=0;
 	json_t *merkle_arr;
 	uchar **merkle = NULL;
 	// uchar(*merkle_tree)[32] = { 0 };
 	int ntime;
-
+	char algo[64] = { 0 };
-	job_id = json_string_value(json_array_get(params, 0));
+	get_currentalgo(algo, sizeof(algo));
-	prevhash = json_string_value(json_array_get(params, 1));
+	bool has_claim = !strcasecmp(algo, "lbry");
-	coinb1 = json_string_value(json_array_get(params, 2));
+
-	coinb2 = json_string_value(json_array_get(params, 3));
+	job_id = json_string_value(json_array_get(params, p++));
-	merkle_arr = json_array_get(params, 4);
+	prevhash = json_string_value(json_array_get(params, p++));
 	if (has_claim) {
 		claim = json_string_value(json_array_get(params, p++));
 		if (!claim || strlen(claim) != 64) {
 			applog(LOG_ERR, "Stratum notify: invalid claim parameter");
 			goto out;
 		}
 	}
 	coinb1 = json_string_value(json_array_get(params, p++));
 	coinb2 = json_string_value(json_array_get(params, p++));
 	merkle_arr = json_array_get(params, p++);
 	if (!merkle_arr || !json_is_array(merkle_arr))
 		goto out;
 	merkle_count = (int) json_array_size(merkle_arr);
-	version = json_string_value(json_array_get(params, 5));
+	version = json_string_value(json_array_get(params, p++));
-	nbits = json_string_value(json_array_get(params, 6));
+	nbits = json_string_value(json_array_get(params, p++));
-	stime = json_string_value(json_array_get(params, 7));
+	stime = json_string_value(json_array_get(params, p++));
-	clean = json_is_true(json_array_get(params, 8));
+	clean = json_is_true(json_array_get(params, p)); p++;
-	nreward = json_string_value(json_array_get(params, 9));
+	nreward = json_string_value(json_array_get(params, p++));
 	if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !stime ||
 	    strlen(prevhash) != 64 || strlen(version) != 8 ||
@ -1494,6 +1505,7 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
 	free(sctx->job.job_id);
 	sctx->job.job_id = strdup(job_id);
 	hex2bin(sctx->job.prevhash, prevhash, 32);
 	if (has_claim) hex2bin(sctx->job.claim, claim, 32);
 	sctx->job.height = getblocheight(sctx);
@ -2079,7 +2091,8 @@ void do_gpu_tests(void)
 	//scanhash_scrypt_jane(0, &work, NULL, 1, &done, &tv, &tv);
 	memset(work.data, 0, sizeof(work.data));
-	scanhash_decred(0, &work, 1, &done);
+	work.data[0] = 0;
 	scanhash_lbry(0, &work, 1, &done);
 	free(work_restart);
 	work_restart = NULL;
@ -2142,6 +2155,10 @@ void print_hash_tests(void)
 	keccak256_hash(&hash[0], &buf[0]);
 	printpfx("keccak", hash);
 	memset(buf, 0, 128);
 	lbry_hash(&hash[0], &buf[0]);
 	printpfx("lbry", hash);
 	luffa_hash(&hash[0], &buf[0]);
 	printpfx("luffa", hash);