Browse Source

lbry algo (stratum only)

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
master
Tanguy Pruvot 9 years ago
parent
commit
5703a74e22
  1. 2
      Makefile.am
  2. 8
      README.txt
  3. 2
      algos.h
  4. 25
      ccminer.cpp
  5. 14
      ccminer.vcxproj
  6. 23
      ccminer.vcxproj.filters
  7. 441
      lbry/cuda_ripemd160.cu
  8. 712
      lbry/cuda_sha256_lbry.cu
  9. 181
      lbry/cuda_sha512_lbry.cu
  10. 225
      lbry/lbry.cu
  11. 3
      miner.h
  12. 833
      sph/ripemd.c
  13. 273
      sph/sph_ripemd.h
  14. 691
      sph/sph_sha2.c
  15. 45
      util.cpp

2
Makefile.am

@ -50,6 +50,8 @@ ccminer_SOURCES = elist.h miner.h compat.h \ @@ -50,6 +50,8 @@ ccminer_SOURCES = elist.h miner.h compat.h \
sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
sph/hamsi.c sph/hamsi_helper.c sph/streebog.c \
sph/shabal.c sph/whirlpool.c sph/sha2big.c sph/haval.c \
sph/ripemd.c sph/sph_sha2.c \
lbry/lbry.cu lbry/cuda_ripemd160.cu lbry/cuda_sha256_lbry.cu lbry/cuda_sha512_lbry.cu \
qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \
x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \

8
README.txt

@ -1,5 +1,5 @@ @@ -1,5 +1,5 @@
ccMiner preview 1.8-dev (May 2016) "Pascal and x11evo algo"
ccMiner 1.8 Preview (July 2016) "CUDA 8, lbry and x11evo algos"
---------------------------------------------------------------
***************************************************************
@ -28,6 +28,7 @@ Decred (Blake256 14-rounds - 180 bytes) @@ -28,6 +28,7 @@ Decred (Blake256 14-rounds - 180 bytes)
HeavyCoin & MjollnirCoin
FugueCoin
GroestlCoin & Myriad-Groestl
Lbry Credits
JackpotCoin
QuarkCoin family & AnimeCoin
TalkCoin
@ -35,13 +36,13 @@ DarkCoin and other X11 coins @@ -35,13 +36,13 @@ DarkCoin and other X11 coins
Chaincoin and Flaxscript (C11)
Saffroncoin blake (256 14-rounds)
BlakeCoin (256 8-rounds)
Midnight (BMW 256)
Qubit (Digibyte, ...)
Luffa (Joincoin)
Keccak (Maxcoin)
Pentablake (Blake 512 x5)
1Coin Triple S
Neoscrypt (FeatherCoin)
Revolver (X11evo)
Scrypt and Scrypt:N
Scrypt-Jane (Chacha)
Sibcoin (sib)
@ -82,6 +83,7 @@ its command line interface and options. @@ -82,6 +83,7 @@ its command line interface and options.
heavy use to mine Heavycoin
jackpot use to mine Jackpotcoin
keccak use to mine Maxcoin
lbry use to mine LBRY Credits
luffa use to mine Joincoin
lyra2 use to mine Vertcoin
mjollnir use to mine Mjollnircoin
@ -151,6 +153,7 @@ its command line interface and options. @@ -151,6 +153,7 @@ its command line interface and options.
--max-diff=N Only mine if net difficulty is less than specified value
--pstate=0 will force the Geforce 9xx to run in P0 P-State
--plimit=150W set the gpu power limit, allow multiple values for N cards
--tlimit=85 Set the gpu thermal limit (windows only)
--keep-clocks prevent reset clocks and/or power limit on exit
--show-diff display submitted block and net difficulty
-B, --background run the miner in the background
@ -242,6 +245,7 @@ features. @@ -242,6 +245,7 @@ features.
July 2016 v1.8.0
Pascal support with cuda 8
lbry new multi sha / ripemd algo (LBC)
x11evo algo (XRE)
Lyra2v2, Neoscrypt and Decred improvements
Enhance windows NVAPI clock and power limits

2
algos.h

@ -19,6 +19,7 @@ enum sha_algos { @@ -19,6 +19,7 @@ enum sha_algos {
ALGO_HEAVY, /* Heavycoin hash */
ALGO_KECCAK,
ALGO_JACKPOT,
ALGO_LBRY,
ALGO_LUFFA,
ALGO_LYRA2,
ALGO_LYRA2v2,
@ -67,6 +68,7 @@ static const char *algo_names[] = { @@ -67,6 +68,7 @@ static const char *algo_names[] = {
"heavy",
"keccak",
"jackpot",
"lbry",
"luffa",
"lyra2",
"lyra2v2",

25
ccminer.cpp

@ -228,6 +228,7 @@ Options:\n\ @@ -228,6 +228,7 @@ Options:\n\
heavy Heavycoin\n\
jackpot Jackpot\n\
keccak Keccak-256 (Maxcoin)\n\
lbry LBRY Credits (Sha/Ripemd)\n\
luffa Joincoin\n\
lyra2 LyraBar\n\
lyra2v2 VertCoin\n\
@ -567,6 +568,7 @@ static void calc_network_diff(struct work *work) @@ -567,6 +568,7 @@ static void calc_network_diff(struct work *work)
// sample for diff 43.281 : 1c05ea29
// todo: endian reversed on longpoll could be zr5 specific...
uint32_t nbits = have_longpoll ? work->data[18] : swab32(work->data[18]);
if (opt_algo == ALGO_LBRY) nbits = swab32(work->data[26]);
if (opt_algo == ALGO_DECRED) nbits = work->data[29];
uint32_t bits = (nbits & 0xffffff);
int16_t shift = (swab32(nbits) & 0xff); // 0x1c = 28
@ -837,6 +839,11 @@ static bool submit_upstream_work(CURL *curl, struct work *work) @@ -837,6 +839,11 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
le32enc(&nonce, work->data[19]);
be16enc(&nvote, *((uint16_t*)&work->data[20]));
break;
case ALGO_LBRY:
check_dups = true;
le32enc(&ntime, work->data[25]);
le32enc(&nonce, work->data[27]);
break;
case ALGO_ZR5:
check_dups = true;
be32enc(&ntime, work->data[17]);
@ -1296,6 +1303,8 @@ bool get_work(struct thr_info *thr, struct work *work) @@ -1296,6 +1303,8 @@ bool get_work(struct thr_info *thr, struct work *work)
memset(work->data + 19, 0x00, 52);
if (opt_algo == ALGO_DECRED) {
memset(&work->data[35], 0x00, 52);
} else if (opt_algo == ALGO_LBRY) {
work->data[28] = 0x80000000;
} else {
work->data[20] = 0x80000000;
work->data[31] = 0x00000280;
@ -1441,6 +1450,14 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) @@ -1441,6 +1450,14 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
work->data[37] = (rand()*4) << 8; // random work data
sctx->job.height = work->data[32];
//applog_hex(work->data, 180);
} else if (opt_algo == ALGO_LBRY) {
for (i = 0; i < 8; i++)
work->data[9 + i] = be32dec((uint32_t *)merkle_root + i);
for (i = 0; i < 8; i++)
work->data[17 + i] = ((uint32_t*)sctx->job.claim)[i];
work->data[25] = le32dec(sctx->job.ntime);
work->data[26] = le32dec(sctx->job.nbits);
work->data[28] = 0x80000000;
} else {
for (i = 0; i < 8; i++)
work->data[9 + i] = be32dec((uint32_t *)merkle_root + i);
@ -1498,6 +1515,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) @@ -1498,6 +1515,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
case ALGO_FRESH:
case ALGO_FUGUE256:
case ALGO_GROESTL:
case ALGO_LBRY:
case ALGO_LYRA2v2:
work_set_target(work, sctx->job.diff / (256.0 * opt_difficulty));
break;
@ -1658,6 +1676,7 @@ static void *miner_thread(void *userdata) @@ -1658,6 +1676,7 @@ static void *miner_thread(void *userdata)
// &work.data[19]
int wcmplen = (opt_algo == ALGO_DECRED) ? 140 : 76;
if (opt_algo == ALGO_LBRY) wcmplen = 108;
int wcmpoft = 0;
uint32_t *nonceptr = (uint32_t*) (((char*)work.data) + wcmplen);
@ -1910,6 +1929,7 @@ static void *miner_thread(void *userdata) @@ -1910,6 +1929,7 @@ static void *miner_thread(void *userdata)
minmax = 0x40000000U;
break;
case ALGO_KECCAK:
case ALGO_LBRY:
case ALGO_LUFFA:
case ALGO_SKEIN:
case ALGO_SKEIN2:
@ -2035,6 +2055,9 @@ static void *miner_thread(void *userdata) @@ -2035,6 +2055,9 @@ static void *miner_thread(void *userdata)
case ALGO_JACKPOT:
rc = scanhash_jackpot(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_LBRY:
rc = scanhash_lbry(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_LUFFA:
rc = scanhash_luffa(thr_id, &work, max_nonce, &hashes_done);
break;
@ -2130,7 +2153,7 @@ static void *miner_thread(void *userdata) @@ -2130,7 +2153,7 @@ static void *miner_thread(void *userdata)
// todo: update all algos to use work->nonces
work.nonces[0] = nonceptr[0];
if (opt_algo != ALGO_DECRED && opt_algo != ALGO_BLAKE2S) {
if (opt_algo != ALGO_DECRED && opt_algo != ALGO_BLAKE2S && opt_algo != ALGO_LBRY) {
work.nonces[1] = nonceptr[2];
}

14
ccminer.vcxproj

@ -115,7 +115,7 @@ @@ -115,7 +115,7 @@
<MaxRegCount>80</MaxRegCount>
<PtxAsOptionV>true</PtxAsOptionV>
<Keep>true</Keep>
<CodeGeneration>compute_50,sm_50</CodeGeneration>
<CodeGeneration>compute_61,sm_61;compute_52,sm_52</CodeGeneration>
<Include>$(NVTOOLSEXT_PATH)\include;..\..\..\Common\C99</Include>
<TargetMachinePlatform>64</TargetMachinePlatform>
</CudaCompile>
@ -273,15 +273,11 @@ @@ -273,15 +273,11 @@
<ClCompile Include="sph\fugue.c" />
<ClCompile Include="sph\groestl.c" />
<ClCompile Include="sph\haval.c" />
<ClCompile Include="sph\haval_helper.c">
<ExcludedFromBuild>true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="sph\jh.c" />
<ClCompile Include="sph\keccak.c" />
<ClCompile Include="sph\luffa.c" />
<ClCompile Include="sph\md_helper.c">
<ExcludedFromBuild>true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="sph\ripemd.c" />
<ClCompile Include="sph\sph_sha2.c" />
<ClCompile Include="sph\sha2.c" />
<ClCompile Include="sph\sha2big.c" />
<ClCompile Include="sph\shabal.c" />
@ -428,6 +424,10 @@ @@ -428,6 +424,10 @@
<MaxRegCount>92</MaxRegCount>
</CudaCompile>
<CudaCompile Include="Algo256\cuda_skein256.cu" />
<CudaCompile Include="lbry\cuda_ripemd160.cu" />
<CudaCompile Include="lbry\cuda_sha256_lbry.cu" />
<CudaCompile Include="lbry\cuda_sha512_lbry.cu" />
<CudaCompile Include="lbry\lbry.cu" />
<CudaCompile Include="pentablake.cu">
<MaxRegCount>80</MaxRegCount>
<AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options="-dlcm=cg" %(AdditionalOptions)</AdditionalOptions>

23
ccminer.vcxproj.filters

@ -82,6 +82,9 @@ @@ -82,6 +82,9 @@
<Filter Include="Source Files\CUDA\lyra2">
<UniqueIdentifier>{1613763f-895c-4321-b58b-6f5849868956}</UniqueIdentifier>
</Filter>
<Filter Include="Source Files\CUDA\lbry">
<UniqueIdentifier>{3079ea1f-f768-455a-acd6-f517fac535b4}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="compat\jansson\dump.c">
@ -150,9 +153,6 @@ @@ -150,9 +153,6 @@
<ClCompile Include="sph\luffa.c">
<Filter>Source Files\sph</Filter>
</ClCompile>
<ClCompile Include="sph\md_helper.c">
<Filter>Source Files\sph</Filter>
</ClCompile>
<ClCompile Include="sph\sha2.c">
<Filter>Source Files\sph</Filter>
</ClCompile>
@ -180,7 +180,10 @@ @@ -180,7 +180,10 @@
<ClCompile Include="sph\haval.c">
<Filter>Source Files\sph</Filter>
</ClCompile>
<ClCompile Include="sph\haval_helper.c">
<ClCompile Include="sph\ripemd.c">
<Filter>Source Files\sph</Filter>
</ClCompile>
<ClCompile Include="sph\sph_sha2.c">
<Filter>Source Files\sph</Filter>
</ClCompile>
<ClCompile Include="sph\sha2big.c">
@ -721,6 +724,18 @@ @@ -721,6 +724,18 @@
<CudaCompile Include="Algo256\blake2s.cu">
<Filter>Source Files\CUDA\Algo256</Filter>
</CudaCompile>
<CudaCompile Include="lbry\cuda_sha256_lbry.cu">
<Filter>Source Files\CUDA\lbry</Filter>
</CudaCompile>
<CudaCompile Include="lbry\cuda_sha512_lbry.cu">
<Filter>Source Files\CUDA\lbry</Filter>
</CudaCompile>
<CudaCompile Include="lbry\cuda_ripemd160.cu">
<Filter>Source Files\CUDA\lbry</Filter>
</CudaCompile>
<CudaCompile Include="lbry\lbry.cu">
<Filter>Source Files\CUDA\lbry</Filter>
</CudaCompile>
</ItemGroup>
<ItemGroup>
<Image Include="res\ccminer.ico">

441
lbry/cuda_ripemd160.cu

@ -0,0 +1,441 @@ @@ -0,0 +1,441 @@
/*
* ripemd-160 kernel implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2014, 2016 djm34, tpruvot
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
*/
#include <stdio.h>
#include <stdint.h>
#include <memory.h>
#include <cuda_helper.h>
static __constant__ uint32_t c_IV[5] = {
0x67452301u, 0xEFCDAB89u, 0x98BADCFEu, 0x10325476u, 0xC3D2E1F0u
};
//__host__
//uint64_t xornot64(uint64_t a, uint64_t b, uint64_t c) {
// return c ^ (a | !b);
//}
// Computes c ^ (a | ~b) on 64-bit words in three PTX instructions.
// NOTE(review): the commented host sketch above writes `c ^ (a | !b)`
// with a logical NOT — the asm uses bitwise not.b64, which is the
// intended operation (RIPEMD F3-style mixing).
__forceinline__ __device__
uint64_t xornot64(uint64_t a, uint64_t b, uint64_t c)
{
uint64_t result;
// m = ~b ; n = a | m ; result = n ^ c
asm("{ .reg .u64 m,n; // xornot64\n\t"
"not.b64 m,%2; \n\t"
"or.b64 n, %1,m;\n\t"
"xor.b64 %0, n,%3;\n\t"
"}\n\t"
: "=l"(result) : "l"(a), "l"(b), "l"(c));
return result;
}
//__host__
//uint64_t xornt64(uint64_t a, uint64_t b, uint64_t c) {
// return a ^ (b | !c);
//}
// Computes a ^ (b | ~c) on 64-bit words in three PTX instructions.
// NOTE(review): as with xornot64, the host sketch's `!c` means the
// bitwise complement performed by not.b64 (RIPEMD F5-style mixing).
__device__ __forceinline__
uint64_t xornt64(uint64_t a, uint64_t b, uint64_t c)
{
uint64_t result;
// m = ~c ; n = b | m ; result = a ^ n
asm("{ .reg .u64 m,n; // xornt64\n\t"
"not.b64 m,%3; \n\t"
"or.b64 n, %2,m;\n\t"
"xor.b64 %0, %1,n;\n\t"
"}\n\t"
: "=l"(result) : "l"(a), "l"(b), "l"(c));
return result;
}
/*
* Round functions for RIPEMD-128 and RIPEMD-160.
*/
#if 1
#define F1(x, y, z) ((x) ^ (y) ^ (z))
#define F2(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
#define F3(x, y, z) (((x) | ~(y)) ^ (z))
#define F4(x, y, z) ((((x) ^ (y)) & (z)) ^ (y))
#define F5(x, y, z) ((x) ^ ((y) | ~(z)))
#else
#define F1(x, y, z) xor3(x,y,z)
#define F2(x, y, z) xandx(x,y,z)
#define F3(x, y, z) xornot64(x,y,z)
#define F4(x, y, z) xandx(z,x,y)
#define F5(x, y, z) xornt64(x,y,z)
#endif
/*
* Round constants for RIPEMD-160.
*/
#define K11 0x00000000u
#define K12 0x5A827999u
#define K13 0x6ED9EBA1u
#define K14 0x8F1BBCDCu
#define K15 0xA953FD4Eu
#define K21 0x50A28BE6u
#define K22 0x5C4DD124u
#define K23 0x6D703EF3u
#define K24 0x7A6D76E9u
#define K25 0x00000000u
// One RIPEMD-160 step: a = ROTL32(a + f(b,c,d) + msg_word + round_const, s) + e,
// then c is rotated left by 10. SPH_T32 truncates to 32 bits on wider hosts.
#define RR(a, b, c, d, e, f, s, r, k) { \
a = SPH_T32(ROTL32(SPH_T32(a + f(b, c, d) + r + k), s) + e); \
c = ROTL32(c, 10); \
}
// ROUND1/ROUND2 select the left/right parallel lines of the RIPEMD-160
// compression function: the `1`/`2` suffix picks the working-variable set
// (A1..E1 vs A2..E2) and the K1x vs K2x round-constant family.
#define ROUND1(a, b, c, d, e, f, s, r, k) \
RR(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k)
#define ROUND2(a, b, c, d, e, f, s, r, k) \
RR(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
#define RIPEMD160_ROUND_BODY(in, h) { \
uint32_t A1, B1, C1, D1, E1; \
uint32_t A2, B2, C2, D2, E2; \
uint32_t tmp; \
\
A1 = A2 = h[0]; \
B1 = B2 = h[1]; \
C1 = C2 = h[2]; \
D1 = D2 = h[3]; \
E1 = E2 = h[4]; \
\
ROUND1(A, B, C, D, E, F1, 11, in[ 0], 1); \
ROUND1(E, A, B, C, D, F1, 14, in[ 1], 1); \
ROUND1(D, E, A, B, C, F1, 15, in[ 2], 1); \
ROUND1(C, D, E, A, B, F1, 12, in[ 3], 1); \
ROUND1(B, C, D, E, A, F1, 5, in[ 4], 1); \
ROUND1(A, B, C, D, E, F1, 8, in[ 5], 1); \
ROUND1(E, A, B, C, D, F1, 7, in[ 6], 1); \
ROUND1(D, E, A, B, C, F1, 9, in[ 7], 1); \
ROUND1(C, D, E, A, B, F1, 11, in[ 8], 1); \
ROUND1(B, C, D, E, A, F1, 13, in[ 9], 1); \
ROUND1(A, B, C, D, E, F1, 14, in[10], 1); \
ROUND1(E, A, B, C, D, F1, 15, in[11], 1); \
ROUND1(D, E, A, B, C, F1, 6, in[12], 1); \
ROUND1(C, D, E, A, B, F1, 7, in[13], 1); \
ROUND1(B, C, D, E, A, F1, 9, in[14], 1); \
ROUND1(A, B, C, D, E, F1, 8, in[15], 1); \
\
ROUND1(E, A, B, C, D, F2, 7, in[ 7], 2); \
ROUND1(D, E, A, B, C, F2, 6, in[ 4], 2); \
ROUND1(C, D, E, A, B, F2, 8, in[13], 2); \
ROUND1(B, C, D, E, A, F2, 13, in[ 1], 2); \
ROUND1(A, B, C, D, E, F2, 11, in[10], 2); \
ROUND1(E, A, B, C, D, F2, 9, in[ 6], 2); \
ROUND1(D, E, A, B, C, F2, 7, in[15], 2); \
ROUND1(C, D, E, A, B, F2, 15, in[ 3], 2); \
ROUND1(B, C, D, E, A, F2, 7, in[12], 2); \
ROUND1(A, B, C, D, E, F2, 12, in[ 0], 2); \
ROUND1(E, A, B, C, D, F2, 15, in[ 9], 2); \
ROUND1(D, E, A, B, C, F2, 9, in[ 5], 2); \
ROUND1(C, D, E, A, B, F2, 11, in[ 2], 2); \
ROUND1(B, C, D, E, A, F2, 7, in[14], 2); \
ROUND1(A, B, C, D, E, F2, 13, in[11], 2); \
ROUND1(E, A, B, C, D, F2, 12, in[ 8], 2); \
\
ROUND1(D, E, A, B, C, F3, 11, in[ 3], 3); \
ROUND1(C, D, E, A, B, F3, 13, in[10], 3); \
ROUND1(B, C, D, E, A, F3, 6, in[14], 3); \
ROUND1(A, B, C, D, E, F3, 7, in[ 4], 3); \
ROUND1(E, A, B, C, D, F3, 14, in[ 9], 3); \
ROUND1(D, E, A, B, C, F3, 9, in[15], 3); \
ROUND1(C, D, E, A, B, F3, 13, in[ 8], 3); \
ROUND1(B, C, D, E, A, F3, 15, in[ 1], 3); \
ROUND1(A, B, C, D, E, F3, 14, in[ 2], 3); \
ROUND1(E, A, B, C, D, F3, 8, in[ 7], 3); \
ROUND1(D, E, A, B, C, F3, 13, in[ 0], 3); \
ROUND1(C, D, E, A, B, F3, 6, in[ 6], 3); \
ROUND1(B, C, D, E, A, F3, 5, in[13], 3); \
ROUND1(A, B, C, D, E, F3, 12, in[11], 3); \
ROUND1(E, A, B, C, D, F3, 7, in[ 5], 3); \
ROUND1(D, E, A, B, C, F3, 5, in[12], 3); \
\
ROUND1(C, D, E, A, B, F4, 11, in[ 1], 4); \
ROUND1(B, C, D, E, A, F4, 12, in[ 9], 4); \
ROUND1(A, B, C, D, E, F4, 14, in[11], 4); \
ROUND1(E, A, B, C, D, F4, 15, in[10], 4); \
ROUND1(D, E, A, B, C, F4, 14, in[ 0], 4); \
ROUND1(C, D, E, A, B, F4, 15, in[ 8], 4); \
ROUND1(B, C, D, E, A, F4, 9, in[12], 4); \
ROUND1(A, B, C, D, E, F4, 8, in[ 4], 4); \
ROUND1(E, A, B, C, D, F4, 9, in[13], 4); \
ROUND1(D, E, A, B, C, F4, 14, in[ 3], 4); \
ROUND1(C, D, E, A, B, F4, 5, in[ 7], 4); \
ROUND1(B, C, D, E, A, F4, 6, in[15], 4); \
ROUND1(A, B, C, D, E, F4, 8, in[14], 4); \
ROUND1(E, A, B, C, D, F4, 6, in[ 5], 4); \
ROUND1(D, E, A, B, C, F4, 5, in[ 6], 4); \
ROUND1(C, D, E, A, B, F4, 12, in[ 2], 4); \
\
ROUND1(B, C, D, E, A, F5, 9, in[ 4], 5); \
ROUND1(A, B, C, D, E, F5, 15, in[ 0], 5); \
ROUND1(E, A, B, C, D, F5, 5, in[ 5], 5); \
ROUND1(D, E, A, B, C, F5, 11, in[ 9], 5); \
ROUND1(C, D, E, A, B, F5, 6, in[ 7], 5); \
ROUND1(B, C, D, E, A, F5, 8, in[12], 5); \
ROUND1(A, B, C, D, E, F5, 13, in[ 2], 5); \
ROUND1(E, A, B, C, D, F5, 12, in[10], 5); \
ROUND1(D, E, A, B, C, F5, 5, in[14], 5); \
ROUND1(C, D, E, A, B, F5, 12, in[ 1], 5); \
ROUND1(B, C, D, E, A, F5, 13, in[ 3], 5); \
ROUND1(A, B, C, D, E, F5, 14, in[ 8], 5); \
ROUND1(E, A, B, C, D, F5, 11, in[11], 5); \
ROUND1(D, E, A, B, C, F5, 8, in[ 6], 5); \
ROUND1(C, D, E, A, B, F5, 5, in[15], 5); \
ROUND1(B, C, D, E, A, F5, 6, in[13], 5); \
\
ROUND2(A, B, C, D, E, F5, 8, in[ 5], 1); \
ROUND2(E, A, B, C, D, F5, 9, in[14], 1); \
ROUND2(D, E, A, B, C, F5, 9, in[ 7], 1); \
ROUND2(C, D, E, A, B, F5, 11, in[ 0], 1); \
ROUND2(B, C, D, E, A, F5, 13, in[ 9], 1); \
ROUND2(A, B, C, D, E, F5, 15, in[ 2], 1); \
ROUND2(E, A, B, C, D, F5, 15, in[11], 1); \
ROUND2(D, E, A, B, C, F5, 5, in[ 4], 1); \
ROUND2(C, D, E, A, B, F5, 7, in[13], 1); \
ROUND2(B, C, D, E, A, F5, 7, in[ 6], 1); \
ROUND2(A, B, C, D, E, F5, 8, in[15], 1); \
ROUND2(E, A, B, C, D, F5, 11, in[ 8], 1); \
ROUND2(D, E, A, B, C, F5, 14, in[ 1], 1); \
ROUND2(C, D, E, A, B, F5, 14, in[10], 1); \
ROUND2(B, C, D, E, A, F5, 12, in[ 3], 1); \
ROUND2(A, B, C, D, E, F5, 6, in[12], 1); \
\
ROUND2(E, A, B, C, D, F4, 9, in[ 6], 2); \
ROUND2(D, E, A, B, C, F4, 13, in[11], 2); \
ROUND2(C, D, E, A, B, F4, 15, in[ 3], 2); \
ROUND2(B, C, D, E, A, F4, 7, in[ 7], 2); \
ROUND2(A, B, C, D, E, F4, 12, in[ 0], 2); \
ROUND2(E, A, B, C, D, F4, 8, in[13], 2); \
ROUND2(D, E, A, B, C, F4, 9, in[ 5], 2); \
ROUND2(C, D, E, A, B, F4, 11, in[10], 2); \
ROUND2(B, C, D, E, A, F4, 7, in[14], 2); \
ROUND2(A, B, C, D, E, F4, 7, in[15], 2); \
ROUND2(E, A, B, C, D, F4, 12, in[ 8], 2); \
ROUND2(D, E, A, B, C, F4, 7, in[12], 2); \
ROUND2(C, D, E, A, B, F4, 6, in[ 4], 2); \
ROUND2(B, C, D, E, A, F4, 15, in[ 9], 2); \
ROUND2(A, B, C, D, E, F4, 13, in[ 1], 2); \
ROUND2(E, A, B, C, D, F4, 11, in[ 2], 2); \
\
ROUND2(D, E, A, B, C, F3, 9, in[15], 3); \
ROUND2(C, D, E, A, B, F3, 7, in[ 5], 3); \
ROUND2(B, C, D, E, A, F3, 15, in[ 1], 3); \
ROUND2(A, B, C, D, E, F3, 11, in[ 3], 3); \
ROUND2(E, A, B, C, D, F3, 8, in[ 7], 3); \
ROUND2(D, E, A, B, C, F3, 6, in[14], 3); \
ROUND2(C, D, E, A, B, F3, 6, in[ 6], 3); \
ROUND2(B, C, D, E, A, F3, 14, in[ 9], 3); \
ROUND2(A, B, C, D, E, F3, 12, in[11], 3); \
ROUND2(E, A, B, C, D, F3, 13, in[ 8], 3); \
ROUND2(D, E, A, B, C, F3, 5, in[12], 3); \
ROUND2(C, D, E, A, B, F3, 14, in[ 2], 3); \
ROUND2(B, C, D, E, A, F3, 13, in[10], 3); \
ROUND2(A, B, C, D, E, F3, 13, in[ 0], 3); \
ROUND2(E, A, B, C, D, F3, 7, in[ 4], 3); \
ROUND2(D, E, A, B, C, F3, 5, in[13], 3); \
\
ROUND2(C, D, E, A, B, F2, 15, in[ 8], 4); \
ROUND2(B, C, D, E, A, F2, 5, in[ 6], 4); \
ROUND2(A, B, C, D, E, F2, 8, in[ 4], 4); \
ROUND2(E, A, B, C, D, F2, 11, in[ 1], 4); \
ROUND2(D, E, A, B, C, F2, 14, in[ 3], 4); \
ROUND2(C, D, E, A, B, F2, 14, in[11], 4); \
ROUND2(B, C, D, E, A, F2, 6, in[15], 4); \
ROUND2(A, B, C, D, E, F2, 14, in[ 0], 4); \
ROUND2(E, A, B, C, D, F2, 6, in[ 5], 4); \
ROUND2(D, E, A, B, C, F2, 9, in[12], 4); \
ROUND2(C, D, E, A, B, F2, 12, in[ 2], 4); \
ROUND2(B, C, D, E, A, F2, 9, in[13], 4); \
ROUND2(A, B, C, D, E, F2, 12, in[ 9], 4); \
ROUND2(E, A, B, C, D, F2, 5, in[ 7], 4); \
ROUND2(D, E, A, B, C, F2, 15, in[10], 4); \
ROUND2(C, D, E, A, B, F2, 8, in[14], 4); \
\
ROUND2(B, C, D, E, A, F1, 8, in[12], 5); \
ROUND2(A, B, C, D, E, F1, 5, in[15], 5); \
ROUND2(E, A, B, C, D, F1, 12, in[10], 5); \
ROUND2(D, E, A, B, C, F1, 9, in[ 4], 5); \
ROUND2(C, D, E, A, B, F1, 12, in[ 1], 5); \
ROUND2(B, C, D, E, A, F1, 5, in[ 5], 5); \
ROUND2(A, B, C, D, E, F1, 14, in[ 8], 5); \
ROUND2(E, A, B, C, D, F1, 6, in[ 7], 5); \
ROUND2(D, E, A, B, C, F1, 8, in[ 6], 5); \
ROUND2(C, D, E, A, B, F1, 13, in[ 2], 5); \
ROUND2(B, C, D, E, A, F1, 6, in[13], 5); \
ROUND2(A, B, C, D, E, F1, 5, in[14], 5); \
ROUND2(E, A, B, C, D, F1, 15, in[ 0], 5); \
ROUND2(D, E, A, B, C, F1, 13, in[ 3], 5); \
ROUND2(C, D, E, A, B, F1, 11, in[ 9], 5); \
ROUND2(B, C, D, E, A, F1, 11, in[11], 5); \
\
tmp = (h[1] + C1 + D2); \
h[1] = (h[2] + D1 + E2); \
h[2] = (h[3] + E1 + A2); \
h[3] = (h[4] + A1 + B2); \
h[4] = (h[0] + B1 + C2); \
h[0] = tmp; \
}
// Disabled single-hash variant: one RIPEMD-160 of a 32-byte message per
// thread, at an arbitrary byte offset into each thread's 64-byte slot.
// Superseded by lbry_ripemd160_gpu_hash_32x2 below, which does both
// 32-byte halves in one kernel launch. Kept for reference.
#if 0
__global__
void lbry_ripemd160_gpu_hash_32(const uint32_t threads, uint64_t *g_hash, const uint32_t byteOffset)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
// per-thread slot is 8 x u64 (64 bytes); byteOffset selects the half
uint32_t *hash = (uint32_t*) (&g_hash[thread * 8U + byteOffset/8]);
uint32_t in[16];
for (int i=0; i<8; i++)
in[i] = (hash[i]);
// one-block RIPEMD padding for a fixed 32-byte message
in[8] = 0x80;
#pragma unroll
for (int i=9;i<16;i++) in[i] = 0;
in[14] = 0x100; // size in bits
uint32_t h[5];
#pragma unroll
for (int i=0; i<5; i++)
h[i] = c_IV[i];
RIPEMD160_ROUND_BODY(in, h);
#pragma unroll
for (int i=0; i<5; i++)
hash[i] = h[i];
#ifdef PAD_ZEROS
// 20 bytes hash on 32 or 64 bytes output space
hash[5] = 0;
hash[6] = 0;
hash[7] = 0;
#endif
}
}
__host__
void lbry_ripemd160_hash_32(int thr_id, uint32_t threads, uint32_t *g_Hash, uint32_t byteOffset, cudaStream_t stream)
{
const uint32_t threadsperblock = 128;
dim3 grid(threads/threadsperblock);
dim3 block(threadsperblock);
lbry_ripemd160_gpu_hash_32 <<<grid, block, 0, stream>>> (threads, (uint64_t*) g_Hash, byteOffset);
}
#endif
__global__
//__launch_bounds__(256,6)
void lbry_ripemd160_gpu_hash_32x2(const uint32_t threads, uint64_t *g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
// each thread owns a 64-byte slot holding two 32-byte messages,
// hashed independently (RIPEMD-160 of each half, in place)
uint32_t *hash = (uint32_t*) (&g_hash[thread * 8U]);
for (int half = 0; half < 2; half++)
{
uint32_t in[16];
for (int i = 0; i < 8; i++)
in[i] = hash[i];
// single-block RIPEMD padding for a fixed 256-bit message
in[8] = 0x80;
#pragma unroll
for (int i = 9; i < 16; i++)
in[i] = 0;
in[14] = 0x100; // message length in bits
uint32_t h[5];
#pragma unroll
for (int i = 0; i < 5; i++)
h[i] = c_IV[i];
RIPEMD160_ROUND_BODY(in, h);
#pragma unroll
for (int i = 0; i < 5; i++)
hash[i] = h[i];
#ifdef PAD_ZEROS
// 20-byte digest written into a 32-byte slot: clear the tail words
hash[5] = 0;
hash[6] = 0;
hash[7] = 0;
#endif
hash += 8; // advance to the second 32-byte half
}
}
}
__host__
void lbry_ripemd160_hash_32x2(int thr_id, uint32_t threads, uint32_t *g_Hash, cudaStream_t stream)
{
// 128 threads per block; grid sized to cover `threads` work items
// (callers launch with a multiple of the block size)
const uint32_t TPB = 128;
dim3 block(TPB);
dim3 grid(threads / TPB);
lbry_ripemd160_gpu_hash_32x2 <<<grid, block, 0, stream>>> (threads, (uint64_t*) g_Hash);
}
// No device-side setup required: c_IV is initialized at compile time as a
// __constant__ array. Kept as an empty hook to match the other algos'
// *_init entry points called from lbry.cu.
void lbry_ripemd160_init(int thr_id)
{
//cudaMemcpyToSymbol(c_IV, IV, sizeof(IV), 0, cudaMemcpyHostToDevice);
}

712
lbry/cuda_sha256_lbry.cu

@ -0,0 +1,712 @@ @@ -0,0 +1,712 @@
/*
* sha256 CUDA implementation.
*/
#include <stdio.h>
#include <stdint.h>
#include <memory.h>
#include <cuda_helper.h>
#include <miner.h>
__constant__ static uint32_t __align__(8) c_midstate112[8];
__constant__ static uint32_t __align__(8) c_dataEnd112[12];
const __constant__ uint32_t __align__(8) c_H256[8] = {
0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU,
0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U
};
__constant__ static uint32_t __align__(8) c_K[64];
static __thread uint32_t* d_resNonces;
__constant__ static uint32_t __align__(8) c_target[2];
__device__ uint64_t d_target[1];
// ------------------------------------------------------------------------------------------------
static const uint32_t cpu_H256[8] = {
0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU,
0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U
};
static const uint32_t cpu_K[64] = {
0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
};
#define ROTR ROTR32
// --- CPU-side SHA-256 (FIPS 180-4), used to precompute midstates ---

// One compression step without message expansion; updates d and h in
// place while the caller rotates the role of each working variable.
__host__
static void sha256_step1_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d,
uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
uint32_t in, const uint32_t Kshared)
{
const uint32_t ch  = ((f ^ g) & e) ^ g; // Ch(e,f,g)
const uint32_t bs1 = ((e >> 6) | (e << 26)) ^ ((e >> 11) | (e << 21)) ^ ((e >> 25) | (e << 7));  // Sigma1(e)
const uint32_t bs0 = ((a >> 2) | (a << 30)) ^ ((a >> 13) | (a << 19)) ^ ((a >> 22) | (a << 10)); // Sigma0(a)
const uint32_t maj = (b & c) | ((b | c) & a); // Maj(a,b,c)
const uint32_t t1 = h + bs1 + ch + Kshared + in;
d += t1;
h = t1 + bs0 + maj;
}

// Step with on-the-fly message expansion over a 16-word circular
// schedule: W[pc] += ssig1(W[pc-2]) + W[pc-7] + ssig0(W[pc-15]), indices
// mod 16, then a normal compression step with the freshly expanded word.
__host__
static void sha256_step2_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d,
uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
uint32_t* in, uint32_t pc, const uint32_t Kshared)
{
const uint32_t w2  = in[(pc -  2) & 0xF];
const uint32_t w7  = in[(pc -  7) & 0xF];
const uint32_t w15 = in[(pc - 15) & 0xF];
const uint32_t s1 = ((w2 >> 17) | (w2 << 15)) ^ ((w2 >> 19) | (w2 << 13)) ^ (w2 >> 10);   // ssig1
const uint32_t s0 = ((w15 >> 7) | (w15 << 25)) ^ ((w15 >> 18) | (w15 << 14)) ^ (w15 >> 3); // ssig0
in[pc] = in[pc] + s1 + w7 + s0;
const uint32_t ch  = ((f ^ g) & e) ^ g;
const uint32_t bs1 = ((e >> 6) | (e << 26)) ^ ((e >> 11) | (e << 21)) ^ ((e >> 25) | (e << 7));
const uint32_t bs0 = ((a >> 2) | (a << 30)) ^ ((a >> 13) | (a << 19)) ^ ((a >> 22) | (a << 10));
const uint32_t maj = (b & c) | ((b | c) & a);
const uint32_t t1 = h + bs1 + ch + Kshared + in[pc];
d += t1;
h = t1 + bs0 + maj;
}

// Full 64-round SHA-256 compression of one 512-bit block.
// `in` holds 16 big-endian message words and is clobbered by the
// schedule expansion; `state` is updated in place; `Kshared` is the
// 64-entry round-constant table (cpu_K).
__host__
static void sha256_round_body_host(uint32_t* in, uint32_t* state, const uint32_t* Kshared)
{
uint32_t s[8];
for (int i = 0; i < 8; i++)
s[i] = state[i];
// at round r, working variable j (a=0 .. h=7) lives in s[(j - r) mod 8]
for (int r = 0; r < 16; r++)
sha256_step1_host(s[(0-r)&7], s[(1-r)&7], s[(2-r)&7], s[(3-r)&7],
s[(4-r)&7], s[(5-r)&7], s[(6-r)&7], s[(7-r)&7], in[r], Kshared[r]);
for (int r = 16; r < 64; r++)
sha256_step2_host(s[(0-r)&7], s[(1-r)&7], s[(2-r)&7], s[(3-r)&7],
s[(4-r)&7], s[(5-r)&7], s[(6-r)&7], s[(7-r)&7], in, r & 0xF, Kshared[r]);
for (int i = 0; i < 8; i++)
state[i] += s[i];
}
// 3-way XOR: a ^ b ^ c, collapsed to a single lop3.b32 on sm_50+ when
// built with CUDA 7.5+; plain two-XOR fallback otherwise.
__device__ __forceinline__
uint32_t xor3b(const uint32_t a, const uint32_t b, const uint32_t c) {
uint32_t result;
#if __CUDA_ARCH__ >= 500 && CUDA_VERSION >= 7050
// immediate 0x96 is the truth table of XOR of the three inputs
asm ("lop3.b32 %0, %1, %2, %3, 0x96; // xor3b" //0x96 = 0xF0 ^ 0xCC ^ 0xAA
: "=r"(result) : "r"(a), "r"(b),"r"(c));
#else
result = a^b^c;
#endif
return result;
}
/*
__device__ __forceinline__
uint32_t xor3b(const uint32_t a, const uint32_t b, const uint32_t c) {
uint32_t result;
asm("{ .reg .u32 t1; // xor3b \n\t"
"xor.b32 t1, %2, %3;\n\t"
"xor.b32 %0, %1, t1;"
"}"
: "=r"(result) : "r"(a) ,"r"(b),"r"(c));
return result;
}
#define xor3b(a,b,c) (a ^ b ^ c)
*/
// Sigma0(x) = ROTR(x,2) ^ ROTR(x,13) ^ ROTR(x,22)  (FIPS 180-4)
__device__ __forceinline__ uint32_t bsg2_0(const uint32_t x)
{
return xor3b(ROTR32(x, 2), ROTR32(x, 13), ROTR32(x, 22));
}
// Sigma1(x) = ROTR(x,6) ^ ROTR(x,11) ^ ROTR(x,25)  (FIPS 180-4)
__device__ __forceinline__ uint32_t bsg2_1(const uint32_t x)
{
return xor3b(ROTR32(x, 6), ROTR32(x, 11), ROTR32(x, 25));
}
// sigma0(x) = ROTR(x,7) ^ ROTR(x,18) ^ (x >> 3)  (FIPS 180-4)
// fix: the temporaries were declared uint64_t, forcing pointless 64-bit
// arithmetic and implicit narrowing at the xor3b(uint32_t) call — all
// values are 32-bit, so use uint32_t throughout (matches bsg2_0/bsg2_1).
__device__ __forceinline__ uint32_t ssg2_0(const uint32_t x)
{
uint32_t r1 = ROTR32(x,7);
uint32_t r2 = ROTR32(x,18);
uint32_t r3 = shr_t32(x,3);
return xor3b(r1,r2,r3);
}
// sigma1(x) = ROTR(x,17) ^ ROTR(x,19) ^ (x >> 10)  (FIPS 180-4)
// fix: the temporaries were declared uint64_t, forcing pointless 64-bit
// arithmetic and implicit narrowing at the xor3b(uint32_t) call — all
// values are 32-bit, so use uint32_t throughout (matches bsg2_0/bsg2_1).
__device__ __forceinline__ uint32_t ssg2_1(const uint32_t x)
{
uint32_t r1 = ROTR32(x,17);
uint32_t r2 = ROTR32(x,19);
uint32_t r3 = shr_t32(x,10);
return xor3b(r1,r2,r3);
}
// Maj(a,b,c) = (a & b) | (a & c) | (b & c), computed as
// (a & b) | ((a | b) & c) in four PTX instructions.
__device__ __forceinline__ uint32_t andor32(const uint32_t a, const uint32_t b, const uint32_t c)
{
uint32_t result;
asm("{\n\t"
".reg .u32 m,n,o;\n\t"
"and.b32 m, %1, %2;\n\t"
" or.b32 n, %1, %2;\n\t"
"and.b32 o, n, %3;\n\t"
" or.b32 %0, m, o ;\n\t"
"}\n\t" : "=r"(result) : "r"(a), "r"(b), "r"(c)
);
return result;
}
// Device-side SHA-256 compression step without message expansion
// (mirror of sha256_step1_host): updates d and h in place while the
// caller rotates the role of each working variable per round.
__device__
static void sha2_step1(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
uint32_t in, const uint32_t Kshared)
{
uint32_t t1,t2;
uint32_t vxandx = xandx(e, f, g); // Ch(e,f,g)
uint32_t bsg21 = bsg2_1(e); // Sigma1(e)
uint32_t bsg20 = bsg2_0(a); // Sigma0(a)
uint32_t andorv = andor32(a,b,c); // Maj(a,b,c)
t1 = h + bsg21 + vxandx + Kshared + in;
t2 = bsg20 + andorv;
d = d + t1;
h = t1 + t2;
}
// Device-side SHA-256 step with on-the-fly message expansion over a
// 16-word circular schedule (mirror of sha256_step2_host):
// W[pc] += ssig1(W[pc-2]) + W[pc-7] + ssig0(W[pc-15]), indices mod 16,
// then a normal compression step with the freshly expanded word.
__device__
static void sha2_step2(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
uint32_t* in, uint32_t pc, const uint32_t Kshared)
{
uint32_t t1,t2;
// pc is unsigned: pc-2 wraps for small pc, & 0xF still yields mod 16
int pcidx1 = (pc-2) & 0xF;
int pcidx2 = (pc-7) & 0xF;
int pcidx3 = (pc-15) & 0xF;
uint32_t inx0 = in[pc];
uint32_t inx1 = in[pcidx1];
uint32_t inx2 = in[pcidx2];
uint32_t inx3 = in[pcidx3];
uint32_t ssg21 = ssg2_1(inx1); // ssig1(W[pc-2])
uint32_t ssg20 = ssg2_0(inx3); // ssig0(W[pc-15])
uint32_t vxandx = xandx(e, f, g); // Ch(e,f,g)
uint32_t bsg21 = bsg2_1(e); // Sigma1(e)
uint32_t bsg20 = bsg2_0(a); // Sigma0(a)
uint32_t andorv = andor32(a,b,c); // Maj(a,b,c)
in[pc] = ssg21 + inx2 + ssg20 + inx0;
t1 = h + bsg21 + vxandx + Kshared + in[pc];
t2 = bsg20 + andorv;
d = d + t1;
h = t1 + t2;
}
/*
 * One full SHA-256 compression: mixes the 16-word message block `in`
 * into the 8-word chaining value `state` over 64 steps (FIPS 180-4).
 * `in` is clobbered: steps 16..63 rewrite the message schedule in place.
 */
__device__
static void sha256_round_body(uint32_t* in, uint32_t* state, uint32_t* const Kshared)
{
	uint32_t a = state[0];
	uint32_t b = state[1];
	uint32_t c = state[2];
	uint32_t d = state[3];
	uint32_t e = state[4];
	uint32_t f = state[5];
	uint32_t g = state[6];
	uint32_t h = state[7];

	/* rounds 0..15: registers rotate one position per step */
	sha2_step1(a,b,c,d,e,f,g,h,in[0], Kshared[0]);
	sha2_step1(h,a,b,c,d,e,f,g,in[1], Kshared[1]);
	sha2_step1(g,h,a,b,c,d,e,f,in[2], Kshared[2]);
	sha2_step1(f,g,h,a,b,c,d,e,in[3], Kshared[3]);
	sha2_step1(e,f,g,h,a,b,c,d,in[4], Kshared[4]);
	sha2_step1(d,e,f,g,h,a,b,c,in[5], Kshared[5]);
	sha2_step1(c,d,e,f,g,h,a,b,in[6], Kshared[6]);
	sha2_step1(b,c,d,e,f,g,h,a,in[7], Kshared[7]);
	sha2_step1(a,b,c,d,e,f,g,h,in[8], Kshared[8]);
	sha2_step1(h,a,b,c,d,e,f,g,in[9], Kshared[9]);
	sha2_step1(g,h,a,b,c,d,e,f,in[10],Kshared[10]);
	sha2_step1(f,g,h,a,b,c,d,e,in[11],Kshared[11]);
	sha2_step1(e,f,g,h,a,b,c,d,in[12],Kshared[12]);
	sha2_step1(d,e,f,g,h,a,b,c,in[13],Kshared[13]);
	sha2_step1(c,d,e,f,g,h,a,b,in[14],Kshared[14]);
	sha2_step1(b,c,d,e,f,g,h,a,in[15],Kshared[15]);

	/* rounds 16..63, 16 steps per iteration with in-place schedule expansion */
	#pragma unroll
	for (int i=0; i<3; i++)
	{
		sha2_step2(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]);
		sha2_step2(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]);
		sha2_step2(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]);
		sha2_step2(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]);
		sha2_step2(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]);
		sha2_step2(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]);
		sha2_step2(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]);
		sha2_step2(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]);
		sha2_step2(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]);
		sha2_step2(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]);
		sha2_step2(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]);
		sha2_step2(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]);
		sha2_step2(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]);
		sha2_step2(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]);
		sha2_step2(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]);
		sha2_step2(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]);
	}

	/* feed-forward */
	state[0] += a;
	state[1] += b;
	state[2] += c;
	state[3] += d;
	state[4] += e;
	state[5] += f;
	state[6] += g;
	state[7] += h;
}
/* Byte-swap each 32-bit half of a 64-bit word (the halves keep their position). */
__device__
uint64_t cuda_swab32ll(uint64_t x) {
	const uint32_t lo = cuda_swab32(_LODWORD(x));
	const uint32_t hi = cuda_swab32(_HIDWORD(x));
	return MAKE_ULONGLONG(lo, hi);
}
/* First SHA-256 over the 112-byte LBRY header: the first 64-byte block is
 * precomputed on the host (c_midstate112); each thread finishes the second
 * block with its own nonce at word 11 and writes the raw 32-byte digest. */
__global__
/*__launch_bounds__(256,3)*/
void lbry_sha256_gpu_hash_112(const uint32_t threads, const uint32_t startNonce, const bool swabNonce, uint64_t *outputHash)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		const uint32_t nonce = startNonce + thread;
		uint32_t dat[16];
		#pragma unroll
		for (int i=0;i<11;i++) dat[i] = c_dataEnd112[i]; // pre "swabed"
		dat[11] = swabNonce ? cuda_swab32(nonce) : nonce;
		dat[12] = 0x80000000; // padding: end-of-message bit
		dat[13] = 0;
		dat[14] = 0;
		dat[15] = 0x380; // total message length in bits (112 bytes)

		uint32_t __align__(8) buf[8];
		#pragma unroll
		for (int i=0;i<8;i++) buf[i] = c_midstate112[i];

		sha256_round_body(dat, buf, c_K);

		// output (kept in internal word order; the next sha256 pass re-reads it)
		uint2* output = (uint2*) (&outputHash[thread * 8U]);
		#pragma unroll
		for (int i=0;i<4;i++) {
			//output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
			output[i] = vectorize(((uint64_t*)buf)[i]); // out without swap, new sha256 after
		}
	}
}
/* Second SHA-256 of the sha256d stage: hashes the 32-byte digest stored in
 * Hash512[thread] (unswapped words from lbry_sha256_gpu_hash_112) and writes
 * the byte-swapped result back into the same 64-byte slot. */
__global__
/*__launch_bounds__(256,3)*/
void lbry_sha256_gpu_hash_32(uint32_t threads, uint64_t *Hash512)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		uint32_t __align__(8) buf[8]; // align for vectorize
		#pragma unroll
		for (int i=0; i<8; i++) buf[i] = c_H256[i];

		uint32_t* input = (uint32_t*) (&Hash512[thread * 8U]);
		uint32_t dat[16];
		#pragma unroll
		//for (int i=0;i<8;i++) dat[i] = cuda_swab32(input[i]);
		for (int i=0; i<8; i++) dat[i] = input[i];
		dat[8] = 0x80000000; // padding bit
		#pragma unroll
		for (int i=9; i<15; i++) dat[i] = 0;
		dat[15] = 0x100; // 32 bytes = 256 bits

		sha256_round_body(dat, buf, c_K);

		// output, swapped to canonical byte order for the sha512 stage
		uint2* output = (uint2*) input;
		#pragma unroll
		for (int i=0;i<4;i++) {
			output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
		}
#ifdef PAD_ZEROS
		#pragma unroll
		for (int i=4; i<8; i++) output[i] = vectorize(0);
#endif
	}
}
/* Combined sha256d over the 112-byte header: midstate + nonce block, then a
 * second SHA-256 over the 32-byte digest. Round constants are staged in
 * dynamic shared memory (64 words, launched with 64*4 bytes). */
__global__
/*__launch_bounds__(256,3)*/
void lbry_sha256d_gpu_hash_112(const uint32_t threads, const uint32_t startNonce, const bool swabNonce, uint64_t *outputHash)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	extern __shared__ uint32_t s_K[];
	// The first 64 threads of the block copy c_K into shared memory.
	if (threadIdx.x < 64U) s_K[threadIdx.x] = c_K[threadIdx.x];
	// Fix: barrier before any thread reads s_K. Without it, threads outside
	// the first two warps could use s_K before it was fully written.
	__syncthreads();
	if (thread < threads)
	{
		const uint32_t nonce = startNonce + thread;
		uint32_t dat[16];
		#pragma unroll
		for (int i=0; i<11; i++) dat[i] = c_dataEnd112[i]; // pre-swabbed header tail
		dat[11] = swabNonce ? cuda_swab32(nonce) : nonce;
		dat[12] = 0x80000000; // padding bit
		dat[13] = 0;
		dat[14] = 0;
		dat[15] = 0x380; // 112 bytes = 0x380 bits

		uint32_t __align__(8) buf[8];
		#pragma unroll
		for (int i=0;i<8;i++) buf[i] = c_midstate112[i];

		sha256_round_body(dat, buf, s_K);

		// second sha256 over the 32-byte digest
		#pragma unroll
		for (int i=0; i<8; i++) dat[i] = buf[i];
		dat[8] = 0x80000000;
		#pragma unroll
		for (int i=9; i<15; i++) dat[i] = 0;
		dat[15] = 0x100; // 32 bytes = 256 bits

		#pragma unroll
		for (int i=0; i<8; i++) buf[i] = c_H256[i];

		sha256_round_body(dat, buf, s_K);

		// output, byte-swapped for the sha512 stage
		uint2* output = (uint2*) (&outputHash[thread * 8U]);
		#pragma unroll
		for (int i=0;i<4;i++) {
			output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
		}
	}
}
/* SHA-256 over the concatenation of the two 20-byte ripemd160 digests stored
 * in the low and high halves of the thread's 64-byte Hash512 slot. */
__global__
/*__launch_bounds__(256,3)*/
void lbry_sha256_gpu_hash_20x2(uint32_t threads, uint64_t *Hash512)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		uint32_t __align__(8) buf[8]; // align for vectorize
		#pragma unroll
		for (int i=0;i<8;i++) buf[i] = c_H256[i];

		uint32_t* input = (uint32_t*) (&Hash512[thread * 8U]);
		uint32_t dat[16];
		// message = ripemd(low half) || ripemd(high half), 40 bytes total
		#pragma unroll
		for (int i=0;i<5;i++) dat[i] = cuda_swab32(input[i]);
		#pragma unroll
		for (int i=0;i<5;i++) dat[i+5] = cuda_swab32(input[i+8]);
		dat[10] = 0x80000000; // padding bit
		#pragma unroll
		for (int i=11;i<15;i++) dat[i] = 0;
		dat[15] = 0x140; // 40 bytes = 320 bits

		sha256_round_body(dat, buf, c_K);

		// output (unswapped; a second sha256 pass follows)
		uint2* output = (uint2*) input;
		#pragma unroll
		for (int i=0;i<4;i++) {
			//output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
			output[i] = vectorize(((uint64_t*)buf)[i]);
		}
#ifdef PAD_ZEROS
		#pragma unroll
		for (int i=4; i<8; i++) output[i] = vectorize(0);
#endif
	}
}
/* sha256d over the two concatenated 20-byte ripemd160 digests (40 bytes).
 * Round constants are staged in dynamic shared memory (launched with 64*4
 * bytes). By default only the high 64 bits of the final digest are written
 * back (enough for the target check); define FULL_HASH for all 256 bits. */
__global__
/*__launch_bounds__(256,3)*/
void lbry_sha256d_gpu_hash_20x2(uint32_t threads, uint64_t *Hash512)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	extern __shared__ uint32_t s_K[];
	if (threadIdx.x < 64U) s_K[threadIdx.x] = c_K[threadIdx.x];
	// Fix: barrier so no thread reads s_K before the first 64 threads have
	// written it (shared-memory race in the original).
	__syncthreads();
	if (thread < threads)
	{
		uint32_t __align__(8) buf[8]; // align for vectorize
		#pragma unroll
		for (int i=0;i<8;i++) buf[i] = c_H256[i];

		uint32_t* input = (uint32_t*) (&Hash512[thread * 8U]);
		uint32_t dat[16];
		// message = ripemd(low half) || ripemd(high half)
		#pragma unroll
		for (int i=0; i<5; i++) dat[i] = cuda_swab32(input[i]);
		#pragma unroll
		for (int i=0; i<5; i++) dat[i+5] = cuda_swab32(input[i+8]);
		dat[10] = 0x80000000;
		#pragma unroll
		for (int i=11;i<15;i++) dat[i] = 0;
		dat[15] = 0x140; // 40 bytes = 320 bits

		sha256_round_body(dat, buf, s_K);

		// second sha256
		#pragma unroll
		for (int i=0; i<8; i++) dat[i] = buf[i];
		dat[8] = 0x80000000;
		#pragma unroll
		for (int i=9; i<15; i++) dat[i] = 0;
		dat[15] = 0x100;

		#pragma unroll
		for (int i=0;i<8;i++) buf[i] = c_H256[i];

		sha256_round_body(dat, buf, s_K);

		// output
		uint2* output = (uint2*) input;
#ifdef FULL_HASH
		#pragma unroll
		for (int i=0;i<4;i++) {
			output[i] = vectorize(cuda_swab32ll(((uint64_t*)buf)[i]));
		}
# ifdef PAD_ZEROS
		#pragma unroll
		for (int i=4; i<8; i++) output[i] = vectorize(0);
# endif
#else
		// only the top 64 bits are needed by the final check
		output[3] = vectorize(cuda_swab32ll(((uint64_t*)buf)[3]));
#endif
	}
}
/* Upload the SHA-256 round constants to constant memory and allocate the
 * device-side result-nonce buffer (freed in lbry_sha256_free). */
__host__
void lbry_sha256_init(int thr_id)
{
	// Fix: check the symbol upload like the other setup calls do.
	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_K, cpu_K, sizeof(cpu_K), 0, cudaMemcpyHostToDevice));
	CUDA_SAFE_CALL(cudaMalloc(&d_resNonces, 4*sizeof(uint32_t)));
}
/* Release the device result-nonce buffer allocated by lbry_sha256_init. */
__host__
void lbry_sha256_free(int thr_id)
{
	cudaFree(d_resNonces);
	// Defensive: avoid a dangling pointer if free is ever called twice
	// (cudaFree(NULL) is a harmless no-op).
	d_resNonces = NULL;
}
/* Precompute the SHA-256 midstate of the first 64-byte header block on the
 * host and upload it, the remaining 44 header bytes, and the 64-bit target. */
__host__
void lbry_sha256_setBlock_112(uint32_t *pdata, uint32_t *ptarget)
{
	uint32_t in[16], buf[8], end[11];

	// First 64-byte block and header tail, as big-endian words
	for (int i = 0; i < 16; i++)
		in[i] = cuda_swab32(pdata[i]);
	for (int i = 0; i < 11; i++)
		end[i] = cuda_swab32(pdata[16 + i]);

	// Compress the first block from the standard IV to get the midstate
	for (int i = 0; i < 8; i++)
		buf[i] = cpu_H256[i];
	sha256_round_body_host(in, buf, cpu_K);

	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_midstate112, buf, 32, 0, cudaMemcpyHostToDevice));
	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_dataEnd112, end, sizeof(end), 0, cudaMemcpyHostToDevice));

	// 64-bit target (words 6..7), one copy per checking path
	CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_target, &ptarget[6], sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
	CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &ptarget[6], sizeof(uint64_t), 0, cudaMemcpyHostToDevice));
}
/* Launch the single-sha256 112-byte kernel over `threads` nonces. */
__host__
void lbry_sha256_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash, bool swabNonce, cudaStream_t stream)
{
	const int threadsperblock = 256;
	// Fix: round up (like lbry_sha512_hash_32) so a thread count that is not
	// a multiple of 256 still covers every nonce; the kernel guards with
	// `thread < threads`.
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);
	lbry_sha256_gpu_hash_112 <<<grid, block, 0, stream>>> (threads, startNonce, swabNonce, (uint64_t*) d_outputHash);
	cudaGetLastError();
}
/* Launch the second sha256 pass over the 32-byte digests. */
__host__
void lbry_sha256_hash_32(int thr_id, uint32_t threads, uint32_t *d_Hash, cudaStream_t stream)
{
	const int threadsperblock = 256;
	// Fix: round up so non-multiple-of-256 thread counts are fully covered.
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);
	lbry_sha256_gpu_hash_32 <<<grid, block, 0, stream>>> (threads, (uint64_t*) d_Hash);
}
/* Launch the combined sha256d 112-byte kernel (64 words of dynamic shared
 * memory for the round constants). */
__host__
void lbry_sha256d_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash, bool swabNonce, cudaStream_t stream)
{
	const int threadsperblock = 256;
	// Fix: round up so non-multiple-of-256 thread counts are fully covered.
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);
	lbry_sha256d_gpu_hash_112 <<<grid, block, 64*4, stream>>> (threads, startNonce, swabNonce, (uint64_t*) d_outputHash);
}
/* Launch the sha256 pass over the two concatenated ripemd digests. */
__host__
void lbry_sha256_hash_20x2(int thr_id, uint32_t threads, uint32_t *d_Hash, cudaStream_t stream)
{
	const int threadsperblock = 256;
	// Fix: round up so non-multiple-of-256 thread counts are fully covered.
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);
	lbry_sha256_gpu_hash_20x2 <<<grid, block, 0, stream>>> (threads, (uint64_t*) d_Hash);
}
/* Launch the sha256d pass over the two concatenated ripemd digests
 * (64 words of dynamic shared memory for the round constants). */
__host__
void lbry_sha256d_hash_20x2(int thr_id, uint32_t threads, uint32_t *d_Hash, cudaStream_t stream)
{
	const int threadsperblock = 256;
	// Fix: round up so non-multiple-of-256 thread counts are fully covered.
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);
	lbry_sha256d_gpu_hash_20x2 <<<grid, block, 64*4, stream>>> (threads, (uint64_t*) d_Hash);
}
/* Final sha256d + target check: recomputes sha256d(ripemd||ripemd) for each
 * thread and records nonces whose top 64 bits are <= the current target. */
__global__
__launch_bounds__(256,3)
void lbry_sha256d_gpu_hash_final(const uint32_t threads, const uint32_t startNonce, uint64_t *Hash512, uint32_t *resNonces)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		uint32_t __align__(8) buf[8]; // align for vectorize
		#pragma unroll
		for (int i=0;i<8;i++) buf[i] = c_H256[i];

		uint32_t* input = (uint32_t*) (&Hash512[thread * 8U]);
		uint32_t __align__(8) dat[16];
		// message = ripemd(low half) || ripemd(high half), 40 bytes
		#pragma unroll
		for (int i=0;i<5;i++) dat[i] = cuda_swab32(input[i]);
		#pragma unroll
		for (int i=0;i<5;i++) dat[i+5] = cuda_swab32(input[i+8]);
		dat[10] = 0x80000000;
		#pragma unroll
		for (int i=11;i<15;i++) dat[i] = 0;
		dat[15] = 0x140; // 40 bytes = 320 bits

		sha256_round_body(dat, buf, c_K);

		// second sha256
		#pragma unroll
		for (int i=0;i<8;i++) dat[i] = buf[i];
		dat[8] = 0x80000000;
		#pragma unroll
		for (int i=9;i<15;i++) dat[i] = 0;
		dat[15] = 0x100;

		#pragma unroll
		for (int i=0;i<8;i++) buf[i] = c_H256[i];

		sha256_round_body(dat, buf, c_K);

		// valid nonces: atomicExch keeps the newest hit in resNonces[0] and
		// moves the previous one to resNonces[1]
		uint64_t high = cuda_swab32ll(((uint64_t*)buf)[3]);
		if (high <= d_target[0]) {
			// printf("%08x %08x - %016llx %016llx - %08x %08x\n", buf[7], buf[6], high, d_target[0], c_target[1], c_target[0]);
			uint32_t nonce = startNonce + thread;
			resNonces[1] = atomicExch(resNonces, nonce);
			// NOTE(review): this tightening of d_target is not atomic across
			// blocks; hits are rare so races only cost extra candidates —
			// confirm this is acceptable.
			d_target[0] = high;
		}
	}
}
/* Run the final check kernel and copy back up to two candidate nonces.
 * resNonces[0]/[1] are set to UINT32_MAX when no (second) hit was found. */
__host__
void lbry_sha256d_hash_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_inputHash, uint32_t *resNonces, cudaStream_t stream)
{
	const int threadsperblock = 256;
	// Fix: round up so non-multiple-of-256 thread counts are fully covered.
	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
	dim3 block(threadsperblock);

	CUDA_SAFE_CALL(cudaMemset(d_resNonces, 0xFF, 2 * sizeof(uint32_t)));
	// cudaThreadSynchronize() is deprecated; use the modern equivalent
	// (already used elsewhere in this file).
	cudaDeviceSynchronize();
	lbry_sha256d_gpu_hash_final <<<grid, block, 0, stream>>> (threads, startNonce, (uint64_t*) d_inputHash, d_resNonces);
	cudaDeviceSynchronize();
	CUDA_SAFE_CALL(cudaMemcpy(resNonces, d_resNonces, 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost));
	// A single hit leaves both slots equal; report only one nonce then.
	if (resNonces[0] == resNonces[1]) {
		resNonces[1] = UINT32_MAX;
	}
}

181
lbry/cuda_sha512_lbry.cu

@ -0,0 +1,181 @@ @@ -0,0 +1,181 @@
/**
* sha-512 CUDA implementation.
*/
#include <stdio.h>
#include <stdint.h>
#include <memory.h>
#include <cuda_helper.h>
/* Device constant-memory copy of the SHA-512 round constants,
 * uploaded by lbry_sha512_init(). */
static __constant__ uint64_t K_512[80];

/* SHA-512 round constants (FIPS 180-4), host master copy. */
static const uint64_t K512[80] = {
	0x428A2F98D728AE22, 0x7137449123EF65CD, 0xB5C0FBCFEC4D3B2F, 0xE9B5DBA58189DBBC,
	0x3956C25BF348B538, 0x59F111F1B605D019, 0x923F82A4AF194F9B, 0xAB1C5ED5DA6D8118,
	0xD807AA98A3030242, 0x12835B0145706FBE, 0x243185BE4EE4B28C, 0x550C7DC3D5FFB4E2,
	0x72BE5D74F27B896F, 0x80DEB1FE3B1696B1, 0x9BDC06A725C71235, 0xC19BF174CF692694,
	0xE49B69C19EF14AD2, 0xEFBE4786384F25E3, 0x0FC19DC68B8CD5B5, 0x240CA1CC77AC9C65,
	0x2DE92C6F592B0275, 0x4A7484AA6EA6E483, 0x5CB0A9DCBD41FBD4, 0x76F988DA831153B5,
	0x983E5152EE66DFAB, 0xA831C66D2DB43210, 0xB00327C898FB213F, 0xBF597FC7BEEF0EE4,
	0xC6E00BF33DA88FC2, 0xD5A79147930AA725, 0x06CA6351E003826F, 0x142929670A0E6E70,
	0x27B70A8546D22FFC, 0x2E1B21385C26C926, 0x4D2C6DFC5AC42AED, 0x53380D139D95B3DF,
	0x650A73548BAF63DE, 0x766A0ABB3C77B2A8, 0x81C2C92E47EDAEE6, 0x92722C851482353B,
	0xA2BFE8A14CF10364, 0xA81A664BBC423001, 0xC24B8B70D0F89791, 0xC76C51A30654BE30,
	0xD192E819D6EF5218, 0xD69906245565A910, 0xF40E35855771202A, 0x106AA07032BBD1B8,
	0x19A4C116B8D2D0C8, 0x1E376C085141AB53, 0x2748774CDF8EEB99, 0x34B0BCB5E19B48A8,
	0x391C0CB3C5C95A63, 0x4ED8AA4AE3418ACB, 0x5B9CCA4F7763E373, 0x682E6FF3D6B2B8A3,
	0x748F82EE5DEFB2FC, 0x78A5636F43172F60, 0x84C87814A1F0AB72, 0x8CC702081A6439EC,
	0x90BEFFFA23631E28, 0xA4506CEBDE82BDE9, 0xBEF9A3F7B2C67915, 0xC67178F2E372532B,
	0xCA273ECEEA26619C, 0xD186B8C721C0C207, 0xEADA7DD6CDE0EB1E, 0xF57D4F7FEE6ED178,
	0x06F067AA72176FBA, 0x0A637DC5A2C898A6, 0x113F9804BEF90DAE, 0x1B710B35131C471B,
	0x28DB77F523047D84, 0x32CAAB7B40C72493, 0x3C9EBE0A15C9BEBC, 0x431D67C49C100D4C,
	0x4CC5D4BECB3E42B6, 0x597F299CFC657E2A, 0x5FCB6FAB3AD6FAEC, 0x6C44198C4A475817
};
//#undef xor3
//#define xor3(a,b,c) (a^b^c)
//#undef
/* SHA-512 big sigma 0: ROTR(x,28) ^ ROTR(x,34) ^ ROTR(x,39) */
static __device__ __forceinline__
uint64_t bsg5_0(const uint64_t x)
{
	return xor3(ROTR64(x, 28), ROTR64(x, 34), ROTR64(x, 39));
}
/* SHA-512 big sigma 1: ROTR(x,14) ^ ROTR(x,18) ^ ROTR(x,41) */
static __device__ __forceinline__
uint64_t bsg5_1(const uint64_t x)
{
	return xor3(ROTR64(x, 14), ROTR64(x, 18), ROTR64(x, 41));
}
/* SHA-512 small sigma 0: ROTR(x,1) ^ ROTR(x,8) ^ (x >> 7) */
static __device__ __forceinline__
uint64_t ssg5_0(const uint64_t x)
{
	return xor3(ROTR64(x, 1), ROTR64(x, 8), shr_t64(x, 7));
}
/* SHA-512 small sigma 1: ROTR(x,19) ^ ROTR(x,61) ^ (x >> 6) */
static __device__ __forceinline__
uint64_t ssg5_1(const uint64_t x)
{
	return xor3(ROTR64(x, 19), ROTR64(x, 61), shr_t64(x, 6));
}
/* Choice function CH(a,b,c) = ((b ^ c) & a) ^ c, as 3-op inline PTX.
 * This is the Ch() term of the SHA-512 compression function. */
static __device__ __forceinline__
uint64_t xandx64(const uint64_t a, const uint64_t b, const uint64_t c)
{
	uint64_t result;
	asm("{ .reg .u64 m,n; // xandx64\n\t"
		"xor.b64 m, %2,%3;\n\t"
		"and.b64 n, m,%1;\n\t"
		"xor.b64 %0, n,%3;\n\t"
		"}" : "=l"(result) : "l"(a), "l"(b), "l"(c));
	return result;
}
/* One SHA-512 round. The eight working variables live in r[]; instead of
 * shuffling them, `ord` (round number mod 8) rotates the role assignment. */
static __device__ __forceinline__
void sha512_step2(uint64_t* r, uint64_t* W, uint64_t* K, const int ord, int i)
{
	const int u = 8 - ord;
	const uint64_t a = r[(0 + u) & 7];
	const uint64_t b = r[(1 + u) & 7];
	const uint64_t c = r[(2 + u) & 7];
	const uint64_t e = r[(4 + u) & 7];
	const uint64_t f = r[(5 + u) & 7];
	const uint64_t g = r[(6 + u) & 7];
	const uint64_t h = r[(7 + u) & 7];

	// T1 = h + BSG1(e) + CH(e,f,g) + W[i] + K[i] ; T2 = BSG0(a) + MAJ(a,b,c)
	const uint64_t T1 = h + bsg5_1(e) + xandx64(e, f, g) + W[i] + K[i];
	const uint64_t T2 = bsg5_0(a) + andor(a, b, c);
	r[(3 + u) & 7] += T1;      // d = d + T1
	r[(7 + u) & 7] = T1 + T2;  // h = T1 + T2
}
/**************************************************************************************************/
/* SHA-512 of each thread's 32-byte sha256d digest; the 64-byte result
 * overwrites the thread's Hash512 slot (ripemd160 consumes both halves). */
__global__
void lbry_sha512_gpu_hash_32(const uint32_t threads, uint64_t *g_hash)
{
	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
	if (thread < threads)
	{
		uint64_t *pHash = &g_hash[thread * 8U];

		uint64_t W[80];
		uint64_t r[8];

		uint64_t IV512[8] = {
			0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
			0x510E527FADE682D1, 0x9B05688C2B3E6C1F, 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
		};

		#pragma unroll
		for (int i = 0; i < 8; i++)
			r[i] = IV512[i];

		#pragma unroll
		for (int i = 0; i < 4; i++) {
			// 32 bytes input
			W[i] = cuda_swab64(pHash[i]);
		}

		W[4] = 0x8000000000000000; // end tag (padding bit)

		#pragma unroll
		for (int i = 5; i < 15; i++) W[i] = 0;

		W[15] = 0x100; // 256 bits message length

		#pragma unroll
		for (int i = 16; i < 80; i++) W[i] = 0;

		// message schedule expansion
		#pragma unroll 64
		for (int i = 16; i < 80; i++)
			W[i] = ssg5_1(W[i - 2]) + W[i - 7] + ssg5_0(W[i - 15]) + W[i - 16];

		// 80 rounds, 8 per iteration with rotating register roles
		#pragma unroll 10
		for (int i = 0; i < 10; i++) {
			#pragma unroll 8
			for (int ord=0; ord<8; ord++)
				sha512_step2(r, W, K_512, ord, 8*i + ord);
		}

		// feed-forward and store big-endian
		#pragma unroll 8
		for (int i = 0; i < 8; i++)
			pHash[i] = cuda_swab64(r[i] + IV512[i]);
	}
}
/* Launch the SHA-512 kernel over all threads (grid rounded up). */
__host__
void lbry_sha512_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash, cudaStream_t stream)
{
	const int threadsperblock = 256;
	dim3 grid((threads + threadsperblock-1)/threadsperblock);
	dim3 block(threadsperblock);

	// NOTE(review): the kernel declares no extern __shared__ storage, so this
	// dynamic shared-memory request looks unused — confirm before removing
	// (reserving it lowers occupancy).
	size_t shared_size = 80*8;
	lbry_sha512_gpu_hash_32 <<<grid, block, shared_size, stream>>> (threads, (uint64_t*)d_hash);
}
/**************************************************************************************************/
/* Upload the SHA-512 round constants into device constant memory. */
__host__
void lbry_sha512_init(int thr_id)
{
	cudaMemcpyToSymbol(K_512, K512, sizeof(K512), 0, cudaMemcpyHostToDevice);
}

225
lbry/lbry.cu

@ -0,0 +1,225 @@ @@ -0,0 +1,225 @@
/**
* Lbry CUDA Implementation
*
* by tpruvot@github - July 2016
*
*/
#include <string.h>
#include <stdint.h>
extern "C" {
#include <sph/sph_sha2.h>
#include <sph/sph_ripemd.h>
}
#include <cuda_helper.h>
#include <miner.h>
#define A 64
#define debug_cpu 0
/* CPU reference implementation of the LBRY proof-of-work hash, used to
 * validate GPU results:
 *   sha256d(112-byte header) -> sha512 (64 bytes)
 *   -> ripemd160(low 32B) || ripemd160(high 32B) (40 bytes)
 *   -> sha256d -> 32-byte output.
 * NOTE(review): contexts are reused without re-init between messages; this
 * relies on sph_*_close() resetting the context (sph_ripemd160_close does —
 * confirm the same for sph_sha256_close). */
extern "C" void lbry_hash(void* output, const void* input)
{
	uint32_t _ALIGN(A) hashA[16];
	uint32_t _ALIGN(A) hashB[8];
	uint32_t _ALIGN(A) hashC[8];

	sph_sha256_context ctx_sha256;
	sph_sha512_context ctx_sha512;
	sph_ripemd160_context ctx_ripemd;

	// sha256d over the full 112-byte header
	sph_sha256_init(&ctx_sha256);
	sph_sha256(&ctx_sha256, input, 112);
	sph_sha256_close(&ctx_sha256, hashA);

	sph_sha256(&ctx_sha256, hashA, 32);
	sph_sha256_close(&ctx_sha256, hashA);

	// sha512 of the 32-byte digest -> 64 bytes in hashA
	sph_sha512_init(&ctx_sha512);
	sph_sha512(&ctx_sha512, hashA, 32);
	sph_sha512_close(&ctx_sha512, hashA);

	sph_ripemd160_init(&ctx_ripemd);
	sph_ripemd160(&ctx_ripemd, hashA, 32); // sha512 low
	sph_ripemd160_close(&ctx_ripemd, hashB);
	if (debug_cpu) applog_hex(hashB, 20);

	sph_ripemd160(&ctx_ripemd, &hashA[8], 32); // sha512 high
	sph_ripemd160_close(&ctx_ripemd, hashC);
	if (debug_cpu) applog_hex(hashC, 20);

	// sha256d over the two concatenated 20-byte ripemd digests
	sph_sha256(&ctx_sha256, hashB, 20);
	sph_sha256(&ctx_sha256, hashC, 20);
	sph_sha256_close(&ctx_sha256, hashA);
	if (debug_cpu) applog_hex(hashA,32);

	sph_sha256(&ctx_sha256, hashA, 32);
	sph_sha256_close(&ctx_sha256, hashA);

	memcpy(output, hashA, 32);
}
/* ############################################################################################################################### */
extern void lbry_ripemd160_init(int thr_id);
extern void lbry_sha256_init(int thr_id);
extern void lbry_sha256_free(int thr_id);
extern void lbry_sha256_setBlock_112(uint32_t *pdata, uint32_t *ptarget);
extern void lbry_sha256_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash, bool swabNonce, cudaStream_t stream);
extern void lbry_sha256d_hash_112(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash, bool swabNonce, cudaStream_t stream);
extern void lbry_sha256_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash, cudaStream_t stream);
extern void lbry_sha512_init(int thr_id);
extern void lbry_sha512_hash_32(int thr_id, uint32_t threads, uint32_t *d_hash, cudaStream_t stream);
extern void lbry_ripemd160_hash_32x2(int thr_id, uint32_t threads, uint32_t *g_Hash, cudaStream_t stream);
extern void lbry_sha256_hash_20x2(int thr_id, uint32_t threads, uint32_t *g_Hash, cudaStream_t stream);
extern void lbry_sha256d_hash_20x2(int thr_id, uint32_t threads, uint32_t *g_Hash, cudaStream_t stream);
extern void lbry_sha256d_hash_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_inputHash, uint32_t *resNonces, cudaStream_t stream);
/* Byte-swap a 32-bit value only when `iftrue` is set. */
static __inline uint32_t swab32_if(uint32_t val, bool iftrue) {
	if (!iftrue)
		return val;
	return swab32(val);
}
static bool init[MAX_GPUS] = { 0 };
static uint32_t *d_hash[MAX_GPUS];
// nonce position is different
#define LBC_NONCE_OFT32 27
/* Scan a range of nonces for the lbry algo on GPU thread `thr_id`.
 * Returns the number of valid nonces found (0, 1 or 2); found nonces are
 * stored in work->nonces[] and pdata[LBC_NONCE_OFT32]. */
extern "C" int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done)
{
	uint32_t _ALIGN(A) vhash[8];
	uint32_t _ALIGN(A) endiandata[28];
	uint32_t *pdata = work->data;
	uint32_t *ptarget = work->target;
	const uint32_t first_nonce = pdata[LBC_NONCE_OFT32];
	const int swap = 0; // to toggle nonce endian

	const int dev_id = device_map[thr_id];

	// default intensity per GPU generation
	int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 22 : 20;
	if (device_sm[dev_id] >= 600) intensity = 23;
	if (device_sm[dev_id] < 350) intensity = 18;
	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
	if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);

	if (opt_benchmark) {
		ptarget[7] = 0xff;
	}

	// one-time per-thread device setup
	if (!init[thr_id]){
		cudaSetDevice(dev_id);
		if (opt_cudaschedule == -1 && gpu_threads == 1) {
			cudaDeviceReset();
			// reduce cpu usage (linux)
			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
			CUDA_LOG_ERROR();
		}

		// 64 bytes per thread (sha512 output slot, reused by every stage)
		CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput));

		lbry_sha256_init(thr_id);
		lbry_sha512_init(thr_id);
		lbry_ripemd160_init(thr_id);
		cuda_check_cpu_init(thr_id, throughput);
		CUDA_LOG_ERROR();

		init[thr_id] = true;
	}

	// big-endian header (without the nonce word)
	for (int i=0; i < LBC_NONCE_OFT32; i++) {
		be32enc(&endiandata[i], pdata[i]);
	}

	lbry_sha256_setBlock_112(endiandata, ptarget);
	cuda_check_cpu_setTarget(ptarget);

	do {
		// Hash with CUDA
#if 0
		lbry_sha256_hash_112(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id], swap, 0);
		lbry_sha256_hash_32(thr_id, throughput, d_hash[thr_id], 0);
#else
		lbry_sha256d_hash_112(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id], swap, 0);
#endif
		CUDA_LOG_ERROR();

		lbry_sha512_hash_32(thr_id, throughput, d_hash[thr_id], 0);

		lbry_ripemd160_hash_32x2(thr_id, throughput, d_hash[thr_id], 0);

#if 0
		lbry_sha256d_hash_20x2(thr_id, throughput, d_hash[thr_id], 0);
		uint32_t foundNonce = cuda_check_hash(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id]);
#else
		uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
		lbry_sha256d_hash_final(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id], resNonces, 0);
		uint32_t foundNonce = resNonces[0];
#endif
		*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce + throughput;

		if (foundNonce != UINT32_MAX)
		{
			// re-check the candidate on CPU before submitting
			endiandata[LBC_NONCE_OFT32] = swab32_if(foundNonce, !swap);
			lbry_hash(vhash, endiandata);

			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
				int res = 1;
				uint32_t secNonce = resNonces[1];
				//uint32_t secNonce = cuda_check_hash_suppl(thr_id, throughput, pdata[LBC_NONCE_OFT32], d_hash[thr_id], 1);
				work->nonces[0] = swab32_if(foundNonce, swap);
				work_set_target_ratio(work, vhash);
				if (secNonce != UINT32_MAX) {
				//if (secNonce) {
					// second candidate: keep the best one in nonces[0]
					if (opt_debug)
						gpulog(LOG_BLUE, thr_id, "found second nonce %08x", swab32(secNonce));
					endiandata[LBC_NONCE_OFT32] = swab32_if(secNonce, !swap);
					lbry_hash(vhash, endiandata);
					if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio) {
						work_set_target_ratio(work, vhash);
						xchg(work->nonces[0], work->nonces[1]);
					}
					work->nonces[1] = swab32_if(secNonce, swap);
					res++;
				}
				pdata[LBC_NONCE_OFT32] = work->nonces[0];
				return res;
			} else {
				gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU %08x > %08x!", foundNonce, vhash[7], ptarget[7]);
			}
		}

		if ((uint64_t) throughput + pdata[LBC_NONCE_OFT32] >= max_nonce) {
			pdata[LBC_NONCE_OFT32] = max_nonce;
			break;
		}

		pdata[LBC_NONCE_OFT32] += throughput;

	} while (!work_restart[thr_id].restart);

	//*hashes_done = pdata[LBC_NONCE_OFT32] - first_nonce;
	return 0;
}
// cleanup
void free_lbry(int thr_id)
{
if (!init[thr_id])
return;
cudaThreadSynchronize();
cudaFree(d_hash[thr_id]);
lbry_sha256_free(thr_id);
cuda_check_cpu_free(thr_id);
init[thr_id] = false;
cudaDeviceSynchronize();
}

3
miner.h

@ -275,6 +275,7 @@ extern int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce, @@ -275,6 +275,7 @@ extern int scanhash_fugue256(int thr_id, struct work* work, uint32_t max_nonce,
extern int scanhash_groestlcoin(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_heavy(int thr_id,struct work *work, uint32_t max_nonce, unsigned long *hashes_done, uint32_t maxvote, int blocklen);
extern int scanhash_jackpot(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_lyra2v2(int thr_id,struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@ -584,6 +585,7 @@ struct stratum_job { @@ -584,6 +585,7 @@ struct stratum_job {
unsigned char version[4];
unsigned char nbits[4];
unsigned char ntime[4];
unsigned char claim[32]; // lbry
bool clean;
unsigned char nreward[2];
uint32_t height;
@ -797,6 +799,7 @@ void heavycoin_hash(unsigned char* output, const unsigned char* input, int len); @@ -797,6 +799,7 @@ void heavycoin_hash(unsigned char* output, const unsigned char* input, int len);
void keccak256_hash(void *state, const void *input);
unsigned int jackpothash(void *state, const void *input);
void groestlhash(void *state, const void *input);
void lbry_hash(void *output, const void *input);
void lyra2re_hash(void *state, const void *input);
void lyra2v2_hash(void *state, const void *input);
void myriadhash(void *state, const void *input);

833
sph/ripemd.c

@ -0,0 +1,833 @@ @@ -0,0 +1,833 @@
/* $Id: ripemd.c 216 2010-06-08 09:46:57Z tp $ */
/*
* RIPEMD-160 implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stddef.h>
#include <string.h>
#include "sph_ripemd.h"
/*
 * Round functions for RIPEMD (original).
 */
#define F(x, y, z)   ((((y) ^ (z)) & (x)) ^ (z))
#define G(x, y, z)   (((x) & (y)) | (((x) | (y)) & (z)))
#define H(x, y, z)   ((x) ^ (y) ^ (z))

/*
 * Initial state of the original RIPEMD: four 32-bit words.
 * Fix: was declared as oIV[5] with only four initializers; the phantom
 * zero fifth element was never referenced (the state is 4 words, see the
 * 4-word feed-forward in RIPEMD_ROUND_BODY and sph_ripemd_context.val).
 */
static const sph_u32 oIV[4] = {
	SPH_C32(0x67452301), SPH_C32(0xEFCDAB89),
	SPH_C32(0x98BADCFE), SPH_C32(0x10325476)
};
/*
* Round functions for RIPEMD-128 and RIPEMD-160.
*/
#define F1(x, y, z)   ((x) ^ (y) ^ (z))                        /* xor */
#define F2(x, y, z)   ((((y) ^ (z)) & (x)) ^ (z))              /* x ? y : z */
#define F3(x, y, z)   (((x) | ~(y)) ^ (z))
#define F4(x, y, z)   ((((x) ^ (y)) & (z)) ^ (y))              /* z ? x : y */
#define F5(x, y, z)   ((x) ^ ((y) | ~(z)))

/* Initial chaining value for RIPEMD-128/160 (160 uses all five words). */
static const sph_u32 IV[5] = {
	SPH_C32(0x67452301), SPH_C32(0xEFCDAB89), SPH_C32(0x98BADCFE),
	SPH_C32(0x10325476), SPH_C32(0xC3D2E1F0)
};

#define ROTL    SPH_ROTL32
/* ===================================================================== */
/*
* RIPEMD (original hash, deprecated).
*/
/*
 * Step macros for the original RIPEMD: two parallel lines ("1" and "2")
 * that share the round functions F/G/H but use different additive
 * constants. X is the message word, s the left-rotate amount.
 */
#define FF1(A, B, C, D, X, s)   do { \
		sph_u32 tmp = SPH_T32((A) + F(B, C, D) + (X)); \
		(A) = ROTL(tmp, (s)); \
	} while (0)

#define GG1(A, B, C, D, X, s)   do { \
		sph_u32 tmp = SPH_T32((A) + G(B, C, D) \
			+ (X) + SPH_C32(0x5A827999)); \
		(A) = ROTL(tmp, (s)); \
	} while (0)

#define HH1(A, B, C, D, X, s)   do { \
		sph_u32 tmp = SPH_T32((A) + H(B, C, D) \
			+ (X) + SPH_C32(0x6ED9EBA1)); \
		(A) = ROTL(tmp, (s)); \
	} while (0)

#define FF2(A, B, C, D, X, s)   do { \
		sph_u32 tmp = SPH_T32((A) + F(B, C, D) \
			+ (X) + SPH_C32(0x50A28BE6)); \
		(A) = ROTL(tmp, (s)); \
	} while (0)

#define GG2(A, B, C, D, X, s)   do { \
		sph_u32 tmp = SPH_T32((A) + G(B, C, D) + (X)); \
		(A) = ROTL(tmp, (s)); \
	} while (0)

#define HH2(A, B, C, D, X, s)   do { \
		sph_u32 tmp = SPH_T32((A) + H(B, C, D) \
			+ (X) + SPH_C32(0x5C4DD124)); \
		(A) = ROTL(tmp, (s)); \
	} while (0)
#define RIPEMD_ROUND_BODY(in, h) do { \
sph_u32 A1, B1, C1, D1; \
sph_u32 A2, B2, C2, D2; \
sph_u32 tmp; \
\
A1 = A2 = (h)[0]; \
B1 = B2 = (h)[1]; \
C1 = C2 = (h)[2]; \
D1 = D2 = (h)[3]; \
\
FF1(A1, B1, C1, D1, in( 0), 11); \
FF1(D1, A1, B1, C1, in( 1), 14); \
FF1(C1, D1, A1, B1, in( 2), 15); \
FF1(B1, C1, D1, A1, in( 3), 12); \
FF1(A1, B1, C1, D1, in( 4), 5); \
FF1(D1, A1, B1, C1, in( 5), 8); \
FF1(C1, D1, A1, B1, in( 6), 7); \
FF1(B1, C1, D1, A1, in( 7), 9); \
FF1(A1, B1, C1, D1, in( 8), 11); \
FF1(D1, A1, B1, C1, in( 9), 13); \
FF1(C1, D1, A1, B1, in(10), 14); \
FF1(B1, C1, D1, A1, in(11), 15); \
FF1(A1, B1, C1, D1, in(12), 6); \
FF1(D1, A1, B1, C1, in(13), 7); \
FF1(C1, D1, A1, B1, in(14), 9); \
FF1(B1, C1, D1, A1, in(15), 8); \
\
GG1(A1, B1, C1, D1, in( 7), 7); \
GG1(D1, A1, B1, C1, in( 4), 6); \
GG1(C1, D1, A1, B1, in(13), 8); \
GG1(B1, C1, D1, A1, in( 1), 13); \
GG1(A1, B1, C1, D1, in(10), 11); \
GG1(D1, A1, B1, C1, in( 6), 9); \
GG1(C1, D1, A1, B1, in(15), 7); \
GG1(B1, C1, D1, A1, in( 3), 15); \
GG1(A1, B1, C1, D1, in(12), 7); \
GG1(D1, A1, B1, C1, in( 0), 12); \
GG1(C1, D1, A1, B1, in( 9), 15); \
GG1(B1, C1, D1, A1, in( 5), 9); \
GG1(A1, B1, C1, D1, in(14), 7); \
GG1(D1, A1, B1, C1, in( 2), 11); \
GG1(C1, D1, A1, B1, in(11), 13); \
GG1(B1, C1, D1, A1, in( 8), 12); \
\
HH1(A1, B1, C1, D1, in( 3), 11); \
HH1(D1, A1, B1, C1, in(10), 13); \
HH1(C1, D1, A1, B1, in( 2), 14); \
HH1(B1, C1, D1, A1, in( 4), 7); \
HH1(A1, B1, C1, D1, in( 9), 14); \
HH1(D1, A1, B1, C1, in(15), 9); \
HH1(C1, D1, A1, B1, in( 8), 13); \
HH1(B1, C1, D1, A1, in( 1), 15); \
HH1(A1, B1, C1, D1, in(14), 6); \
HH1(D1, A1, B1, C1, in( 7), 8); \
HH1(C1, D1, A1, B1, in( 0), 13); \
HH1(B1, C1, D1, A1, in( 6), 6); \
HH1(A1, B1, C1, D1, in(11), 12); \
HH1(D1, A1, B1, C1, in(13), 5); \
HH1(C1, D1, A1, B1, in( 5), 7); \
HH1(B1, C1, D1, A1, in(12), 5); \
\
FF2(A2, B2, C2, D2, in( 0), 11); \
FF2(D2, A2, B2, C2, in( 1), 14); \
FF2(C2, D2, A2, B2, in( 2), 15); \
FF2(B2, C2, D2, A2, in( 3), 12); \
FF2(A2, B2, C2, D2, in( 4), 5); \
FF2(D2, A2, B2, C2, in( 5), 8); \
FF2(C2, D2, A2, B2, in( 6), 7); \
FF2(B2, C2, D2, A2, in( 7), 9); \
FF2(A2, B2, C2, D2, in( 8), 11); \
FF2(D2, A2, B2, C2, in( 9), 13); \
FF2(C2, D2, A2, B2, in(10), 14); \
FF2(B2, C2, D2, A2, in(11), 15); \
FF2(A2, B2, C2, D2, in(12), 6); \
FF2(D2, A2, B2, C2, in(13), 7); \
FF2(C2, D2, A2, B2, in(14), 9); \
FF2(B2, C2, D2, A2, in(15), 8); \
\
GG2(A2, B2, C2, D2, in( 7), 7); \
GG2(D2, A2, B2, C2, in( 4), 6); \
GG2(C2, D2, A2, B2, in(13), 8); \
GG2(B2, C2, D2, A2, in( 1), 13); \
GG2(A2, B2, C2, D2, in(10), 11); \
GG2(D2, A2, B2, C2, in( 6), 9); \
GG2(C2, D2, A2, B2, in(15), 7); \
GG2(B2, C2, D2, A2, in( 3), 15); \
GG2(A2, B2, C2, D2, in(12), 7); \
GG2(D2, A2, B2, C2, in( 0), 12); \
GG2(C2, D2, A2, B2, in( 9), 15); \
GG2(B2, C2, D2, A2, in( 5), 9); \
GG2(A2, B2, C2, D2, in(14), 7); \
GG2(D2, A2, B2, C2, in( 2), 11); \
GG2(C2, D2, A2, B2, in(11), 13); \
GG2(B2, C2, D2, A2, in( 8), 12); \
\
HH2(A2, B2, C2, D2, in( 3), 11); \
HH2(D2, A2, B2, C2, in(10), 13); \
HH2(C2, D2, A2, B2, in( 2), 14); \
HH2(B2, C2, D2, A2, in( 4), 7); \
HH2(A2, B2, C2, D2, in( 9), 14); \
HH2(D2, A2, B2, C2, in(15), 9); \
HH2(C2, D2, A2, B2, in( 8), 13); \
HH2(B2, C2, D2, A2, in( 1), 15); \
HH2(A2, B2, C2, D2, in(14), 6); \
HH2(D2, A2, B2, C2, in( 7), 8); \
HH2(C2, D2, A2, B2, in( 0), 13); \
HH2(B2, C2, D2, A2, in( 6), 6); \
HH2(A2, B2, C2, D2, in(11), 12); \
HH2(D2, A2, B2, C2, in(13), 5); \
HH2(C2, D2, A2, B2, in( 5), 7); \
HH2(B2, C2, D2, A2, in(12), 5); \
\
tmp = SPH_T32((h)[1] + C1 + D2); \
(h)[1] = SPH_T32((h)[2] + D1 + A2); \
(h)[2] = SPH_T32((h)[3] + A1 + B2); \
(h)[3] = SPH_T32((h)[0] + B1 + C2); \
(h)[0] = tmp; \
} while (0)
/*
 * One round of RIPEMD. The data must be aligned for 32-bit access.
 */
static void
ripemd_round(const unsigned char *data, sph_u32 r[5])
{
#if SPH_LITTLE_FAST
	/* little-endian host: decode message words directly from the buffer */
#define RIPEMD_IN(x)   sph_dec32le_aligned(data + (4 * (x)))

#else
	/* otherwise decode once into a local array */
	sph_u32 X_var[16];
	int i;

	for (i = 0; i < 16; i ++)
		X_var[i] = sph_dec32le_aligned(data + 4 * i);
#define RIPEMD_IN(x)   X_var[x]
#endif
	RIPEMD_ROUND_BODY(RIPEMD_IN, r);
#undef RIPEMD_IN
}
/* see sph_ripemd.h */
void
sph_ripemd_init(void *cc)
{
	sph_ripemd_context *ctx = cc;

	/* load the standard initial chaining value, then clear the
	   running input counter */
	memcpy(ctx->val, oIV, sizeof ctx->val);
#if SPH_64
	ctx->count = 0;
#else
	ctx->count_low = 0;
	ctx->count_high = 0;
#endif
}
/*
 * md_helper.c, parameterized by RFUN/HASH/LE32, expands into the
 * generic little-endian buffering/padding machinery: it provides
 * sph_ripemd() and the internal ripemd_close() used below.
 */
#define RFUN   ripemd_round
#define HASH   ripemd
#define LE32   1
#include "md_helper.c"
#undef RFUN
#undef HASH
#undef LE32

/* see sph_ripemd.h */
void
sph_ripemd_close(void *cc, void *dst)
{
	/* pad, run the final round(s) and emit the 4-word (16-byte) digest */
	ripemd_close(cc, dst, 4);
	/* context is immediately reusable after close */
	sph_ripemd_init(cc);
}

/* see sph_ripemd.h */
void
sph_ripemd_comp(const sph_u32 msg[16], sph_u32 val[4])
{
	/* raw compression function on already-decoded message words */
#define RIPEMD_IN(x)   msg[x]
	RIPEMD_ROUND_BODY(RIPEMD_IN, val);
#undef RIPEMD_IN
}
/* ===================================================================== */
/*
 * RIPEMD-128.
 */

/*
 * Round constants for RIPEMD-128: sK1n for group n (of 16 steps) of
 * the left line, sK2n for group n of the right line.
 */
#define sK11   SPH_C32(0x00000000)
#define sK12   SPH_C32(0x5A827999)
#define sK13   SPH_C32(0x6ED9EBA1)
#define sK14   SPH_C32(0x8F1BBCDC)

#define sK21   SPH_C32(0x50A28BE6)
#define sK22   SPH_C32(0x5C4DD124)
#define sK23   SPH_C32(0x6D703EF3)
#define sK24   SPH_C32(0x00000000)

/*
 * One RIPEMD-128 step: fold message word r and constant k into 'a'
 * through boolean function f, then rotate left by s.
 */
#define sRR(a, b, c, d, f, s, r, k)   do { \
		a = ROTL(SPH_T32(a + f(b, c, d) + r + k), s); \
	} while (0)

/* expand into a step of the left (suffix 1) or right (suffix 2) line */
#define sROUND1(a, b, c, d, f, s, r, k)  \
	sRR(a ## 1, b ## 1, c ## 1, d ## 1, f, s, r, sK1 ## k)
#define sROUND2(a, b, c, d, f, s, r, k)  \
	sRR(a ## 2, b ## 2, c ## 2, d ## 2, f, s, r, sK2 ## k)
/*
* This macro defines the body for a RIPEMD-128 compression function
* implementation. The "in" parameter should evaluate, when applied to a
* numerical input parameter from 0 to 15, to an expression which yields
* the corresponding input block. The "h" parameter should evaluate to
* an array or pointer expression designating the array of 4 words which
* contains the input and output of the compression function.
*/
#define RIPEMD128_ROUND_BODY(in, h) do { \
sph_u32 A1, B1, C1, D1; \
sph_u32 A2, B2, C2, D2; \
sph_u32 tmp; \
\
A1 = A2 = (h)[0]; \
B1 = B2 = (h)[1]; \
C1 = C2 = (h)[2]; \
D1 = D2 = (h)[3]; \
\
sROUND1(A, B, C, D, F1, 11, in( 0), 1); \
sROUND1(D, A, B, C, F1, 14, in( 1), 1); \
sROUND1(C, D, A, B, F1, 15, in( 2), 1); \
sROUND1(B, C, D, A, F1, 12, in( 3), 1); \
sROUND1(A, B, C, D, F1, 5, in( 4), 1); \
sROUND1(D, A, B, C, F1, 8, in( 5), 1); \
sROUND1(C, D, A, B, F1, 7, in( 6), 1); \
sROUND1(B, C, D, A, F1, 9, in( 7), 1); \
sROUND1(A, B, C, D, F1, 11, in( 8), 1); \
sROUND1(D, A, B, C, F1, 13, in( 9), 1); \
sROUND1(C, D, A, B, F1, 14, in(10), 1); \
sROUND1(B, C, D, A, F1, 15, in(11), 1); \
sROUND1(A, B, C, D, F1, 6, in(12), 1); \
sROUND1(D, A, B, C, F1, 7, in(13), 1); \
sROUND1(C, D, A, B, F1, 9, in(14), 1); \
sROUND1(B, C, D, A, F1, 8, in(15), 1); \
\
sROUND1(A, B, C, D, F2, 7, in( 7), 2); \
sROUND1(D, A, B, C, F2, 6, in( 4), 2); \
sROUND1(C, D, A, B, F2, 8, in(13), 2); \
sROUND1(B, C, D, A, F2, 13, in( 1), 2); \
sROUND1(A, B, C, D, F2, 11, in(10), 2); \
sROUND1(D, A, B, C, F2, 9, in( 6), 2); \
sROUND1(C, D, A, B, F2, 7, in(15), 2); \
sROUND1(B, C, D, A, F2, 15, in( 3), 2); \
sROUND1(A, B, C, D, F2, 7, in(12), 2); \
sROUND1(D, A, B, C, F2, 12, in( 0), 2); \
sROUND1(C, D, A, B, F2, 15, in( 9), 2); \
sROUND1(B, C, D, A, F2, 9, in( 5), 2); \
sROUND1(A, B, C, D, F2, 11, in( 2), 2); \
sROUND1(D, A, B, C, F2, 7, in(14), 2); \
sROUND1(C, D, A, B, F2, 13, in(11), 2); \
sROUND1(B, C, D, A, F2, 12, in( 8), 2); \
\
sROUND1(A, B, C, D, F3, 11, in( 3), 3); \
sROUND1(D, A, B, C, F3, 13, in(10), 3); \
sROUND1(C, D, A, B, F3, 6, in(14), 3); \
sROUND1(B, C, D, A, F3, 7, in( 4), 3); \
sROUND1(A, B, C, D, F3, 14, in( 9), 3); \
sROUND1(D, A, B, C, F3, 9, in(15), 3); \
sROUND1(C, D, A, B, F3, 13, in( 8), 3); \
sROUND1(B, C, D, A, F3, 15, in( 1), 3); \
sROUND1(A, B, C, D, F3, 14, in( 2), 3); \
sROUND1(D, A, B, C, F3, 8, in( 7), 3); \
sROUND1(C, D, A, B, F3, 13, in( 0), 3); \
sROUND1(B, C, D, A, F3, 6, in( 6), 3); \
sROUND1(A, B, C, D, F3, 5, in(13), 3); \
sROUND1(D, A, B, C, F3, 12, in(11), 3); \
sROUND1(C, D, A, B, F3, 7, in( 5), 3); \
sROUND1(B, C, D, A, F3, 5, in(12), 3); \
\
sROUND1(A, B, C, D, F4, 11, in( 1), 4); \
sROUND1(D, A, B, C, F4, 12, in( 9), 4); \
sROUND1(C, D, A, B, F4, 14, in(11), 4); \
sROUND1(B, C, D, A, F4, 15, in(10), 4); \
sROUND1(A, B, C, D, F4, 14, in( 0), 4); \
sROUND1(D, A, B, C, F4, 15, in( 8), 4); \
sROUND1(C, D, A, B, F4, 9, in(12), 4); \
sROUND1(B, C, D, A, F4, 8, in( 4), 4); \
sROUND1(A, B, C, D, F4, 9, in(13), 4); \
sROUND1(D, A, B, C, F4, 14, in( 3), 4); \
sROUND1(C, D, A, B, F4, 5, in( 7), 4); \
sROUND1(B, C, D, A, F4, 6, in(15), 4); \
sROUND1(A, B, C, D, F4, 8, in(14), 4); \
sROUND1(D, A, B, C, F4, 6, in( 5), 4); \
sROUND1(C, D, A, B, F4, 5, in( 6), 4); \
sROUND1(B, C, D, A, F4, 12, in( 2), 4); \
\
sROUND2(A, B, C, D, F4, 8, in( 5), 1); \
sROUND2(D, A, B, C, F4, 9, in(14), 1); \
sROUND2(C, D, A, B, F4, 9, in( 7), 1); \
sROUND2(B, C, D, A, F4, 11, in( 0), 1); \
sROUND2(A, B, C, D, F4, 13, in( 9), 1); \
sROUND2(D, A, B, C, F4, 15, in( 2), 1); \
sROUND2(C, D, A, B, F4, 15, in(11), 1); \
sROUND2(B, C, D, A, F4, 5, in( 4), 1); \
sROUND2(A, B, C, D, F4, 7, in(13), 1); \
sROUND2(D, A, B, C, F4, 7, in( 6), 1); \
sROUND2(C, D, A, B, F4, 8, in(15), 1); \
sROUND2(B, C, D, A, F4, 11, in( 8), 1); \
sROUND2(A, B, C, D, F4, 14, in( 1), 1); \
sROUND2(D, A, B, C, F4, 14, in(10), 1); \
sROUND2(C, D, A, B, F4, 12, in( 3), 1); \
sROUND2(B, C, D, A, F4, 6, in(12), 1); \
\
sROUND2(A, B, C, D, F3, 9, in( 6), 2); \
sROUND2(D, A, B, C, F3, 13, in(11), 2); \
sROUND2(C, D, A, B, F3, 15, in( 3), 2); \
sROUND2(B, C, D, A, F3, 7, in( 7), 2); \
sROUND2(A, B, C, D, F3, 12, in( 0), 2); \
sROUND2(D, A, B, C, F3, 8, in(13), 2); \
sROUND2(C, D, A, B, F3, 9, in( 5), 2); \
sROUND2(B, C, D, A, F3, 11, in(10), 2); \
sROUND2(A, B, C, D, F3, 7, in(14), 2); \
sROUND2(D, A, B, C, F3, 7, in(15), 2); \
sROUND2(C, D, A, B, F3, 12, in( 8), 2); \
sROUND2(B, C, D, A, F3, 7, in(12), 2); \
sROUND2(A, B, C, D, F3, 6, in( 4), 2); \
sROUND2(D, A, B, C, F3, 15, in( 9), 2); \
sROUND2(C, D, A, B, F3, 13, in( 1), 2); \
sROUND2(B, C, D, A, F3, 11, in( 2), 2); \
\
sROUND2(A, B, C, D, F2, 9, in(15), 3); \
sROUND2(D, A, B, C, F2, 7, in( 5), 3); \
sROUND2(C, D, A, B, F2, 15, in( 1), 3); \
sROUND2(B, C, D, A, F2, 11, in( 3), 3); \
sROUND2(A, B, C, D, F2, 8, in( 7), 3); \
sROUND2(D, A, B, C, F2, 6, in(14), 3); \
sROUND2(C, D, A, B, F2, 6, in( 6), 3); \
sROUND2(B, C, D, A, F2, 14, in( 9), 3); \
sROUND2(A, B, C, D, F2, 12, in(11), 3); \
sROUND2(D, A, B, C, F2, 13, in( 8), 3); \
sROUND2(C, D, A, B, F2, 5, in(12), 3); \
sROUND2(B, C, D, A, F2, 14, in( 2), 3); \
sROUND2(A, B, C, D, F2, 13, in(10), 3); \
sROUND2(D, A, B, C, F2, 13, in( 0), 3); \
sROUND2(C, D, A, B, F2, 7, in( 4), 3); \
sROUND2(B, C, D, A, F2, 5, in(13), 3); \
\
sROUND2(A, B, C, D, F1, 15, in( 8), 4); \
sROUND2(D, A, B, C, F1, 5, in( 6), 4); \
sROUND2(C, D, A, B, F1, 8, in( 4), 4); \
sROUND2(B, C, D, A, F1, 11, in( 1), 4); \
sROUND2(A, B, C, D, F1, 14, in( 3), 4); \
sROUND2(D, A, B, C, F1, 14, in(11), 4); \
sROUND2(C, D, A, B, F1, 6, in(15), 4); \
sROUND2(B, C, D, A, F1, 14, in( 0), 4); \
sROUND2(A, B, C, D, F1, 6, in( 5), 4); \
sROUND2(D, A, B, C, F1, 9, in(12), 4); \
sROUND2(C, D, A, B, F1, 12, in( 2), 4); \
sROUND2(B, C, D, A, F1, 9, in(13), 4); \
sROUND2(A, B, C, D, F1, 12, in( 9), 4); \
sROUND2(D, A, B, C, F1, 5, in( 7), 4); \
sROUND2(C, D, A, B, F1, 15, in(10), 4); \
sROUND2(B, C, D, A, F1, 8, in(14), 4); \
\
tmp = SPH_T32((h)[1] + C1 + D2); \
(h)[1] = SPH_T32((h)[2] + D1 + A2); \
(h)[2] = SPH_T32((h)[3] + A1 + B2); \
(h)[3] = SPH_T32((h)[0] + B1 + C2); \
(h)[0] = tmp; \
} while (0)
/*
 * One round of RIPEMD-128. The data must be aligned for 32-bit access.
 */
static void
ripemd128_round(const unsigned char *data, sph_u32 r[5])
{
#if SPH_LITTLE_FAST
	/* little-endian target: decode each message word on the fly */
#define RIPEMD128_IN(x)   sph_dec32le_aligned(data + (4 * (x)))
#else
	/* other targets: pre-decode the sixteen 32-bit words once */
	sph_u32 X_var[16];
	int i;

	for (i = 0; i < 16; i ++)
		X_var[i] = sph_dec32le_aligned(data + 4 * i);
#define RIPEMD128_IN(x)   X_var[x]
#endif
	/* NOTE: RIPEMD-128 has a 4-word state; the round body only
	   touches r[0..3] even though the parameter is declared r[5] */
	RIPEMD128_ROUND_BODY(RIPEMD128_IN, r);
#undef RIPEMD128_IN
}
/* see sph_ripemd.h */
void
sph_ripemd128_init(void *cc)
{
	sph_ripemd128_context *ctx = cc;

	/* load the standard initial chaining value (sizeof ctx->val
	   limits the copy to the four RIPEMD-128 words), then clear
	   the running input counter */
	memcpy(ctx->val, IV, sizeof ctx->val);
#if SPH_64
	ctx->count = 0;
#else
	ctx->count_low = 0;
	ctx->count_high = 0;
#endif
}
/*
 * md_helper.c, parameterized by RFUN/HASH/LE32, expands into the
 * generic little-endian buffering/padding machinery: it provides
 * sph_ripemd128() and the internal ripemd128_close() used below.
 */
#define RFUN   ripemd128_round
#define HASH   ripemd128
#define LE32   1
#include "md_helper.c"
#undef RFUN
#undef HASH
#undef LE32

/* see sph_ripemd.h */
void
sph_ripemd128_close(void *cc, void *dst)
{
	/* pad, run the final round(s) and emit the 4-word (16-byte) digest */
	ripemd128_close(cc, dst, 4);
	/* context is immediately reusable after close */
	sph_ripemd128_init(cc);
}

/* see sph_ripemd.h */
void
sph_ripemd128_comp(const sph_u32 msg[16], sph_u32 val[4])
{
	/* raw compression function on already-decoded message words */
#define RIPEMD128_IN(x)   msg[x]
	RIPEMD128_ROUND_BODY(RIPEMD128_IN, val);
#undef RIPEMD128_IN
}
/* ===================================================================== */
/*
 * RIPEMD-160.
 */

/*
 * Round constants for RIPEMD-160: K1n for group n (of 16 steps) of the
 * left line, K2n for group n of the right line.
 */
#define K11    SPH_C32(0x00000000)
#define K12    SPH_C32(0x5A827999)
#define K13    SPH_C32(0x6ED9EBA1)
#define K14    SPH_C32(0x8F1BBCDC)
#define K15    SPH_C32(0xA953FD4E)

#define K21    SPH_C32(0x50A28BE6)
#define K22    SPH_C32(0x5C4DD124)
#define K23    SPH_C32(0x6D703EF3)
#define K24    SPH_C32(0x7A6D76E9)
#define K25    SPH_C32(0x00000000)

/*
 * One RIPEMD-160 step: fold message word r and constant k into 'a'
 * through boolean function f, rotate left by s and add e; 'c' is
 * additionally rotated by 10 (the extra twist versus RIPEMD-128).
 */
#define RR(a, b, c, d, e, f, s, r, k)   do { \
		a = SPH_T32(ROTL(SPH_T32(a + f(b, c, d) + r + k), s) + e); \
		c = ROTL(c, 10); \
	} while (0)

/* expand into a step of the left (suffix 1) or right (suffix 2) line */
#define ROUND1(a, b, c, d, e, f, s, r, k)  \
	RR(a ## 1, b ## 1, c ## 1, d ## 1, e ## 1, f, s, r, K1 ## k)
#define ROUND2(a, b, c, d, e, f, s, r, k)  \
	RR(a ## 2, b ## 2, c ## 2, d ## 2, e ## 2, f, s, r, K2 ## k)
/*
* This macro defines the body for a RIPEMD-160 compression function
* implementation. The "in" parameter should evaluate, when applied to a
* numerical input parameter from 0 to 15, to an expression which yields
* the corresponding input block. The "h" parameter should evaluate to
* an array or pointer expression designating the array of 5 words which
* contains the input and output of the compression function.
*/
#define RIPEMD160_ROUND_BODY(in, h) do { \
sph_u32 A1, B1, C1, D1, E1; \
sph_u32 A2, B2, C2, D2, E2; \
sph_u32 tmp; \
\
A1 = A2 = (h)[0]; \
B1 = B2 = (h)[1]; \
C1 = C2 = (h)[2]; \
D1 = D2 = (h)[3]; \
E1 = E2 = (h)[4]; \
\
ROUND1(A, B, C, D, E, F1, 11, in( 0), 1); \
ROUND1(E, A, B, C, D, F1, 14, in( 1), 1); \
ROUND1(D, E, A, B, C, F1, 15, in( 2), 1); \
ROUND1(C, D, E, A, B, F1, 12, in( 3), 1); \
ROUND1(B, C, D, E, A, F1, 5, in( 4), 1); \
ROUND1(A, B, C, D, E, F1, 8, in( 5), 1); \
ROUND1(E, A, B, C, D, F1, 7, in( 6), 1); \
ROUND1(D, E, A, B, C, F1, 9, in( 7), 1); \
ROUND1(C, D, E, A, B, F1, 11, in( 8), 1); \
ROUND1(B, C, D, E, A, F1, 13, in( 9), 1); \
ROUND1(A, B, C, D, E, F1, 14, in(10), 1); \
ROUND1(E, A, B, C, D, F1, 15, in(11), 1); \
ROUND1(D, E, A, B, C, F1, 6, in(12), 1); \
ROUND1(C, D, E, A, B, F1, 7, in(13), 1); \
ROUND1(B, C, D, E, A, F1, 9, in(14), 1); \
ROUND1(A, B, C, D, E, F1, 8, in(15), 1); \
\
ROUND1(E, A, B, C, D, F2, 7, in( 7), 2); \
ROUND1(D, E, A, B, C, F2, 6, in( 4), 2); \
ROUND1(C, D, E, A, B, F2, 8, in(13), 2); \
ROUND1(B, C, D, E, A, F2, 13, in( 1), 2); \
ROUND1(A, B, C, D, E, F2, 11, in(10), 2); \
ROUND1(E, A, B, C, D, F2, 9, in( 6), 2); \
ROUND1(D, E, A, B, C, F2, 7, in(15), 2); \
ROUND1(C, D, E, A, B, F2, 15, in( 3), 2); \
ROUND1(B, C, D, E, A, F2, 7, in(12), 2); \
ROUND1(A, B, C, D, E, F2, 12, in( 0), 2); \
ROUND1(E, A, B, C, D, F2, 15, in( 9), 2); \
ROUND1(D, E, A, B, C, F2, 9, in( 5), 2); \
ROUND1(C, D, E, A, B, F2, 11, in( 2), 2); \
ROUND1(B, C, D, E, A, F2, 7, in(14), 2); \
ROUND1(A, B, C, D, E, F2, 13, in(11), 2); \
ROUND1(E, A, B, C, D, F2, 12, in( 8), 2); \
\
ROUND1(D, E, A, B, C, F3, 11, in( 3), 3); \
ROUND1(C, D, E, A, B, F3, 13, in(10), 3); \
ROUND1(B, C, D, E, A, F3, 6, in(14), 3); \
ROUND1(A, B, C, D, E, F3, 7, in( 4), 3); \
ROUND1(E, A, B, C, D, F3, 14, in( 9), 3); \
ROUND1(D, E, A, B, C, F3, 9, in(15), 3); \
ROUND1(C, D, E, A, B, F3, 13, in( 8), 3); \
ROUND1(B, C, D, E, A, F3, 15, in( 1), 3); \
ROUND1(A, B, C, D, E, F3, 14, in( 2), 3); \
ROUND1(E, A, B, C, D, F3, 8, in( 7), 3); \
ROUND1(D, E, A, B, C, F3, 13, in( 0), 3); \
ROUND1(C, D, E, A, B, F3, 6, in( 6), 3); \
ROUND1(B, C, D, E, A, F3, 5, in(13), 3); \
ROUND1(A, B, C, D, E, F3, 12, in(11), 3); \
ROUND1(E, A, B, C, D, F3, 7, in( 5), 3); \
ROUND1(D, E, A, B, C, F3, 5, in(12), 3); \
\
ROUND1(C, D, E, A, B, F4, 11, in( 1), 4); \
ROUND1(B, C, D, E, A, F4, 12, in( 9), 4); \
ROUND1(A, B, C, D, E, F4, 14, in(11), 4); \
ROUND1(E, A, B, C, D, F4, 15, in(10), 4); \
ROUND1(D, E, A, B, C, F4, 14, in( 0), 4); \
ROUND1(C, D, E, A, B, F4, 15, in( 8), 4); \
ROUND1(B, C, D, E, A, F4, 9, in(12), 4); \
ROUND1(A, B, C, D, E, F4, 8, in( 4), 4); \
ROUND1(E, A, B, C, D, F4, 9, in(13), 4); \
ROUND1(D, E, A, B, C, F4, 14, in( 3), 4); \
ROUND1(C, D, E, A, B, F4, 5, in( 7), 4); \
ROUND1(B, C, D, E, A, F4, 6, in(15), 4); \
ROUND1(A, B, C, D, E, F4, 8, in(14), 4); \
ROUND1(E, A, B, C, D, F4, 6, in( 5), 4); \
ROUND1(D, E, A, B, C, F4, 5, in( 6), 4); \
ROUND1(C, D, E, A, B, F4, 12, in( 2), 4); \
\
ROUND1(B, C, D, E, A, F5, 9, in( 4), 5); \
ROUND1(A, B, C, D, E, F5, 15, in( 0), 5); \
ROUND1(E, A, B, C, D, F5, 5, in( 5), 5); \
ROUND1(D, E, A, B, C, F5, 11, in( 9), 5); \
ROUND1(C, D, E, A, B, F5, 6, in( 7), 5); \
ROUND1(B, C, D, E, A, F5, 8, in(12), 5); \
ROUND1(A, B, C, D, E, F5, 13, in( 2), 5); \
ROUND1(E, A, B, C, D, F5, 12, in(10), 5); \
ROUND1(D, E, A, B, C, F5, 5, in(14), 5); \
ROUND1(C, D, E, A, B, F5, 12, in( 1), 5); \
ROUND1(B, C, D, E, A, F5, 13, in( 3), 5); \
ROUND1(A, B, C, D, E, F5, 14, in( 8), 5); \
ROUND1(E, A, B, C, D, F5, 11, in(11), 5); \
ROUND1(D, E, A, B, C, F5, 8, in( 6), 5); \
ROUND1(C, D, E, A, B, F5, 5, in(15), 5); \
ROUND1(B, C, D, E, A, F5, 6, in(13), 5); \
\
ROUND2(A, B, C, D, E, F5, 8, in( 5), 1); \
ROUND2(E, A, B, C, D, F5, 9, in(14), 1); \
ROUND2(D, E, A, B, C, F5, 9, in( 7), 1); \
ROUND2(C, D, E, A, B, F5, 11, in( 0), 1); \
ROUND2(B, C, D, E, A, F5, 13, in( 9), 1); \
ROUND2(A, B, C, D, E, F5, 15, in( 2), 1); \
ROUND2(E, A, B, C, D, F5, 15, in(11), 1); \
ROUND2(D, E, A, B, C, F5, 5, in( 4), 1); \
ROUND2(C, D, E, A, B, F5, 7, in(13), 1); \
ROUND2(B, C, D, E, A, F5, 7, in( 6), 1); \
ROUND2(A, B, C, D, E, F5, 8, in(15), 1); \
ROUND2(E, A, B, C, D, F5, 11, in( 8), 1); \
ROUND2(D, E, A, B, C, F5, 14, in( 1), 1); \
ROUND2(C, D, E, A, B, F5, 14, in(10), 1); \
ROUND2(B, C, D, E, A, F5, 12, in( 3), 1); \
ROUND2(A, B, C, D, E, F5, 6, in(12), 1); \
\
ROUND2(E, A, B, C, D, F4, 9, in( 6), 2); \
ROUND2(D, E, A, B, C, F4, 13, in(11), 2); \
ROUND2(C, D, E, A, B, F4, 15, in( 3), 2); \
ROUND2(B, C, D, E, A, F4, 7, in( 7), 2); \
ROUND2(A, B, C, D, E, F4, 12, in( 0), 2); \
ROUND2(E, A, B, C, D, F4, 8, in(13), 2); \
ROUND2(D, E, A, B, C, F4, 9, in( 5), 2); \
ROUND2(C, D, E, A, B, F4, 11, in(10), 2); \
ROUND2(B, C, D, E, A, F4, 7, in(14), 2); \
ROUND2(A, B, C, D, E, F4, 7, in(15), 2); \
ROUND2(E, A, B, C, D, F4, 12, in( 8), 2); \
ROUND2(D, E, A, B, C, F4, 7, in(12), 2); \
ROUND2(C, D, E, A, B, F4, 6, in( 4), 2); \
ROUND2(B, C, D, E, A, F4, 15, in( 9), 2); \
ROUND2(A, B, C, D, E, F4, 13, in( 1), 2); \
ROUND2(E, A, B, C, D, F4, 11, in( 2), 2); \
\
ROUND2(D, E, A, B, C, F3, 9, in(15), 3); \
ROUND2(C, D, E, A, B, F3, 7, in( 5), 3); \
ROUND2(B, C, D, E, A, F3, 15, in( 1), 3); \
ROUND2(A, B, C, D, E, F3, 11, in( 3), 3); \
ROUND2(E, A, B, C, D, F3, 8, in( 7), 3); \
ROUND2(D, E, A, B, C, F3, 6, in(14), 3); \
ROUND2(C, D, E, A, B, F3, 6, in( 6), 3); \
ROUND2(B, C, D, E, A, F3, 14, in( 9), 3); \
ROUND2(A, B, C, D, E, F3, 12, in(11), 3); \
ROUND2(E, A, B, C, D, F3, 13, in( 8), 3); \
ROUND2(D, E, A, B, C, F3, 5, in(12), 3); \
ROUND2(C, D, E, A, B, F3, 14, in( 2), 3); \
ROUND2(B, C, D, E, A, F3, 13, in(10), 3); \
ROUND2(A, B, C, D, E, F3, 13, in( 0), 3); \
ROUND2(E, A, B, C, D, F3, 7, in( 4), 3); \
ROUND2(D, E, A, B, C, F3, 5, in(13), 3); \
\
ROUND2(C, D, E, A, B, F2, 15, in( 8), 4); \
ROUND2(B, C, D, E, A, F2, 5, in( 6), 4); \
ROUND2(A, B, C, D, E, F2, 8, in( 4), 4); \
ROUND2(E, A, B, C, D, F2, 11, in( 1), 4); \
ROUND2(D, E, A, B, C, F2, 14, in( 3), 4); \
ROUND2(C, D, E, A, B, F2, 14, in(11), 4); \
ROUND2(B, C, D, E, A, F2, 6, in(15), 4); \
ROUND2(A, B, C, D, E, F2, 14, in( 0), 4); \
ROUND2(E, A, B, C, D, F2, 6, in( 5), 4); \
ROUND2(D, E, A, B, C, F2, 9, in(12), 4); \
ROUND2(C, D, E, A, B, F2, 12, in( 2), 4); \
ROUND2(B, C, D, E, A, F2, 9, in(13), 4); \
ROUND2(A, B, C, D, E, F2, 12, in( 9), 4); \
ROUND2(E, A, B, C, D, F2, 5, in( 7), 4); \
ROUND2(D, E, A, B, C, F2, 15, in(10), 4); \
ROUND2(C, D, E, A, B, F2, 8, in(14), 4); \
\
ROUND2(B, C, D, E, A, F1, 8, in(12), 5); \
ROUND2(A, B, C, D, E, F1, 5, in(15), 5); \
ROUND2(E, A, B, C, D, F1, 12, in(10), 5); \
ROUND2(D, E, A, B, C, F1, 9, in( 4), 5); \
ROUND2(C, D, E, A, B, F1, 12, in( 1), 5); \
ROUND2(B, C, D, E, A, F1, 5, in( 5), 5); \
ROUND2(A, B, C, D, E, F1, 14, in( 8), 5); \
ROUND2(E, A, B, C, D, F1, 6, in( 7), 5); \
ROUND2(D, E, A, B, C, F1, 8, in( 6), 5); \
ROUND2(C, D, E, A, B, F1, 13, in( 2), 5); \
ROUND2(B, C, D, E, A, F1, 6, in(13), 5); \
ROUND2(A, B, C, D, E, F1, 5, in(14), 5); \
ROUND2(E, A, B, C, D, F1, 15, in( 0), 5); \
ROUND2(D, E, A, B, C, F1, 13, in( 3), 5); \
ROUND2(C, D, E, A, B, F1, 11, in( 9), 5); \
ROUND2(B, C, D, E, A, F1, 11, in(11), 5); \
\
tmp = SPH_T32((h)[1] + C1 + D2); \
(h)[1] = SPH_T32((h)[2] + D1 + E2); \
(h)[2] = SPH_T32((h)[3] + E1 + A2); \
(h)[3] = SPH_T32((h)[4] + A1 + B2); \
(h)[4] = SPH_T32((h)[0] + B1 + C2); \
(h)[0] = tmp; \
} while (0)
/*
 * One round of RIPEMD-160. The data must be aligned for 32-bit access.
 */
static void
ripemd160_round(const unsigned char *data, sph_u32 r[5])
{
#if SPH_LITTLE_FAST
	/* little-endian target: decode each message word on the fly */
#define RIPEMD160_IN(x)   sph_dec32le_aligned(data + (4 * (x)))
#else
	/* other targets: pre-decode the sixteen 32-bit words once */
	sph_u32 X_var[16];
	int i;

	for (i = 0; i < 16; i ++)
		X_var[i] = sph_dec32le_aligned(data + 4 * i);
#define RIPEMD160_IN(x)   X_var[x]
#endif
	/* updates the full 5-word RIPEMD-160 state r[0..4] */
	RIPEMD160_ROUND_BODY(RIPEMD160_IN, r);
#undef RIPEMD160_IN
}
/* see sph_ripemd.h */
void
sph_ripemd160_init(void *cc)
{
	sph_ripemd160_context *ctx = cc;

	/* load the standard initial chaining value (all five words),
	   then clear the running input counter */
	memcpy(ctx->val, IV, sizeof ctx->val);
#if SPH_64
	ctx->count = 0;
#else
	ctx->count_low = 0;
	ctx->count_high = 0;
#endif
}
/*
 * md_helper.c, parameterized by RFUN/HASH/LE32, expands into the
 * generic little-endian buffering/padding machinery: it provides
 * sph_ripemd160() and the internal ripemd160_close() used below.
 */
#define RFUN   ripemd160_round
#define HASH   ripemd160
#define LE32   1
#include "md_helper.c"
#undef RFUN
#undef HASH
#undef LE32

/* see sph_ripemd.h */
void
sph_ripemd160_close(void *cc, void *dst)
{
	/* pad, run the final round(s) and emit the 5-word (20-byte) digest */
	ripemd160_close(cc, dst, 5);
	/* context is immediately reusable after close */
	sph_ripemd160_init(cc);
}

/* see sph_ripemd.h */
void
sph_ripemd160_comp(const sph_u32 msg[16], sph_u32 val[5])
{
	/* raw compression function on already-decoded message words */
#define RIPEMD160_IN(x)   msg[x]
	RIPEMD160_ROUND_BODY(RIPEMD160_IN, val);
#undef RIPEMD160_IN
}

273
sph/sph_ripemd.h

@ -0,0 +1,273 @@ @@ -0,0 +1,273 @@
/* $Id: sph_ripemd.h 216 2010-06-08 09:46:57Z tp $ */
/**
* RIPEMD, RIPEMD-128 and RIPEMD-160 interface.
*
* RIPEMD was first described in: Research and Development in Advanced
* Communication Technologies in Europe, "RIPE Integrity Primitives:
* Final Report of RACE Integrity Primitives Evaluation (R1040)", RACE,
* June 1992.
*
* A new, strengthened version, dubbed RIPEMD-160, was published in: H.
* Dobbertin, A. Bosselaers, and B. Preneel, "RIPEMD-160, a strengthened
* version of RIPEMD", Fast Software Encryption - FSE'96, LNCS 1039,
* Springer (1996), pp. 71--82.
*
* This article describes both RIPEMD-160, with a 160-bit output, and a
* reduced version called RIPEMD-128, which has a 128-bit output. RIPEMD-128
* was meant as a "drop-in" replacement for any hash function with 128-bit
* output, especially the original RIPEMD.
*
* @warning Collisions, and an efficient method to build other collisions,
* have been published for the original RIPEMD, which is thus considered as
* cryptographically broken. It is also very rarely encountered, and there
* seems to exist no free description or implementation of RIPEMD (except
 * the sphlib code, of course). As of January 2007, RIPEMD-128 and RIPEMD-160
* seem as secure as their output length allows.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @file sph_ripemd.h
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#ifndef SPH_RIPEMD_H__
#define SPH_RIPEMD_H__
#include <stddef.h>
#include "sph_types.h"
/**
* Output size (in bits) for RIPEMD.
*/
#define SPH_SIZE_ripemd 128
/**
* Output size (in bits) for RIPEMD-128.
*/
#define SPH_SIZE_ripemd128 128
/**
* Output size (in bits) for RIPEMD-160.
*/
#define SPH_SIZE_ripemd160 160
/**
* This structure is a context for RIPEMD computations: it contains the
* intermediate values and some data from the last entered block. Once
* a RIPEMD computation has been performed, the context can be reused for
* another computation.
*
* The contents of this structure are private. A running RIPEMD computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
	unsigned char buf[64];    /* first field, for alignment */
	sph_u32 val[4];           /* current 128-bit chaining value */
#if SPH_64
	sph_u64 count;            /* input counter (maintained by the update code) */
#else
	sph_u32 count_high, count_low;    /* input counter, split in two words */
#endif
#endif
} sph_ripemd_context;
/**
* Initialize a RIPEMD context. This process performs no memory allocation.
*
* @param cc the RIPEMD context (pointer to
* a <code>sph_ripemd_context</code>)
*/
void sph_ripemd_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the RIPEMD context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_ripemd(void *cc, const void *data, size_t len);
/**
* Terminate the current RIPEMD computation and output the result into the
* provided buffer. The destination buffer must be wide enough to
 * accommodate the result (16 bytes). The context is automatically
* reinitialized.
*
* @param cc the RIPEMD context
* @param dst the destination buffer
*/
void sph_ripemd_close(void *cc, void *dst);
/**
* Apply the RIPEMD compression function on the provided data. The
* <code>msg</code> parameter contains the 16 32-bit input blocks,
* as numerical values (hence after the little-endian decoding). The
 * <code>val</code> parameter contains the 4 32-bit state words for
* the compression function; the output is written in place in this
* array.
*
* @param msg the message block (16 values)
* @param val the function 128-bit input and output
*/
void sph_ripemd_comp(const sph_u32 msg[16], sph_u32 val[4]);
/* ===================================================================== */
/**
* This structure is a context for RIPEMD-128 computations: it contains the
* intermediate values and some data from the last entered block. Once
* a RIPEMD-128 computation has been performed, the context can be reused for
* another computation.
*
* The contents of this structure are private. A running RIPEMD-128 computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
	unsigned char buf[64];    /* first field, for alignment */
	sph_u32 val[4];           /* current 128-bit chaining value */
#if SPH_64
	sph_u64 count;            /* input counter (maintained by the update code) */
#else
	sph_u32 count_high, count_low;    /* input counter, split in two words */
#endif
#endif
} sph_ripemd128_context;
/**
* Initialize a RIPEMD-128 context. This process performs no memory allocation.
*
* @param cc the RIPEMD-128 context (pointer to
* a <code>sph_ripemd128_context</code>)
*/
void sph_ripemd128_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the RIPEMD-128 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_ripemd128(void *cc, const void *data, size_t len);
/**
* Terminate the current RIPEMD-128 computation and output the result into the
* provided buffer. The destination buffer must be wide enough to
 * accommodate the result (16 bytes). The context is automatically
* reinitialized.
*
* @param cc the RIPEMD-128 context
* @param dst the destination buffer
*/
void sph_ripemd128_close(void *cc, void *dst);
/**
* Apply the RIPEMD-128 compression function on the provided data. The
* <code>msg</code> parameter contains the 16 32-bit input blocks,
* as numerical values (hence after the little-endian decoding). The
 * <code>val</code> parameter contains the 4 32-bit state words for
* the compression function; the output is written in place in this
* array.
*
* @param msg the message block (16 values)
* @param val the function 128-bit input and output
*/
void sph_ripemd128_comp(const sph_u32 msg[16], sph_u32 val[4]);
/* ===================================================================== */
/**
* This structure is a context for RIPEMD-160 computations: it contains the
* intermediate values and some data from the last entered block. Once
* a RIPEMD-160 computation has been performed, the context can be reused for
* another computation.
*
* The contents of this structure are private. A running RIPEMD-160 computation
* can be cloned by copying the context (e.g. with a simple
* <code>memcpy()</code>).
*/
typedef struct {
#ifndef DOXYGEN_IGNORE
	unsigned char buf[64];    /* first field, for alignment */
	sph_u32 val[5];           /* current 160-bit chaining value */
#if SPH_64
	sph_u64 count;            /* input counter (maintained by the update code) */
#else
	sph_u32 count_high, count_low;    /* input counter, split in two words */
#endif
#endif
} sph_ripemd160_context;
/**
* Initialize a RIPEMD-160 context. This process performs no memory allocation.
*
* @param cc the RIPEMD-160 context (pointer to
* a <code>sph_ripemd160_context</code>)
*/
void sph_ripemd160_init(void *cc);
/**
* Process some data bytes. It is acceptable that <code>len</code> is zero
* (in which case this function does nothing).
*
* @param cc the RIPEMD-160 context
* @param data the input data
* @param len the input data length (in bytes)
*/
void sph_ripemd160(void *cc, const void *data, size_t len);
/**
* Terminate the current RIPEMD-160 computation and output the result into the
* provided buffer. The destination buffer must be wide enough to
 * accommodate the result (20 bytes). The context is automatically
* reinitialized.
*
* @param cc the RIPEMD-160 context
* @param dst the destination buffer
*/
void sph_ripemd160_close(void *cc, void *dst);
/**
* Apply the RIPEMD-160 compression function on the provided data. The
* <code>msg</code> parameter contains the 16 32-bit input blocks,
* as numerical values (hence after the little-endian decoding). The
* <code>val</code> parameter contains the 5 32-bit input blocks for
* the compression function; the output is written in place in this
* array.
*
* @param msg the message block (16 values)
* @param val the function 160-bit input and output
*/
void sph_ripemd160_comp(const sph_u32 msg[16], sph_u32 val[5]);
#endif

691
sph/sph_sha2.c

@ -0,0 +1,691 @@ @@ -0,0 +1,691 @@
/* $Id: sha2.c 227 2010-06-16 17:28:38Z tp $ */
/*
* SHA-224 / SHA-256 implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2007-2010 Projet RNRT SAPHIR
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*
* @author Thomas Pornin <thomas.pornin@cryptolog.com>
*/
#include <stddef.h>
#include <string.h>
#include "sph_sha2.h"
#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_SHA2
#define SPH_SMALL_FOOTPRINT_SHA2   1
#endif

/*
 * FIPS 180-4 logical functions, in reduced-operation form:
 * CH(X,Y,Z) is equivalent to (X & Y) ^ (~X & Z), and MAJ(X,Y,Z) to
 * the bitwise majority of the three inputs.
 */
#define CH(X, Y, Z)    ((((Y) ^ (Z)) & (X)) ^ (Z))
#define MAJ(X, Y, Z)   (((Y) & (Z)) | (((Y) | (Z)) & (X)))

#define ROTR    SPH_ROTR32

/* the SHA-256 "big Sigma" (rounds) and "small sigma" (schedule) functions */
#define BSG2_0(x)      (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define BSG2_1(x)      (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
#define SSG2_0(x)      (ROTR(x, 7) ^ ROTR(x, 18) ^ SPH_T32((x) >> 3))
#define SSG2_1(x)      (ROTR(x, 17) ^ ROTR(x, 19) ^ SPH_T32((x) >> 10))

/* initial hash values for SHA-224 (FIPS 180-4, section 5.3.2) */
static const sph_u32 H224[8] = {
	SPH_C32(0xC1059ED8), SPH_C32(0x367CD507), SPH_C32(0x3070DD17),
	SPH_C32(0xF70E5939), SPH_C32(0xFFC00B31), SPH_C32(0x68581511),
	SPH_C32(0x64F98FA7), SPH_C32(0xBEFA4FA4)
};

/* initial hash values for SHA-256 (FIPS 180-4, section 5.3.3) */
static const sph_u32 H256[8] = {
	SPH_C32(0x6A09E667), SPH_C32(0xBB67AE85), SPH_C32(0x3C6EF372),
	SPH_C32(0xA54FF53A), SPH_C32(0x510E527F), SPH_C32(0x9B05688C),
	SPH_C32(0x1F83D9AB), SPH_C32(0x5BE0CD19)
};
/*
* The SHA2_ROUND_BODY defines the body for a SHA-224 / SHA-256
* compression function implementation. The "in" parameter should
* evaluate, when applied to a numerical input parameter from 0 to 15,
* to an expression which yields the corresponding input block. The "r"
* parameter should evaluate to an array or pointer expression
* designating the array of 8 words which contains the input and output
* of the compression function.
*/
#if SPH_SMALL_FOOTPRINT_SHA2

/* SHA-224/256 round constants K[0..63] (FIPS 180-4, section 4.2.2) */
static const sph_u32 K[64] = {
	SPH_C32(0x428A2F98), SPH_C32(0x71374491),
	SPH_C32(0xB5C0FBCF), SPH_C32(0xE9B5DBA5),
	SPH_C32(0x3956C25B), SPH_C32(0x59F111F1),
	SPH_C32(0x923F82A4), SPH_C32(0xAB1C5ED5),
	SPH_C32(0xD807AA98), SPH_C32(0x12835B01),
	SPH_C32(0x243185BE), SPH_C32(0x550C7DC3),
	SPH_C32(0x72BE5D74), SPH_C32(0x80DEB1FE),
	SPH_C32(0x9BDC06A7), SPH_C32(0xC19BF174),
	SPH_C32(0xE49B69C1), SPH_C32(0xEFBE4786),
	SPH_C32(0x0FC19DC6), SPH_C32(0x240CA1CC),
	SPH_C32(0x2DE92C6F), SPH_C32(0x4A7484AA),
	SPH_C32(0x5CB0A9DC), SPH_C32(0x76F988DA),
	SPH_C32(0x983E5152), SPH_C32(0xA831C66D),
	SPH_C32(0xB00327C8), SPH_C32(0xBF597FC7),
	SPH_C32(0xC6E00BF3), SPH_C32(0xD5A79147),
	SPH_C32(0x06CA6351), SPH_C32(0x14292967),
	SPH_C32(0x27B70A85), SPH_C32(0x2E1B2138),
	SPH_C32(0x4D2C6DFC), SPH_C32(0x53380D13),
	SPH_C32(0x650A7354), SPH_C32(0x766A0ABB),
	SPH_C32(0x81C2C92E), SPH_C32(0x92722C85),
	SPH_C32(0xA2BFE8A1), SPH_C32(0xA81A664B),
	SPH_C32(0xC24B8B70), SPH_C32(0xC76C51A3),
	SPH_C32(0xD192E819), SPH_C32(0xD6990624),
	SPH_C32(0xF40E3585), SPH_C32(0x106AA070),
	SPH_C32(0x19A4C116), SPH_C32(0x1E376C08),
	SPH_C32(0x2748774C), SPH_C32(0x34B0BCB5),
	SPH_C32(0x391C0CB3), SPH_C32(0x4ED8AA4A),
	SPH_C32(0x5B9CCA4F), SPH_C32(0x682E6FF3),
	SPH_C32(0x748F82EE), SPH_C32(0x78A5636F),
	SPH_C32(0x84C87814), SPH_C32(0x8CC70208),
	SPH_C32(0x90BEFFFA), SPH_C32(0xA4506CEB),
	SPH_C32(0xBEF9A3F7), SPH_C32(0xC67178F2)
};

/* message schedule, rounds 0-15: copy the input word unchanged */
#define SHA2_MEXP1(in, pc)   do { \
		W[pc] = in(pc); \
	} while (0)

/*
 * Message schedule, rounds 16-63, on a 16-word circular buffer:
 * W[t] = SSG2_1(W[t-2]) + W[t-7] + SSG2_0(W[t-15]) + W[t-16]
 */
#define SHA2_MEXP2(in, pc)   do { \
		W[(pc) & 0x0F] = SPH_T32(SSG2_1(W[((pc) - 2) & 0x0F]) \
			+ W[((pc) - 7) & 0x0F] \
			+ SSG2_0(W[((pc) - 15) & 0x0F]) + W[(pc) & 0x0F]); \
	} while (0)

/*
 * One compression round: n selects the schedule variant (1 = copy,
 * 2 = expand); pcount + pc is the absolute round index into K[].
 */
#define SHA2_STEPn(n, a, b, c, d, e, f, g, h, in, pc)   do { \
		sph_u32 t1, t2; \
		SHA2_MEXP ## n(in, pc); \
		t1 = SPH_T32(h + BSG2_1(e) + CH(e, f, g) \
			+ K[pcount + (pc)] + W[(pc) & 0x0F]); \
		t2 = SPH_T32(BSG2_0(a) + MAJ(a, b, c)); \
		d = SPH_T32(d + t1); \
		h = SPH_T32(t1 + t2); \
	} while (0)

#define SHA2_STEP1(a, b, c, d, e, f, g, h, in, pc)  \
	SHA2_STEPn(1, a, b, c, d, e, f, g, h, in, pc)
#define SHA2_STEP2(a, b, c, d, e, f, g, h, in, pc)  \
	SHA2_STEPn(2, a, b, c, d, e, f, g, h, in, pc)
#define SHA2_ROUND_BODY(in, r) do { \
sph_u32 A, B, C, D, E, F, G, H; \
sph_u32 W[16]; \
unsigned pcount; \
\
A = (r)[0]; \
B = (r)[1]; \
C = (r)[2]; \
D = (r)[3]; \
E = (r)[4]; \
F = (r)[5]; \
G = (r)[6]; \
H = (r)[7]; \
pcount = 0; \
SHA2_STEP1(A, B, C, D, E, F, G, H, in, 0); \
SHA2_STEP1(H, A, B, C, D, E, F, G, in, 1); \
SHA2_STEP1(G, H, A, B, C, D, E, F, in, 2); \
SHA2_STEP1(F, G, H, A, B, C, D, E, in, 3); \
SHA2_STEP1(E, F, G, H, A, B, C, D, in, 4); \
SHA2_STEP1(D, E, F, G, H, A, B, C, in, 5); \
SHA2_STEP1(C, D, E, F, G, H, A, B, in, 6); \
SHA2_STEP1(B, C, D, E, F, G, H, A, in, 7); \
SHA2_STEP1(A, B, C, D, E, F, G, H, in, 8); \
SHA2_STEP1(H, A, B, C, D, E, F, G, in, 9); \
SHA2_STEP1(G, H, A, B, C, D, E, F, in, 10); \
SHA2_STEP1(F, G, H, A, B, C, D, E, in, 11); \
SHA2_STEP1(E, F, G, H, A, B, C, D, in, 12); \
SHA2_STEP1(D, E, F, G, H, A, B, C, in, 13); \
SHA2_STEP1(C, D, E, F, G, H, A, B, in, 14); \
SHA2_STEP1(B, C, D, E, F, G, H, A, in, 15); \
for (pcount = 16; pcount < 64; pcount += 16) { \
SHA2_STEP2(A, B, C, D, E, F, G, H, in, 0); \
SHA2_STEP2(H, A, B, C, D, E, F, G, in, 1); \
SHA2_STEP2(G, H, A, B, C, D, E, F, in, 2); \
SHA2_STEP2(F, G, H, A, B, C, D, E, in, 3); \
SHA2_STEP2(E, F, G, H, A, B, C, D, in, 4); \
SHA2_STEP2(D, E, F, G, H, A, B, C, in, 5); \
SHA2_STEP2(C, D, E, F, G, H, A, B, in, 6); \
SHA2_STEP2(B, C, D, E, F, G, H, A, in, 7); \
SHA2_STEP2(A, B, C, D, E, F, G, H, in, 8); \
SHA2_STEP2(H, A, B, C, D, E, F, G, in, 9); \
SHA2_STEP2(G, H, A, B, C, D, E, F, in, 10); \
SHA2_STEP2(F, G, H, A, B, C, D, E, in, 11); \
SHA2_STEP2(E, F, G, H, A, B, C, D, in, 12); \
SHA2_STEP2(D, E, F, G, H, A, B, C, in, 13); \
SHA2_STEP2(C, D, E, F, G, H, A, B, in, 14); \
SHA2_STEP2(B, C, D, E, F, G, H, A, in, 15); \
} \
(r)[0] = SPH_T32((r)[0] + A); \
(r)[1] = SPH_T32((r)[1] + B); \
(r)[2] = SPH_T32((r)[2] + C); \
(r)[3] = SPH_T32((r)[3] + D); \
(r)[4] = SPH_T32((r)[4] + E); \
(r)[5] = SPH_T32((r)[5] + F); \
(r)[6] = SPH_T32((r)[6] + G); \
(r)[7] = SPH_T32((r)[7] + H); \
} while (0)
#else

/*
 * Fully unrolled SHA-224 / SHA-256 compression function (64 rounds,
 * FIPS 180-4).  The "in" parameter expands, for an index 0..15, to the
 * corresponding big-endian 32-bit input word; "r" designates the 8-word
 * chained state, updated in place.  The eight working variables A..H
 * are rotated by renaming (no data movement), and the 16-word message
 * schedule lives in W00..W15, updated in a rolling fashion for rounds
 * 16..63.
 *
 * Fix: removed the unused local "int i;" which triggered an
 * unused-variable warning at every expansion of this macro.
 */
#define SHA2_ROUND_BODY(in, r) do { \
		sph_u32 A, B, C, D, E, F, G, H, T1, T2; \
		sph_u32 W00, W01, W02, W03, W04, W05, W06, W07; \
		sph_u32 W08, W09, W10, W11, W12, W13, W14, W15; \
 \
		A = (r)[0]; \
		B = (r)[1]; \
		C = (r)[2]; \
		D = (r)[3]; \
		E = (r)[4]; \
		F = (r)[5]; \
		G = (r)[6]; \
		H = (r)[7]; \
		/* rounds 0..15: message words are used directly */ \
		W00 = in(0); \
		T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
			+ SPH_C32(0x428A2F98) + W00); \
		T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
		D = SPH_T32(D + T1); \
		H = SPH_T32(T1 + T2); \
		W01 = in(1); \
		T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
			+ SPH_C32(0x71374491) + W01); \
		T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
		C = SPH_T32(C + T1); \
		G = SPH_T32(T1 + T2); \
		W02 = in(2); \
		T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
			+ SPH_C32(0xB5C0FBCF) + W02); \
		T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
		B = SPH_T32(B + T1); \
		F = SPH_T32(T1 + T2); \
		W03 = in(3); \
		T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
			+ SPH_C32(0xE9B5DBA5) + W03); \
		T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
		A = SPH_T32(A + T1); \
		E = SPH_T32(T1 + T2); \
		W04 = in(4); \
		T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
			+ SPH_C32(0x3956C25B) + W04); \
		T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
		H = SPH_T32(H + T1); \
		D = SPH_T32(T1 + T2); \
		W05 = in(5); \
		T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
			+ SPH_C32(0x59F111F1) + W05); \
		T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
		G = SPH_T32(G + T1); \
		C = SPH_T32(T1 + T2); \
		W06 = in(6); \
		T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
			+ SPH_C32(0x923F82A4) + W06); \
		T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
		F = SPH_T32(F + T1); \
		B = SPH_T32(T1 + T2); \
		W07 = in(7); \
		T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
			+ SPH_C32(0xAB1C5ED5) + W07); \
		T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
		E = SPH_T32(E + T1); \
		A = SPH_T32(T1 + T2); \
		W08 = in(8); \
		T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
			+ SPH_C32(0xD807AA98) + W08); \
		T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
		D = SPH_T32(D + T1); \
		H = SPH_T32(T1 + T2); \
		W09 = in(9); \
		T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
			+ SPH_C32(0x12835B01) + W09); \
		T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
		C = SPH_T32(C + T1); \
		G = SPH_T32(T1 + T2); \
		W10 = in(10); \
		T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
			+ SPH_C32(0x243185BE) + W10); \
		T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
		B = SPH_T32(B + T1); \
		F = SPH_T32(T1 + T2); \
		W11 = in(11); \
		T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
			+ SPH_C32(0x550C7DC3) + W11); \
		T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
		A = SPH_T32(A + T1); \
		E = SPH_T32(T1 + T2); \
		W12 = in(12); \
		T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
			+ SPH_C32(0x72BE5D74) + W12); \
		T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
		H = SPH_T32(H + T1); \
		D = SPH_T32(T1 + T2); \
		W13 = in(13); \
		T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
			+ SPH_C32(0x80DEB1FE) + W13); \
		T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
		G = SPH_T32(G + T1); \
		C = SPH_T32(T1 + T2); \
		W14 = in(14); \
		T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
			+ SPH_C32(0x9BDC06A7) + W14); \
		T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
		F = SPH_T32(F + T1); \
		B = SPH_T32(T1 + T2); \
		W15 = in(15); \
		T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
			+ SPH_C32(0xC19BF174) + W15); \
		T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
		E = SPH_T32(E + T1); \
		A = SPH_T32(T1 + T2); \
		/* rounds 16..31: rolling message schedule */ \
		W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
		T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
			+ SPH_C32(0xE49B69C1) + W00); \
		T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
		D = SPH_T32(D + T1); \
		H = SPH_T32(T1 + T2); \
		W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
		T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
			+ SPH_C32(0xEFBE4786) + W01); \
		T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
		C = SPH_T32(C + T1); \
		G = SPH_T32(T1 + T2); \
		W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
		T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
			+ SPH_C32(0x0FC19DC6) + W02); \
		T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
		B = SPH_T32(B + T1); \
		F = SPH_T32(T1 + T2); \
		W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
		T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
			+ SPH_C32(0x240CA1CC) + W03); \
		T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
		A = SPH_T32(A + T1); \
		E = SPH_T32(T1 + T2); \
		W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
		T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
			+ SPH_C32(0x2DE92C6F) + W04); \
		T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
		H = SPH_T32(H + T1); \
		D = SPH_T32(T1 + T2); \
		W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
		T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
			+ SPH_C32(0x4A7484AA) + W05); \
		T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
		G = SPH_T32(G + T1); \
		C = SPH_T32(T1 + T2); \
		W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
		T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
			+ SPH_C32(0x5CB0A9DC) + W06); \
		T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
		F = SPH_T32(F + T1); \
		B = SPH_T32(T1 + T2); \
		W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
		T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
			+ SPH_C32(0x76F988DA) + W07); \
		T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
		E = SPH_T32(E + T1); \
		A = SPH_T32(T1 + T2); \
		W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
		T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
			+ SPH_C32(0x983E5152) + W08); \
		T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
		D = SPH_T32(D + T1); \
		H = SPH_T32(T1 + T2); \
		W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
		T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
			+ SPH_C32(0xA831C66D) + W09); \
		T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
		C = SPH_T32(C + T1); \
		G = SPH_T32(T1 + T2); \
		W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
		T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
			+ SPH_C32(0xB00327C8) + W10); \
		T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
		B = SPH_T32(B + T1); \
		F = SPH_T32(T1 + T2); \
		W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
		T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
			+ SPH_C32(0xBF597FC7) + W11); \
		T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
		A = SPH_T32(A + T1); \
		E = SPH_T32(T1 + T2); \
		W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
		T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
			+ SPH_C32(0xC6E00BF3) + W12); \
		T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
		H = SPH_T32(H + T1); \
		D = SPH_T32(T1 + T2); \
		W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
		T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
			+ SPH_C32(0xD5A79147) + W13); \
		T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
		G = SPH_T32(G + T1); \
		C = SPH_T32(T1 + T2); \
		W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
		T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
			+ SPH_C32(0x06CA6351) + W14); \
		T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
		F = SPH_T32(F + T1); \
		B = SPH_T32(T1 + T2); \
		W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
		T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
			+ SPH_C32(0x14292967) + W15); \
		T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
		E = SPH_T32(E + T1); \
		A = SPH_T32(T1 + T2); \
		/* rounds 32..47 */ \
		W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
		T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
			+ SPH_C32(0x27B70A85) + W00); \
		T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
		D = SPH_T32(D + T1); \
		H = SPH_T32(T1 + T2); \
		W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
		T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
			+ SPH_C32(0x2E1B2138) + W01); \
		T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
		C = SPH_T32(C + T1); \
		G = SPH_T32(T1 + T2); \
		W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
		T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
			+ SPH_C32(0x4D2C6DFC) + W02); \
		T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
		B = SPH_T32(B + T1); \
		F = SPH_T32(T1 + T2); \
		W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
		T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
			+ SPH_C32(0x53380D13) + W03); \
		T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
		A = SPH_T32(A + T1); \
		E = SPH_T32(T1 + T2); \
		W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
		T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
			+ SPH_C32(0x650A7354) + W04); \
		T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
		H = SPH_T32(H + T1); \
		D = SPH_T32(T1 + T2); \
		W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
		T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
			+ SPH_C32(0x766A0ABB) + W05); \
		T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
		G = SPH_T32(G + T1); \
		C = SPH_T32(T1 + T2); \
		W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
		T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
			+ SPH_C32(0x81C2C92E) + W06); \
		T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
		F = SPH_T32(F + T1); \
		B = SPH_T32(T1 + T2); \
		W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
		T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
			+ SPH_C32(0x92722C85) + W07); \
		T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
		E = SPH_T32(E + T1); \
		A = SPH_T32(T1 + T2); \
		W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
		T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
			+ SPH_C32(0xA2BFE8A1) + W08); \
		T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
		D = SPH_T32(D + T1); \
		H = SPH_T32(T1 + T2); \
		W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
		T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
			+ SPH_C32(0xA81A664B) + W09); \
		T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
		C = SPH_T32(C + T1); \
		G = SPH_T32(T1 + T2); \
		W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
		T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
			+ SPH_C32(0xC24B8B70) + W10); \
		T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
		B = SPH_T32(B + T1); \
		F = SPH_T32(T1 + T2); \
		W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
		T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
			+ SPH_C32(0xC76C51A3) + W11); \
		T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
		A = SPH_T32(A + T1); \
		E = SPH_T32(T1 + T2); \
		W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
		T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
			+ SPH_C32(0xD192E819) + W12); \
		T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
		H = SPH_T32(H + T1); \
		D = SPH_T32(T1 + T2); \
		W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
		T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
			+ SPH_C32(0xD6990624) + W13); \
		T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
		G = SPH_T32(G + T1); \
		C = SPH_T32(T1 + T2); \
		W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
		T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
			+ SPH_C32(0xF40E3585) + W14); \
		T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
		F = SPH_T32(F + T1); \
		B = SPH_T32(T1 + T2); \
		W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
		T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
			+ SPH_C32(0x106AA070) + W15); \
		T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
		E = SPH_T32(E + T1); \
		A = SPH_T32(T1 + T2); \
		/* rounds 48..63 */ \
		W00 = SPH_T32(SSG2_1(W14) + W09 + SSG2_0(W01) + W00); \
		T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
			+ SPH_C32(0x19A4C116) + W00); \
		T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
		D = SPH_T32(D + T1); \
		H = SPH_T32(T1 + T2); \
		W01 = SPH_T32(SSG2_1(W15) + W10 + SSG2_0(W02) + W01); \
		T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
			+ SPH_C32(0x1E376C08) + W01); \
		T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
		C = SPH_T32(C + T1); \
		G = SPH_T32(T1 + T2); \
		W02 = SPH_T32(SSG2_1(W00) + W11 + SSG2_0(W03) + W02); \
		T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
			+ SPH_C32(0x2748774C) + W02); \
		T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
		B = SPH_T32(B + T1); \
		F = SPH_T32(T1 + T2); \
		W03 = SPH_T32(SSG2_1(W01) + W12 + SSG2_0(W04) + W03); \
		T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
			+ SPH_C32(0x34B0BCB5) + W03); \
		T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
		A = SPH_T32(A + T1); \
		E = SPH_T32(T1 + T2); \
		W04 = SPH_T32(SSG2_1(W02) + W13 + SSG2_0(W05) + W04); \
		T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
			+ SPH_C32(0x391C0CB3) + W04); \
		T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
		H = SPH_T32(H + T1); \
		D = SPH_T32(T1 + T2); \
		W05 = SPH_T32(SSG2_1(W03) + W14 + SSG2_0(W06) + W05); \
		T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
			+ SPH_C32(0x4ED8AA4A) + W05); \
		T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
		G = SPH_T32(G + T1); \
		C = SPH_T32(T1 + T2); \
		W06 = SPH_T32(SSG2_1(W04) + W15 + SSG2_0(W07) + W06); \
		T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
			+ SPH_C32(0x5B9CCA4F) + W06); \
		T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
		F = SPH_T32(F + T1); \
		B = SPH_T32(T1 + T2); \
		W07 = SPH_T32(SSG2_1(W05) + W00 + SSG2_0(W08) + W07); \
		T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
			+ SPH_C32(0x682E6FF3) + W07); \
		T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
		E = SPH_T32(E + T1); \
		A = SPH_T32(T1 + T2); \
		W08 = SPH_T32(SSG2_1(W06) + W01 + SSG2_0(W09) + W08); \
		T1 = SPH_T32(H + BSG2_1(E) + CH(E, F, G) \
			+ SPH_C32(0x748F82EE) + W08); \
		T2 = SPH_T32(BSG2_0(A) + MAJ(A, B, C)); \
		D = SPH_T32(D + T1); \
		H = SPH_T32(T1 + T2); \
		W09 = SPH_T32(SSG2_1(W07) + W02 + SSG2_0(W10) + W09); \
		T1 = SPH_T32(G + BSG2_1(D) + CH(D, E, F) \
			+ SPH_C32(0x78A5636F) + W09); \
		T2 = SPH_T32(BSG2_0(H) + MAJ(H, A, B)); \
		C = SPH_T32(C + T1); \
		G = SPH_T32(T1 + T2); \
		W10 = SPH_T32(SSG2_1(W08) + W03 + SSG2_0(W11) + W10); \
		T1 = SPH_T32(F + BSG2_1(C) + CH(C, D, E) \
			+ SPH_C32(0x84C87814) + W10); \
		T2 = SPH_T32(BSG2_0(G) + MAJ(G, H, A)); \
		B = SPH_T32(B + T1); \
		F = SPH_T32(T1 + T2); \
		W11 = SPH_T32(SSG2_1(W09) + W04 + SSG2_0(W12) + W11); \
		T1 = SPH_T32(E + BSG2_1(B) + CH(B, C, D) \
			+ SPH_C32(0x8CC70208) + W11); \
		T2 = SPH_T32(BSG2_0(F) + MAJ(F, G, H)); \
		A = SPH_T32(A + T1); \
		E = SPH_T32(T1 + T2); \
		W12 = SPH_T32(SSG2_1(W10) + W05 + SSG2_0(W13) + W12); \
		T1 = SPH_T32(D + BSG2_1(A) + CH(A, B, C) \
			+ SPH_C32(0x90BEFFFA) + W12); \
		T2 = SPH_T32(BSG2_0(E) + MAJ(E, F, G)); \
		H = SPH_T32(H + T1); \
		D = SPH_T32(T1 + T2); \
		W13 = SPH_T32(SSG2_1(W11) + W06 + SSG2_0(W14) + W13); \
		T1 = SPH_T32(C + BSG2_1(H) + CH(H, A, B) \
			+ SPH_C32(0xA4506CEB) + W13); \
		T2 = SPH_T32(BSG2_0(D) + MAJ(D, E, F)); \
		G = SPH_T32(G + T1); \
		C = SPH_T32(T1 + T2); \
		W14 = SPH_T32(SSG2_1(W12) + W07 + SSG2_0(W15) + W14); \
		T1 = SPH_T32(B + BSG2_1(G) + CH(G, H, A) \
			+ SPH_C32(0xBEF9A3F7) + W14); \
		T2 = SPH_T32(BSG2_0(C) + MAJ(C, D, E)); \
		F = SPH_T32(F + T1); \
		B = SPH_T32(T1 + T2); \
		W15 = SPH_T32(SSG2_1(W13) + W08 + SSG2_0(W00) + W15); \
		T1 = SPH_T32(A + BSG2_1(F) + CH(F, G, H) \
			+ SPH_C32(0xC67178F2) + W15); \
		T2 = SPH_T32(BSG2_0(B) + MAJ(B, C, D)); \
		E = SPH_T32(E + T1); \
		A = SPH_T32(T1 + T2); \
		/* Davies-Meyer feed-forward: add working vars into the state. */ \
		(r)[0] = SPH_T32((r)[0] + A); \
		(r)[1] = SPH_T32((r)[1] + B); \
		(r)[2] = SPH_T32((r)[2] + C); \
		(r)[3] = SPH_T32((r)[3] + D); \
		(r)[4] = SPH_T32((r)[4] + E); \
		(r)[5] = SPH_T32((r)[5] + F); \
		(r)[6] = SPH_T32((r)[6] + G); \
		(r)[7] = SPH_T32((r)[7] + H); \
	} while (0)

#endif
/*
 * One round of SHA-224 / SHA-256. The data must be aligned for 32-bit access.
 */
static void
sha2_round(const unsigned char *data, sph_u32 r[8])
{
	/* Decode the 64-byte block as 16 big-endian 32-bit words. */
#define SHA2_IN(x)   sph_dec32be_aligned(data + (4 * (x)))
	SHA2_ROUND_BODY(SHA2_IN, r);
#undef SHA2_IN
}
/* see sph_sha2.h */
void
sph_sha224_init(void *cc)
{
	/* Load the SHA-224 initial state and reset the input bit counter. */
	sph_sha224_context *ctx = cc;

	memcpy(ctx->val, H224, sizeof H224);
#if SPH_64
	ctx->count = 0;
#else
	ctx->count_low = 0;
	ctx->count_high = 0;
#endif
}
/* see sph_sha2.h */
void
sph_sha256_init(void *cc)
{
	/* Load the SHA-256 initial state and reset the input bit counter. */
	sph_sha256_context *ctx = cc;

	memcpy(ctx->val, H256, sizeof H256);
#if SPH_64
	ctx->count = 0;
#else
	ctx->count_low = 0;
	ctx->count_high = 0;
#endif
}
/*
 * Instantiate the generic Merkle-Damgard plumbing from md_helper.c for
 * this compression function: round function sha2_round, big-endian
 * 32-bit encoding, names prefixed "sha224" (this generates the
 * sha224_close / sha224_addbits_and_close helpers used below).
 * SHA-256 shares the generated code since it differs from SHA-224 only
 * in its IV and output length.
 */
#define RFUN sha2_round
#define HASH sha224
#define BE32 1
#include "md_helper.c"
/* see sph_sha2.h */
void
sph_sha224_close(void *cc, void *dst)
{
	/* 7 output words = 28-byte SHA-224 digest; context is then
	   re-initialized so it can be reused for a new message. */
	sha224_close(cc, dst, 7);
	sph_sha224_init(cc);
}
/* see sph_sha2.h */
void
sph_sha224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	/* Append n (< 8) extra bits from ub before padding, then finish
	   as sph_sha224_close() does (7-word output, re-init). */
	sha224_addbits_and_close(cc, ub, n, dst, 7);
	sph_sha224_init(cc);
}
/* see sph_sha2.h */
void
sph_sha256_close(void *cc, void *dst)
{
	/* Deliberately reuses the sha224-named helper generated by
	   md_helper.c (the cores are identical); 8 output words make
	   the 32-byte SHA-256 digest. */
	sha224_close(cc, dst, 8);
	sph_sha256_init(cc);
}
/* see sph_sha2.h */
void
sph_sha256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst)
{
	/* Append n (< 8) extra bits from ub before padding; 8-word
	   (32-byte) output, then re-init for reuse. */
	sha224_addbits_and_close(cc, ub, n, dst, 8);
	sph_sha256_init(cc);
}
/* see sph_sha2.h */
void
sph_sha224_comp(const sph_u32 msg[16], sph_u32 val[8])
{
	/* Raw compression function: msg holds 16 already-decoded 32-bit
	   words, val is the 8-word state updated in place. Also valid
	   for SHA-256 (same core, different IV). */
#define SHA2_IN(x)   msg[x]
	SHA2_ROUND_BODY(SHA2_IN, val);
#undef SHA2_IN
}

45
util.cpp

@@ -1421,28 +1421,39 @@ static uint32_t getblocheight(struct stratum_ctx *sctx)
static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
{
const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *stime, *nreward;
const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *stime;
const char *claim = NULL, *nreward = NULL;
size_t coinb1_size, coinb2_size;
bool clean, ret = false;
int merkle_count, i;
int merkle_count, i, p=0;
json_t *merkle_arr;
uchar **merkle = NULL;
// uchar(*merkle_tree)[32] = { 0 };
int ntime;
job_id = json_string_value(json_array_get(params, 0));
prevhash = json_string_value(json_array_get(params, 1));
coinb1 = json_string_value(json_array_get(params, 2));
coinb2 = json_string_value(json_array_get(params, 3));
merkle_arr = json_array_get(params, 4);
char algo[64] = { 0 };
get_currentalgo(algo, sizeof(algo));
bool has_claim = !strcasecmp(algo, "lbry");
job_id = json_string_value(json_array_get(params, p++));
prevhash = json_string_value(json_array_get(params, p++));
if (has_claim) {
claim = json_string_value(json_array_get(params, p++));
if (!claim || strlen(claim) != 64) {
applog(LOG_ERR, "Stratum notify: invalid claim parameter");
goto out;
}
}
coinb1 = json_string_value(json_array_get(params, p++));
coinb2 = json_string_value(json_array_get(params, p++));
merkle_arr = json_array_get(params, p++);
if (!merkle_arr || !json_is_array(merkle_arr))
goto out;
merkle_count = (int) json_array_size(merkle_arr);
version = json_string_value(json_array_get(params, 5));
nbits = json_string_value(json_array_get(params, 6));
stime = json_string_value(json_array_get(params, 7));
clean = json_is_true(json_array_get(params, 8));
nreward = json_string_value(json_array_get(params, 9));
version = json_string_value(json_array_get(params, p++));
nbits = json_string_value(json_array_get(params, p++));
stime = json_string_value(json_array_get(params, p++));
clean = json_is_true(json_array_get(params, p)); p++;
nreward = json_string_value(json_array_get(params, p++));
if (!job_id || !prevhash || !coinb1 || !coinb2 || !version || !nbits || !stime ||
strlen(prevhash) != 64 || strlen(version) != 8 ||
@@ -1494,6 +1505,7 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
free(sctx->job.job_id);
sctx->job.job_id = strdup(job_id);
hex2bin(sctx->job.prevhash, prevhash, 32);
if (has_claim) hex2bin(sctx->job.claim, claim, 32);
sctx->job.height = getblocheight(sctx);
@@ -2079,7 +2091,8 @@ void do_gpu_tests(void)
//scanhash_scrypt_jane(0, &work, NULL, 1, &done, &tv, &tv);
memset(work.data, 0, sizeof(work.data));
scanhash_decred(0, &work, 1, &done);
work.data[0] = 0;
scanhash_lbry(0, &work, 1, &done);
free(work_restart);
work_restart = NULL;
@@ -2142,6 +2155,10 @@ void print_hash_tests(void)
keccak256_hash(&hash[0], &buf[0]);
printpfx("keccak", hash);
memset(buf, 0, 128);
lbry_hash(&hash[0], &buf[0]);
printpfx("lbry", hash);
luffa_hash(&hash[0], &buf[0]);
printpfx("luffa", hash);

Loading…
Cancel
Save