Browse Source

Merge branch 'windows' of https://github.com/tpruvot/ccminer

2upstream
R4SAS 7 years ago
parent
commit
ded6a391b2
  1. 5
      Makefile.am
  2. 16
      README.txt
  3. 6
      algos.h
  4. 4
      api.cpp
  5. 3
      bench.cpp
  6. 16
      ccminer.cpp
  7. 13
      ccminer.vcxproj
  8. 36
      ccminer.vcxproj.filters
  9. 2
      compat/ccminer-config.h
  10. 37
      equi/cuda_equi.cu
  11. 9
      miner.h
  12. 15
      neoscrypt/cuda_neoscrypt.cu
  13. 1
      neoscrypt/neoscrypt.cpp
  14. 12
      res/ccminer.rc
  15. 3
      scrypt/salsa_kernel.cu
  16. 25
      util.cpp
  17. 236
      x12/x12.cu
  18. 715
      x13/cuda_x13_hamsi512.cu
  19. 64
      x15/cuda_x15_whirlpool_sm3.cu
  20. 80
      x16/cuda_x16.h
  21. 214
      x16/cuda_x16_echo512.cu
  22. 248
      x16/cuda_x16_echo512_64.cu
  23. 467
      x16/cuda_x16_fugue512.cu
  24. 350
      x16/cuda_x16_shabal512.cu
  25. 1836
      x16/cuda_x16_simd512_80.cu
  26. 622
      x16/x16r.cu
  27. 601
      x16/x16s.cu
  28. 77
      x17/cuda_x17_sha512.cu

5
Makefile.am

@ -69,13 +69,16 @@ ccminer_SOURCES = elist.h miner.h compat.h \
lbry/lbry.cu lbry/cuda_sha256_lbry.cu lbry/cuda_sha512_lbry.cu lbry/cuda_lbry_merged.cu \ lbry/lbry.cu lbry/cuda_sha256_lbry.cu lbry/cuda_sha512_lbry.cu lbry/cuda_lbry_merged.cu \
qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \ qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \
tribus/tribus.cu tribus/cuda_echo512_final.cu \ tribus/tribus.cu tribus/cuda_echo512_final.cu \
x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \ x11/x11.cu x12/x12.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \ x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \
x11/cuda_x11_luffa512_Cubehash.cu x11/x11evo.cu x11/timetravel.cu x11/bitcore.cu \ x11/cuda_x11_luffa512_Cubehash.cu x11/x11evo.cu x11/timetravel.cu x11/bitcore.cu \
x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \ x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \
x13/hsr.cu x13/cuda_hsr_sm3.cu x13/sm3.c \ x13/hsr.cu x13/cuda_hsr_sm3.cu x13/sm3.c \
x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \ x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \
x15/whirlpool.cu x15/cuda_x15_whirlpool_sm3.cu \ x15/whirlpool.cu x15/cuda_x15_whirlpool_sm3.cu \
x16/x16r.cu x16/x16s.cu x16/cuda_x16_echo512.cu x16/cuda_x16_fugue512.cu \
x16/cuda_x16_shabal512.cu x16/cuda_x16_simd512_80.cu \
x16/cuda_x16_echo512_64.cu \
x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \ x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \
x11/phi.cu x11/cuda_streebog_maxwell.cu \ x11/phi.cu x11/cuda_streebog_maxwell.cu \
x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu \ x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu \

16
README.txt

@ -1,5 +1,5 @@
ccminer 2.2.4 (Jan. 2018) "lyra2v2 and keccak improvements" ccminer 2.2.5 (Apr 2018) "x12, x16r and x16s algos"
--------------------------------------------------------------- ---------------------------------------------------------------
*************************************************************** ***************************************************************
@ -120,8 +120,12 @@ its command line interface and options.
tribus use to mine Denarius tribus use to mine Denarius
x11evo use to mine Revolver x11evo use to mine Revolver
x11 use to mine DarkCoin x11 use to mine DarkCoin
x14 use to mine X14Coin x12 use to mine GalaxyCash
x13 use to mine X13
x14 use to mine X14
x15 use to mine Halcyon x15 use to mine Halcyon
x16r use to mine Raven
x16s use to mine Pigeon and Eden
x17 use to mine X17 x17 use to mine X17
vanilla use to mine Vanilla (Blake256) vanilla use to mine Vanilla (Blake256)
veltor use to mine VeltorCoin veltor use to mine VeltorCoin
@ -277,7 +281,13 @@ so we can more efficiently implement new algorithms using the latest hardware
features. features.
>>> RELEASE HISTORY <<< >>> RELEASE HISTORY <<<
Jan. 04th 2017 v2.2.4 Apr. 02nd 2018 v2.2.5
New x16r algo for Raven
New x16s algo for Pigeon and Eden
New x12 algo for Galaxycash
Equihash (SIMT) sync issues for the Volta generation
Jan. 04th 2018 v2.2.4
Improve lyra2v2 Improve lyra2v2
Higher keccak default intensity Higher keccak default intensity
Drop SM 2.x support by default, for CUDA 9 and more recent Drop SM 2.x support by default, for CUDA 9 and more recent

6
algos.h

@ -57,9 +57,12 @@ enum sha_algos {
ALGO_BITCORE, ALGO_BITCORE,
ALGO_X11EVO, ALGO_X11EVO,
ALGO_X11, ALGO_X11,
ALGO_X12,
ALGO_X13, ALGO_X13,
ALGO_X14, ALGO_X14,
ALGO_X15, ALGO_X15,
ALGO_X16R,
ALGO_X16S,
ALGO_X17, ALGO_X17,
ALGO_VANILLA, ALGO_VANILLA,
ALGO_VELTOR, ALGO_VELTOR,
@ -127,9 +130,12 @@ static const char *algo_names[] = {
"bitcore", "bitcore",
"x11evo", "x11evo",
"x11", "x11",
"x12",
"x13", "x13",
"x14", "x14",
"x15", "x15",
"x16r",
"x16s",
"x17", "x17",
"vanilla", "vanilla",
"veltor", "veltor",

4
api.cpp

@ -1252,7 +1252,7 @@ static void api()
char *wskey = NULL; char *wskey = NULL;
n = recv(c, &buf[0], SOCK_REC_BUFSZ, 0); n = recv(c, &buf[0], SOCK_REC_BUFSZ, 0);
fail = SOCKETFAIL(n); fail = SOCKETFAIL(n) || n < 0;
if (fail) if (fail)
buf[0] = '\0'; buf[0] = '\0';
else if (n > 0 && buf[n-1] == '\n') { else if (n > 0 && buf[n-1] == '\n') {
@ -1261,7 +1261,7 @@ static void api()
if (n > 0 && buf[n-1] == '\r') if (n > 0 && buf[n-1] == '\r')
buf[n-1] = '\0'; buf[n-1] = '\0';
} }
buf[n] = '\0'; else buf[n] = '\0';
//if (opt_debug && opt_protocol && n > 0) //if (opt_debug && opt_protocol && n > 0)
// applog(LOG_DEBUG, "API: recv command: (%d) '%s'+char(%x)", n, buf, buf[n-1]); // applog(LOG_DEBUG, "API: recv command: (%d) '%s'+char(%x)", n, buf, buf[n-1]);

3
bench.cpp

@ -100,9 +100,12 @@ void algo_free_all(int thr_id)
free_wildkeccak(thr_id); free_wildkeccak(thr_id);
free_x11evo(thr_id); free_x11evo(thr_id);
free_x11(thr_id); free_x11(thr_id);
free_x12(thr_id);
free_x13(thr_id); free_x13(thr_id);
free_x14(thr_id); free_x14(thr_id);
free_x15(thr_id); free_x15(thr_id);
free_x16r(thr_id);
free_x16s(thr_id);
free_x17(thr_id); free_x17(thr_id);
free_zr5(thr_id); free_zr5(thr_id);
free_scrypt(thr_id); free_scrypt(thr_id);

16
ccminer.cpp

@ -291,9 +291,12 @@ Options:\n\
whirlpool Whirlpool algo\n\ whirlpool Whirlpool algo\n\
x11evo Permuted x11 (Revolver)\n\ x11evo Permuted x11 (Revolver)\n\
x11 X11 (DarkCoin)\n\ x11 X11 (DarkCoin)\n\
x12 X12 (GalaxyCash)\n\
x13 X13 (MaruCoin)\n\ x13 X13 (MaruCoin)\n\
x14 X14\n\ x14 X14\n\
x15 X15\n\ x15 X15\n\
x16r X16R (Raven)\n\
x16s X16S\n\
x17 X17\n\ x17 X17\n\
wildkeccak Boolberry\n\ wildkeccak Boolberry\n\
zr5 ZR5 (ZiftrCoin)\n\ zr5 ZR5 (ZiftrCoin)\n\
@ -1714,6 +1717,8 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
case ALGO_LYRA2Z: case ALGO_LYRA2Z:
case ALGO_TIMETRAVEL: case ALGO_TIMETRAVEL:
case ALGO_BITCORE: case ALGO_BITCORE:
case ALGO_X16R:
case ALGO_X16S:
work_set_target(work, sctx->job.diff / (256.0 * opt_difficulty)); work_set_target(work, sctx->job.diff / (256.0 * opt_difficulty));
break; break;
case ALGO_KECCAK: case ALGO_KECCAK:
@ -2253,6 +2258,7 @@ static void *miner_thread(void *userdata)
case ALGO_BITCORE: case ALGO_BITCORE:
case ALGO_X11EVO: case ALGO_X11EVO:
case ALGO_X11: case ALGO_X11:
case ALGO_X12:
case ALGO_X13: case ALGO_X13:
case ALGO_WHIRLCOIN: case ALGO_WHIRLCOIN:
case ALGO_WHIRLPOOL: case ALGO_WHIRLPOOL:
@ -2503,6 +2509,9 @@ static void *miner_thread(void *userdata)
case ALGO_X11: case ALGO_X11:
rc = scanhash_x11(thr_id, &work, max_nonce, &hashes_done); rc = scanhash_x11(thr_id, &work, max_nonce, &hashes_done);
break; break;
case ALGO_X12:
rc = scanhash_x12(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_X13: case ALGO_X13:
rc = scanhash_x13(thr_id, &work, max_nonce, &hashes_done); rc = scanhash_x13(thr_id, &work, max_nonce, &hashes_done);
break; break;
@ -2512,6 +2521,12 @@ static void *miner_thread(void *userdata)
case ALGO_X15: case ALGO_X15:
rc = scanhash_x15(thr_id, &work, max_nonce, &hashes_done); rc = scanhash_x15(thr_id, &work, max_nonce, &hashes_done);
break; break;
case ALGO_X16R:
rc = scanhash_x16r(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_X16S:
rc = scanhash_x16s(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_X17: case ALGO_X17:
rc = scanhash_x17(thr_id, &work, max_nonce, &hashes_done); rc = scanhash_x17(thr_id, &work, max_nonce, &hashes_done);
break; break;
@ -3159,6 +3174,7 @@ void parse_arg(int key, char *arg)
if (v < 1 || v > 65535) // sanity check if (v < 1 || v > 65535) // sanity check
show_usage_and_exit(1); show_usage_and_exit(1);
opt_api_mcast_port = v; opt_api_mcast_port = v;
break;
case 'B': case 'B':
opt_background = true; opt_background = true;
break; break;

13
ccminer.vcxproj

@ -269,6 +269,7 @@
<ClCompile Include="neoscrypt\neoscrypt-cpu.c" /> <ClCompile Include="neoscrypt\neoscrypt-cpu.c" />
<ClInclude Include="neoscrypt\cuda_vectors.h" /> <ClInclude Include="neoscrypt\cuda_vectors.h" />
<ClInclude Include="x11\cuda_x11_simd512_sm2.cuh" /> <ClInclude Include="x11\cuda_x11_simd512_sm2.cuh" />
<ClInclude Include="x16\cuda_x16.h" />
<CudaCompile Include="Algo256\bmw.cu" /> <CudaCompile Include="Algo256\bmw.cu" />
<CudaCompile Include="Algo256\cuda_bmw.cu"> <CudaCompile Include="Algo256\cuda_bmw.cu">
<MaxRegCount>76</MaxRegCount> <MaxRegCount>76</MaxRegCount>
@ -573,6 +574,7 @@
<CudaCompile Include="x11\veltor.cu" /> <CudaCompile Include="x11\veltor.cu" />
<CudaCompile Include="x11\x11.cu" /> <CudaCompile Include="x11\x11.cu" />
<CudaCompile Include="x11\x11evo.cu" /> <CudaCompile Include="x11\x11evo.cu" />
<CudaCompile Include="x12\x12.cu" />
<CudaCompile Include="x13\cuda_x13_hamsi512.cu"> <CudaCompile Include="x13\cuda_x13_hamsi512.cu">
<MaxRegCount>72</MaxRegCount> <MaxRegCount>72</MaxRegCount>
</CudaCompile> </CudaCompile>
@ -587,8 +589,17 @@
<CudaCompile Include="x17\hmq17.cu" /> <CudaCompile Include="x17\hmq17.cu" />
<CudaCompile Include="x15\x15.cu" /> <CudaCompile Include="x15\x15.cu" />
<CudaCompile Include="x15\whirlpool.cu" /> <CudaCompile Include="x15\whirlpool.cu" />
<CudaCompile Include="x17\x17.cu"> <CudaCompile Include="x15\cuda_x15_whirlpool_sm3.cu" />
<CudaCompile Include="x16\x16r.cu" />
<CudaCompile Include="x16\x16s.cu" />
<CudaCompile Include="x16\cuda_x16_echo512.cu" />
<CudaCompile Include="x16\cuda_x16_fugue512.cu" />
<CudaCompile Include="x16\cuda_x16_shabal512.cu" />
<CudaCompile Include="x16\cuda_x16_simd512_80.cu" />
<CudaCompile Include="x16\cuda_x16_echo512_64.cu">
<CodeGeneration>compute_50,sm_50;compute_52,sm_52</CodeGeneration>
</CudaCompile> </CudaCompile>
<CudaCompile Include="x17\x17.cu" />
<CudaCompile Include="x17\cuda_x17_haval256.cu"> <CudaCompile Include="x17\cuda_x17_haval256.cu">
</CudaCompile> </CudaCompile>
<CudaCompile Include="x17\cuda_x17_sha512.cu"> <CudaCompile Include="x17\cuda_x17_sha512.cu">

36
ccminer.vcxproj.filters

@ -58,6 +58,9 @@
<Filter Include="Source Files\CUDA\x15"> <Filter Include="Source Files\CUDA\x15">
<UniqueIdentifier>{a2403c22-6777-46ab-a55a-3fcc7386c974}</UniqueIdentifier> <UniqueIdentifier>{a2403c22-6777-46ab-a55a-3fcc7386c974}</UniqueIdentifier>
</Filter> </Filter>
<Filter Include="Source Files\CUDA\x16">
<UniqueIdentifier>{55dfae6a-66ba-43e2-8ceb-98ee70cbdf16}</UniqueIdentifier>
</Filter>
<Filter Include="Source Files\CUDA\x17"> <Filter Include="Source Files\CUDA\x17">
<UniqueIdentifier>{85dfae6a-66ca-4332-8cec-98ee70cbdf2f}</UniqueIdentifier> <UniqueIdentifier>{85dfae6a-66ca-4332-8cec-98ee70cbdf2f}</UniqueIdentifier>
</Filter> </Filter>
@ -112,6 +115,9 @@
<Filter Include="Source Files\CUDA\tribus"> <Filter Include="Source Files\CUDA\tribus">
<UniqueIdentifier>{1e548d79-c217-4203-989a-a592fe2b2de3}</UniqueIdentifier> <UniqueIdentifier>{1e548d79-c217-4203-989a-a592fe2b2de3}</UniqueIdentifier>
</Filter> </Filter>
<Filter Include="Source Files\CUDA\x12">
<UniqueIdentifier>{xde48d89-fx12-1323-129a-b592fe2b2de3}</UniqueIdentifier>
</Filter>
<Filter Include="Source Files\CUDA\gost"> <Filter Include="Source Files\CUDA\gost">
<UniqueIdentifier>{6a99bc95-f402-465e-9e64-b042bd241bb7}</UniqueIdentifier> <UniqueIdentifier>{6a99bc95-f402-465e-9e64-b042bd241bb7}</UniqueIdentifier>
</Filter> </Filter>
@ -599,6 +605,9 @@
<ClInclude Include="equi\equihash.h"> <ClInclude Include="equi\equihash.h">
<Filter>Source Files\equi</Filter> <Filter>Source Files\equi</Filter>
</ClInclude> </ClInclude>
<ClInclude Include="x16\cuda_x16.h">
<Filter>Header Files\CUDA</Filter>
</ClInclude>
</ItemGroup> </ItemGroup>
<ItemGroup> <ItemGroup>
<CudaCompile Include="cuda.cpp"> <CudaCompile Include="cuda.cpp">
@ -808,6 +817,9 @@
<CudaCompile Include="x11\s3.cu"> <CudaCompile Include="x11\s3.cu">
<Filter>Source Files\CUDA\x11</Filter> <Filter>Source Files\CUDA\x11</Filter>
</CudaCompile> </CudaCompile>
<CudaCompile Include="x12\x12.cu">
<Filter>Source Files\CUDA\x12</Filter>
</CudaCompile>
<CudaCompile Include="x11\timetravel.cu"> <CudaCompile Include="x11\timetravel.cu">
<Filter>Source Files\CUDA\x11</Filter> <Filter>Source Files\CUDA\x11</Filter>
</CudaCompile> </CudaCompile>
@ -970,6 +982,30 @@
<CudaCompile Include="equi\cuda_equi.cu"> <CudaCompile Include="equi\cuda_equi.cu">
<Filter>Source Files\equi</Filter> <Filter>Source Files\equi</Filter>
</CudaCompile> </CudaCompile>
<CudaCompile Include="x15\cuda_x15_whirlpool_sm3.cu">
<Filter>Source Files\CUDA\x15</Filter>
</CudaCompile>
<CudaCompile Include="x16\cuda_x16_echo512.cu">
<Filter>Source Files\CUDA\x16</Filter>
</CudaCompile>
<CudaCompile Include="x16\cuda_x16_echo512_64.cu">
<Filter>Source Files\CUDA\x16</Filter>
</CudaCompile>
<CudaCompile Include="x16\cuda_x16_fugue512.cu">
<Filter>Source Files\CUDA\x16</Filter>
</CudaCompile>
<CudaCompile Include="x16\cuda_x16_shabal512.cu">
<Filter>Source Files\CUDA\x16</Filter>
</CudaCompile>
<CudaCompile Include="x16\cuda_x16_simd512_80.cu">
<Filter>Source Files\CUDA\x16</Filter>
</CudaCompile>
<CudaCompile Include="x16\x16r.cu">
<Filter>Source Files\CUDA\x16</Filter>
</CudaCompile>
<CudaCompile Include="x16\x16s.cu">
<Filter>Source Files\CUDA\x16</Filter>
</CudaCompile>
<CudaCompile Include="gost\cuda_gosthash.cu"> <CudaCompile Include="gost\cuda_gosthash.cu">
<Filter>Source Files\CUDA\gost</Filter> <Filter>Source Files\CUDA\gost</Filter>
</CudaCompile> </CudaCompile>

2
compat/ccminer-config.h

@ -164,7 +164,7 @@
#define PACKAGE_URL "http://github.com/tpruvot/ccminer" #define PACKAGE_URL "http://github.com/tpruvot/ccminer"
/* Define to the version of this package. */ /* Define to the version of this package. */
#define PACKAGE_VERSION "2.2.4" #define PACKAGE_VERSION "2.2.5"
/* If using the C implementation of alloca, define if you know the /* If using the C implementation of alloca, define if you know the
direction of stack growth for your system; otherwise it will be direction of stack growth for your system; otherwise it will be

37
equi/cuda_equi.cu

@ -80,6 +80,8 @@ u32 umin(const u32, const u32);
u32 umax(const u32, const u32); u32 umax(const u32, const u32);
#endif #endif
#define OPT_SYNC_ALL
#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300 #if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
#define __shfl2(var, srcLane) __shfl_sync(0xFFFFFFFFu, var, srcLane) #define __shfl2(var, srcLane) __shfl_sync(0xFFFFFFFFu, var, srcLane)
#undef __any #undef __any
@ -514,10 +516,11 @@ __global__ void digit_1(equi<RB, SM>* eq)
u32 si[2]; u32 si[2];
#ifdef OPT_SYNC_ALL
// enable this to make fully safe shared mem operations; // enable this to make fully safe shared mem operations;
// disabled gains some speed, but can rarely cause a crash // disabled gains some speed, but can rarely cause a crash
//__syncthreads(); __syncthreads();
#endif
#pragma unroll #pragma unroll
for (u32 i = 0; i != 2; ++i) for (u32 i = 0; i != 2; ++i)
{ {
@ -654,11 +657,9 @@ __global__ void digit_2(equi<RB, SM>* eq)
uint4 tt[2]; uint4 tt[2];
u32 si[2]; u32 si[2];
#ifdef OPT_SYNC_ALL
// enable this to make fully safe shared mem operations; __syncthreads();
// disabled gains some speed, but can rarely cause a crash #endif
//__syncthreads();
#pragma unroll 2 #pragma unroll 2
for (u32 i = 0; i < 2; i++) for (u32 i = 0; i < 2; i++)
{ {
@ -785,9 +786,9 @@ __global__ void digit_3(equi<RB, SM>* eq)
uint4 tt[2]; uint4 tt[2];
u32 ta[2]; u32 ta[2];
// enable this to make fully safe shared mem operations; #ifdef OPT_SYNC_ALL
// disabled gains some speed, but can rarely cause a crash __syncthreads();
//__syncthreads(); #endif
#pragma unroll 2 #pragma unroll 2
for (u32 i = 0; i < 2; i++) for (u32 i = 0; i < 2; i++)
@ -919,11 +920,9 @@ __global__ void digit_4(equi<RB, SM>* eq)
u32 si[2]; u32 si[2];
uint4 tt[2]; uint4 tt[2];
#ifdef OPT_SYNC_ALL
// enable this to make fully safe shared mem operations; __syncthreads();
// disabled gains some speed, but can rarely cause a crash #endif
//__syncthreads();
#pragma unroll 2 #pragma unroll 2
for (u32 i = 0; i < 2; i++) for (u32 i = 0; i < 2; i++)
{ {
@ -1035,11 +1034,9 @@ __global__ void digit_5(equi<RB, SM>* eq)
u32 si[2]; u32 si[2];
uint4 tt[2]; uint4 tt[2];
#ifdef OPT_SYNC_ALL
// enable this to make fully safe shared mem operations; __syncthreads();
// disabled gains some speed, but can rarely cause a crash #endif
//__syncthreads();
#pragma unroll 2 #pragma unroll 2
for (u32 i = 0; i < 2; i++) for (u32 i = 0; i < 2; i++)
{ {

9
miner.h

@ -324,9 +324,12 @@ extern int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, uns
extern int scanhash_wildkeccak(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_wildkeccak(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_x11evo(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_x11evo(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_x12(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_x14(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_x14(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_x15(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_x15(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_zr5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_zr5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
@ -389,9 +392,12 @@ extern void free_whirl(int thr_id);
extern void free_wildkeccak(int thr_id); extern void free_wildkeccak(int thr_id);
extern void free_x11evo(int thr_id); extern void free_x11evo(int thr_id);
extern void free_x11(int thr_id); extern void free_x11(int thr_id);
extern void free_x12(int thr_id);
extern void free_x13(int thr_id); extern void free_x13(int thr_id);
extern void free_x14(int thr_id); extern void free_x14(int thr_id);
extern void free_x15(int thr_id); extern void free_x15(int thr_id);
extern void free_x16r(int thr_id);
extern void free_x16s(int thr_id);
extern void free_x17(int thr_id); extern void free_x17(int thr_id);
extern void free_zr5(int thr_id); extern void free_zr5(int thr_id);
//extern void free_sha256d(int thr_id); //extern void free_sha256d(int thr_id);
@ -934,9 +940,12 @@ void wcoinhash(void *state, const void *input);
void whirlxHash(void *state, const void *input); void whirlxHash(void *state, const void *input);
void x11evo_hash(void *output, const void *input); void x11evo_hash(void *output, const void *input);
void x11hash(void *output, const void *input); void x11hash(void *output, const void *input);
void x12hash(void *output, const void *input);
void x13hash(void *output, const void *input); void x13hash(void *output, const void *input);
void x14hash(void *output, const void *input); void x14hash(void *output, const void *input);
void x15hash(void *output, const void *input); void x15hash(void *output, const void *input);
void x16r_hash(void *output, const void *input);
void x16s_hash(void *output, const void *input);
void x17hash(void *output, const void *input); void x17hash(void *output, const void *input);
void wildkeccak_hash(void *output, const void *input, uint64_t* scratchpad, uint64_t ssize); void wildkeccak_hash(void *output, const void *input, uint64_t* scratchpad, uint64_t ssize);
void zr5hash(void *output, const void *input); void zr5hash(void *output, const void *input);

15
neoscrypt/cuda_neoscrypt.cu

@ -1319,7 +1319,6 @@ static void Blake2Shost(uint32_t * inout, const uint32_t * inkey)
} }
#define SHIFT 128U
#define TPB 32 #define TPB 32
#define TPB2 64 #define TPB2 64
@ -1346,7 +1345,7 @@ __launch_bounds__(TPB, 1)
void neoscrypt_gpu_hash_chacha1() void neoscrypt_gpu_hash_chacha1()
{ {
const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y); const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y);
const uint32_t shift = SHIFT * 8U * (thread & 8191); const uint32_t threads = (gridDim.x * blockDim.y);
const uint32_t shiftTr = 8U * thread; const uint32_t shiftTr = 8U * thread;
uint4 X[4]; uint4 X[4];
@ -1361,7 +1360,7 @@ void neoscrypt_gpu_hash_chacha1()
#pragma nounroll #pragma nounroll
for (int i = 0; i < 128; i++) for (int i = 0; i < 128; i++)
{ {
uint32_t offset = shift + i * 8U; uint32_t offset = 8U * (thread + threads * i);
for (int j = 0; j < 4; j++) for (int j = 0; j < 4; j++)
((uint4*)(W + offset))[j * 4 + threadIdx.x] = X[j]; ((uint4*)(W + offset))[j * 4 + threadIdx.x] = X[j];
neoscrypt_chacha(X); neoscrypt_chacha(X);
@ -1370,7 +1369,7 @@ void neoscrypt_gpu_hash_chacha1()
#pragma nounroll #pragma nounroll
for (int t = 0; t < 128; t++) for (int t = 0; t < 128; t++)
{ {
uint32_t offset = shift + (WarpShuffle(X[3].x, 0, 4) & 0x7F) * 8U; uint32_t offset = 8U * (thread + threads * (WarpShuffle(X[3].x, 0, 4) & 0x7F));
for (int j = 0; j < 4; j++) for (int j = 0; j < 4; j++)
X[j] ^= ((uint4*)(W + offset))[j * 4 + threadIdx.x]; X[j] ^= ((uint4*)(W + offset))[j * 4 + threadIdx.x];
neoscrypt_chacha(X); neoscrypt_chacha(X);
@ -1391,7 +1390,7 @@ __launch_bounds__(TPB, 1)
void neoscrypt_gpu_hash_salsa1() void neoscrypt_gpu_hash_salsa1()
{ {
const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y); const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y);
const uint32_t shift = SHIFT * 8U * (thread & 8191); const uint32_t threads = (gridDim.x * blockDim.y);
const uint32_t shiftTr = 8U * thread; const uint32_t shiftTr = 8U * thread;
uint4 Z[4]; uint4 Z[4];
@ -1406,7 +1405,7 @@ void neoscrypt_gpu_hash_salsa1()
#pragma nounroll #pragma nounroll
for (int i = 0; i < 128; i++) for (int i = 0; i < 128; i++)
{ {
uint32_t offset = shift + i * 8U; uint32_t offset = 8U * (thread + threads * i);
for (int j = 0; j < 4; j++) for (int j = 0; j < 4; j++)
((uint4*)(W + offset))[j * 4 + threadIdx.x] = Z[j]; ((uint4*)(W + offset))[j * 4 + threadIdx.x] = Z[j];
neoscrypt_salsa(Z); neoscrypt_salsa(Z);
@ -1415,7 +1414,7 @@ void neoscrypt_gpu_hash_salsa1()
#pragma nounroll #pragma nounroll
for (int t = 0; t < 128; t++) for (int t = 0; t < 128; t++)
{ {
uint32_t offset = shift + (WarpShuffle(Z[3].x, 0, 4) & 0x7F) * 8U; uint32_t offset = 8U * (thread + threads * (WarpShuffle(Z[3].x, 0, 4) & 0x7F));
for (int j = 0; j < 4; j++) for (int j = 0; j < 4; j++)
Z[j] ^= ((uint4*)(W + offset))[j * 4 + threadIdx.x]; Z[j] ^= ((uint4*)(W + offset))[j * 4 + threadIdx.x];
neoscrypt_salsa(Z); neoscrypt_salsa(Z);
@ -1474,7 +1473,7 @@ void neoscrypt_init(int thr_id, uint32_t threads)
cuda_get_arch(thr_id); cuda_get_arch(thr_id);
CUDA_SAFE_CALL(cudaMalloc(&d_NNonce[thr_id], 2 * sizeof(uint32_t))); CUDA_SAFE_CALL(cudaMalloc(&d_NNonce[thr_id], 2 * sizeof(uint32_t)));
CUDA_SAFE_CALL(cudaMalloc(&hash1, 32 * 128 * sizeof(uint64_t) * min(8192, threads))); CUDA_SAFE_CALL(cudaMalloc(&hash1, 32 * 128 * sizeof(uint64_t) * threads));
CUDA_SAFE_CALL(cudaMalloc(&Trans1, 32 * sizeof(uint64_t) * threads)); CUDA_SAFE_CALL(cudaMalloc(&Trans1, 32 * sizeof(uint64_t) * threads));
CUDA_SAFE_CALL(cudaMalloc(&Trans2, 32 * sizeof(uint64_t) * threads)); CUDA_SAFE_CALL(cudaMalloc(&Trans2, 32 * sizeof(uint64_t) * threads));
CUDA_SAFE_CALL(cudaMalloc(&Trans3, 32 * sizeof(uint64_t) * threads)); CUDA_SAFE_CALL(cudaMalloc(&Trans3, 32 * sizeof(uint64_t) * threads));

1
neoscrypt/neoscrypt.cpp

@ -22,6 +22,7 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign
int dev_id = device_map[thr_id]; int dev_id = device_map[thr_id];
int intensity = is_windows() ? 18 : 19; int intensity = is_windows() ? 18 : 19;
if (strstr(device_name[dev_id], "GTX 10")) intensity = 21; // >= 20 need more than 2GB if (strstr(device_name[dev_id], "GTX 10")) intensity = 21; // >= 20 need more than 2GB
if (strstr(device_name[dev_id], "TITAN")) intensity = 21;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
throughput = throughput / 32; /* set for max intensity ~= 20 */ throughput = throughput / 32; /* set for max intensity ~= 20 */

12
res/ccminer.rc

@ -13,7 +13,7 @@
#undef APSTUDIO_READONLY_SYMBOLS #undef APSTUDIO_READONLY_SYMBOLS
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////
// Anglais (États-Unis) resources // English (United States) resources
#if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_ENU) #if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_ENU)
LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US
@ -60,8 +60,8 @@ IDI_ICON1 ICON "ccminer.ico"
// //
VS_VERSION_INFO VERSIONINFO VS_VERSION_INFO VERSIONINFO
FILEVERSION 2,2,4,0 FILEVERSION 2,2,5,0
PRODUCTVERSION 2,2,4,0 PRODUCTVERSION 2,2,5,0
FILEFLAGSMASK 0x3fL FILEFLAGSMASK 0x3fL
#ifdef _DEBUG #ifdef _DEBUG
FILEFLAGS 0x21L FILEFLAGS 0x21L
@ -76,10 +76,10 @@ BEGIN
BEGIN BEGIN
BLOCK "040904e4" BLOCK "040904e4"
BEGIN BEGIN
VALUE "FileVersion", "2.2.4" VALUE "FileVersion", "2.2.5"
VALUE "LegalCopyright", "Copyright (C) 2018" VALUE "LegalCopyright", "Copyright (C) 2018"
VALUE "ProductName", "ccminer" VALUE "ProductName", "ccminer"
VALUE "ProductVersion", "2.2.4" VALUE "ProductVersion", "2.2.5"
END END
END END
BLOCK "VarFileInfo" BLOCK "VarFileInfo"
@ -88,7 +88,7 @@ BEGIN
END END
END END
#endif // Anglais (États-Unis) resources #endif // English (United States) resources
///////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////

3
scrypt/salsa_kernel.cu

@ -240,8 +240,9 @@ inline int _ConvertSMVer2Cores(int major, int minor)
{ 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class { 0x21, 48 }, // Fermi Generation (SM 2.1) GF10x class
{ 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class - GK104 = 1536 cores / 8 SMs { 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class - GK104 = 1536 cores / 8 SMs
{ 0x35, 192 }, // Kepler Generation (SM 3.5) GK11x class { 0x35, 192 }, // Kepler Generation (SM 3.5) GK11x class
{ 0x50, 128 }, // Maxwell Generation (SM 5.0) GTX750/750Ti { 0x50, 128 }, // Maxwell First Generation (SM 5.0) GTX750/750Ti
{ 0x52, 128 }, // Maxwell Second Generation (SM 5.2) GTX980 = 2048 cores / 16 SMs - GTX970 1664 cores / 13 SMs { 0x52, 128 }, // Maxwell Second Generation (SM 5.2) GTX980 = 2048 cores / 16 SMs - GTX970 1664 cores / 13 SMs
{ 0x61, 128 }, // Pascal GeForce (SM 6.1)
{ -1, -1 }, { -1, -1 },
}; };

25
util.cpp

@ -1354,7 +1354,11 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p
if (!res_val || json_is_false(res_val) || if (!res_val || json_is_false(res_val) ||
(err_val && !json_is_null(err_val))) { (err_val && !json_is_null(err_val))) {
applog(LOG_ERR, "Stratum authentication failed"); if (err_val && json_is_array(err_val)) {
const char* reason = json_string_value(json_array_get(err_val, 1));
applog(LOG_ERR, "Stratum authentication failed (%s)", reason);
}
else applog(LOG_ERR, "Stratum authentication failed");
goto out; goto out;
} }
@ -2313,19 +2317,28 @@ void print_hash_tests(void)
printpfx("x11evo", hash); printpfx("x11evo", hash);
x11hash(&hash[0], &buf[0]); x11hash(&hash[0], &buf[0]);
printpfx("X11", hash); printpfx("x11", hash);
x12hash(&hash[0], &buf[0]);
printpfx("x12", hash);
x13hash(&hash[0], &buf[0]); x13hash(&hash[0], &buf[0]);
printpfx("X13", hash); printpfx("x13", hash);
x14hash(&hash[0], &buf[0]); x14hash(&hash[0], &buf[0]);
printpfx("X14", hash); printpfx("x14", hash);
x15hash(&hash[0], &buf[0]); x15hash(&hash[0], &buf[0]);
printpfx("X15", hash); printpfx("x15", hash);
x16r_hash(&hash[0], &buf[0]);
printpfx("x16r", hash);
x16s_hash(&hash[0], &buf[0]);
printpfx("x16s", hash);
x17hash(&hash[0], &buf[0]); x17hash(&hash[0], &buf[0]);
printpfx("X17", hash); printpfx("x17", hash);
//memcpy(buf, zrtest, 80); //memcpy(buf, zrtest, 80);
zr5hash(&hash[0], &buf[0]); zr5hash(&hash[0], &buf[0]);

236
x12/x12.cu

@ -0,0 +1,236 @@
/*
* X12 algorithm
*/
extern "C" {
#include "sph/sph_blake.h"
#include "sph/sph_bmw.h"
#include "sph/sph_luffa.h"
#include "sph/sph_cubehash.h"
#include "sph/sph_shavite.h"
#include "sph/sph_simd.h"
#include "sph/sph_echo.h"
#include "sph/sph_groestl.h"
#include "sph/sph_skein.h"
#include "sph/sph_jh.h"
#include "sph/sph_keccak.h"
#include "sph/sph_hamsi.h"
}
#include "miner.h"
#include "cuda_helper.h"
#include "x11/cuda_x11.h"
static uint32_t *d_hash[MAX_GPUS];
extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads);
extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
// X12 CPU Hash
extern "C" void x12hash(void *output, const void *input)
{
sph_blake512_context ctx_blake;
sph_bmw512_context ctx_bmw;
sph_luffa512_context ctx_luffa;
sph_cubehash512_context ctx_cubehash;
sph_shavite512_context ctx_shavite;
sph_simd512_context ctx_simd;
sph_echo512_context ctx_echo;
sph_groestl512_context ctx_groestl;
sph_skein512_context ctx_skein;
sph_jh512_context ctx_jh;
sph_keccak512_context ctx_keccak;
sph_hamsi512_context ctx_hamsi;
uint32_t hash[32];
memset(hash, 0, sizeof hash);
sph_blake512_init(&ctx_blake);
sph_blake512 (&ctx_blake, input, 80);
sph_blake512_close(&ctx_blake, (void*) hash);
sph_bmw512_init(&ctx_bmw);
sph_bmw512(&ctx_bmw, (const void*) hash, 64);
sph_bmw512_close(&ctx_bmw, (void*) hash);
sph_luffa512_init(&ctx_luffa);
sph_luffa512(&ctx_luffa, (const void*)hash, 64);
sph_luffa512_close(&ctx_luffa, (void*)hash);
sph_cubehash512_init(&ctx_cubehash);
sph_cubehash512(&ctx_cubehash, (const void*)hash, 64);
sph_cubehash512_close(&ctx_cubehash, (void*)hash);
sph_shavite512_init(&ctx_shavite);
sph_shavite512(&ctx_shavite, (const void*)hash, 64);
sph_shavite512_close(&ctx_shavite, (void*)hash);
sph_simd512_init(&ctx_simd);
sph_simd512(&ctx_simd, (const void*)hash, 64);
sph_simd512_close(&ctx_simd, (void*)hash);
sph_echo512_init(&ctx_echo);
sph_echo512(&ctx_echo, (const void*)hash, 64);
sph_echo512_close(&ctx_echo, (void*)hash);
sph_groestl512_init(&ctx_groestl);
sph_groestl512(&ctx_groestl, (const void*) hash, 64);
sph_groestl512_close(&ctx_groestl, (void*) hash);
sph_skein512_init(&ctx_skein);
sph_skein512(&ctx_skein, (const void*) hash, 64);
sph_skein512_close(&ctx_skein, (void*) hash);
sph_jh512_init(&ctx_jh);
sph_jh512(&ctx_jh, (const void*) hash, 64);
sph_jh512_close(&ctx_jh, (void*) hash);
sph_keccak512_init(&ctx_keccak);
sph_keccak512(&ctx_keccak, (const void*) hash, 64);
sph_keccak512_close(&ctx_keccak, (void*) hash);
sph_hamsi512_init(&ctx_hamsi);
sph_hamsi512(&ctx_hamsi, (const void*) hash, 64);
sph_hamsi512_close(&ctx_hamsi, (void*) hash);
memcpy(output, hash, 32);
}
static bool init[MAX_GPUS] = { 0 };
extern "C" int scanhash_x12(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
int intensity = (device_sm[device_map[thr_id]] > 500 && !is_windows()) ? 20 : 19;
uint32_t throughput = cuda_default_throughput(thr_id, 1 << intensity); // 19=256*256*8;
//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x000f;
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
CUDA_LOG_ERROR();
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
quark_blake512_cpu_init(thr_id, throughput);
x11_luffaCubehash512_cpu_init(thr_id, throughput);
x11_shavite512_cpu_init(thr_id, throughput);
if (x11_simd512_cpu_init(thr_id, throughput) != 0) {
return 0;
}
x11_echo512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);
quark_bmw512_cpu_init(thr_id, throughput);
quark_keccak512_cpu_init(thr_id, throughput);
quark_jh512_cpu_init(thr_id, throughput);
x13_hamsi512_cpu_init(thr_id, throughput);
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0);
cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true;
}
uint32_t endiandata[20];
for (int k=0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
quark_blake512_cpu_setBlock_80(thr_id, endiandata);
cuda_check_cpu_setTarget(ptarget);
do {
int order = 0;
quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
*hashes_done = pdata[19] - first_nonce + throughput;
CUDA_LOG_ERROR();
work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (work->nonces[0] != UINT32_MAX)
{
const uint32_t Htarg = ptarget[7];
uint32_t _ALIGN(64) vhash[8];
be32enc(&endiandata[19], work->nonces[0]);
x12hash(vhash, endiandata);
if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
work->valid_nonces = 1;
work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
work_set_target_ratio(work, vhash);
if (work->nonces[1] != 0) {
be32enc(&endiandata[19], work->nonces[1]);
x12hash(vhash, endiandata);
bn_set_target_ratio(work, vhash, 1);
work->valid_nonces++;
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
} else {
pdata[19] = work->nonces[0] + 1; // cursor
}
return work->valid_nonces;
}
else if (vhash[7] > Htarg) {
gpu_increment_reject(thr_id);
if (!opt_quiet)
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
pdata[19] = work->nonces[0] + 1;
continue;
}
}
if ((uint64_t)throughput + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}
pdata[19] += throughput;
} while (!work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce;
CUDA_LOG_ERROR();
return 0;
}
// cleanup
extern "C" void free_x12(int thr_id)
{
if (!init[thr_id])
return;
cudaThreadSynchronize();
cudaFree(d_hash[thr_id]);
quark_blake512_cpu_free(thr_id);
quark_groestl512_cpu_free(thr_id);
x11_simd512_cpu_free(thr_id);
cuda_check_cpu_free(thr_id);
CUDA_LOG_ERROR();
cudaDeviceSynchronize();
init[thr_id] = false;
}

715
x13/cuda_x13_hamsi512.cu

@ -1,6 +1,6 @@
/* /*
* Quick Hamsi-512 for X13 * Quick Hamsi-512 for X13 by tsiv - 2014
* by tsiv - 2014 * + Hamsi-512 80 by tpruvot - 2018
*/ */
#include <stdio.h> #include <stdio.h>
@ -16,31 +16,17 @@ static __constant__ uint32_t d_alpha_f[32];
static __constant__ uint32_t d_T512[64][16]; static __constant__ uint32_t d_T512[64][16];
static const uint32_t alpha_n[] = { static const uint32_t alpha_n[] = {
SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc), 0xff00f0f0, 0xccccaaaa, 0xf0f0cccc, 0xff00aaaa, 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, 0xf0f0ff00,
SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), 0xf0f0cccc, 0xaaaaff00, 0xccccff00, 0xaaaaf0f0, 0xaaaaf0f0, 0xff00cccc, 0xccccf0f0, 0xff00aaaa,
SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc), 0xccccaaaa, 0xff00f0f0, 0xff00aaaa, 0xf0f0cccc, 0xf0f0ff00, 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc,
SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0), 0xaaaaff00, 0xf0f0cccc, 0xaaaaf0f0, 0xccccff00, 0xff00cccc, 0xaaaaf0f0, 0xff00aaaa, 0xccccf0f0
SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0),
SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0),
SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00),
SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc),
SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0),
SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0),
SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0)
}; };
static const uint32_t alpha_f[] = { static const uint32_t alpha_f[] = {
SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0), 0xcaf9639c, 0x0ff0f9c0, 0x639c0ff0, 0xcaf9f9c0, 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0, 0x639ccaf9,
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), 0x639c0ff0, 0xf9c0caf9, 0x0ff0caf9, 0xf9c0639c, 0xf9c0639c, 0xcaf90ff0, 0x0ff0639c, 0xcaf9f9c0,
SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0), 0x0ff0f9c0, 0xcaf9639c, 0xcaf9f9c0, 0x639c0ff0, 0x639ccaf9, 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0,
SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c), 0xf9c0caf9, 0x639c0ff0, 0xf9c0639c, 0x0ff0caf9, 0xcaf90ff0, 0xf9c0639c, 0xcaf9f9c0, 0x0ff0639c
SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c),
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c),
SPH_C32(0xcaf9f9c0), SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9),
SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0),
SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c),
SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c),
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c)
}; };
#define hamsi_s00 m0 #define hamsi_s00 m0
@ -200,390 +186,134 @@ static const uint32_t alpha_f[] = {
static const uint32_t T512[64][16] = { static const uint32_t T512[64][16] = {
{ SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000), { 0xef0b0270, 0x3afd0000, 0x5dae0000, 0x69490000, 0x9b0f3c06, 0x4405b5f9, 0x66140a51, 0x924f5d0a, // 0
SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9), 0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000, 0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68 },
SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030), { 0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000, 0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68,
SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000), 0x26600240, 0xddd80000, 0x722a0000, 0x4f060000, 0x936667ff, 0x29f944ce, 0x368b63d5, 0x0c26f262 },
SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984), { 0x145a3c00, 0xb9e90000, 0x61270000, 0xf1610000, 0xce613d6c, 0xb0493d78, 0x47a96720, 0xe18e24c5,
SPH_C32(0x9e69af68) }, 0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000, 0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f },
{ SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000), { 0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000, 0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f,
SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), 0x373d2800, 0x71500000, 0x95e00000, 0x0a140000, 0xbdac1909, 0x48ef9831, 0x456d6d1f, 0x3daac2da },
SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240), { 0x54285c00, 0xeaed0000, 0xc5d60000, 0xa1c50000, 0xb3a26770, 0x94a5c4e1, 0x6bb0419d, 0x551b3782,
SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000), 0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000, 0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29 },
SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5), { 0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000, 0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29,
SPH_C32(0x0c26f262) }, 0xc8934400, 0x5a3e0000, 0x57870000, 0x4c560000, 0xea982435, 0x75b11115, 0x28b67247, 0x2dd1f9ab },
{ SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000), { 0x29449c00, 0x64e70000, 0xf24b0000, 0xc2f30000, 0x0ede4e8f, 0x56c23745, 0xf3e04259, 0x8d0d9ec4,
SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78), 0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000, 0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2 },
SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400), { 0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000, 0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2, // 7
SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000), 0x6f299000, 0x6c850000, 0x2f160000, 0x782e0000, 0x644c37cd, 0x12dd1cd6, 0xd26a8c36, 0x32219526 },
SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f), { 0xf6800005, 0x3443c000, 0x24070000, 0x8f3d0000, 0x21373bfb, 0x0ab8d5ae, 0xcdc58b19, 0xd795ba31,
SPH_C32(0xdc24e61f) }, 0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000, 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88 },
{ SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), { 0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000, 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88,
SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), 0x50ff0004, 0x45744000, 0x3dfb0000, 0x19e60000, 0x1bbc5606, 0xe1727b5d, 0xe1a8cc96, 0x7b1bd6b9 },
SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800), { 0xf7750009, 0xcf3cc000, 0xc3d60000, 0x04920000, 0x029519a9, 0xf8e836ba, 0x7a87f14e, 0x9e16981a,
SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000), 0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000, 0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320 },
SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f), { 0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000, 0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320,
SPH_C32(0x3daac2da) }, 0x231f0009, 0x42f40000, 0x66790000, 0x4ebb0000, 0xfedb5bd3, 0x315cb0d6, 0xe2b1674a, 0x69505b3a },
{ SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000), { 0x774400f0, 0xf15a0000, 0xf5b20000, 0x34140000, 0x89377e8c, 0x5a8bec25, 0x0bc3cd1e, 0xcf3775cb,
SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1), 0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000, 0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574 },
SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800), { 0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000, 0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574,
SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000), 0x832800a0, 0x67420000, 0xe1170000, 0x370b0000, 0xcba30034, 0x3c34923c, 0x9767bdcc, 0x450360bf },
SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da), { 0xe8870170, 0x9d720000, 0x12db0000, 0xd4220000, 0xf2886b27, 0xa921e543, 0x4ef8b518, 0x618813b1, // 14
SPH_C32(0x78cace29) }, 0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000, 0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758 },
{ SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000), { 0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000, 0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758,
SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), 0x5cb00110, 0x913e0000, 0x44190000, 0x888c0000, 0x66dc7418, 0x921f1d66, 0x55ceea25, 0x925c44e9 },
SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400), { 0x0c720000, 0x49e50f00, 0x42790000, 0x5cea0000, 0x33aa301a, 0x15822514, 0x95a34b7b, 0xb44b0090,
SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000), 0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000, 0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f },
SPH_C32(0xea982435), SPH_C32(0x75b11115), SPH_C32(0x28b67247), { 0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000, 0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f,
SPH_C32(0x2dd1f9ab) }, 0xf2500000, 0xeebd0a00, 0x67a80000, 0xab8a0000, 0xba9b48c0, 0x0a56dd74, 0xdb73e86e, 0x1568ff0f },
{ SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000), { 0x45180000, 0xa5b51700, 0xf96a0000, 0x3b480000, 0x1ecc142c, 0x231395d6, 0x16bca6b0, 0xdf33f4df,
SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745), 0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000, 0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e },
SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00), { 0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000, 0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e,
SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000), 0xfd250000, 0xb3c41100, 0xcef00000, 0xcef90000, 0x3c4d7580, 0x8d5b6493, 0x7098b0a6, 0x1af21fe1 },
SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f), { 0x75a40000, 0xc28b2700, 0x94a40000, 0x90f50000, 0xfb7857e0, 0x49ce0bae, 0x1767c483, 0xaedf667e,
SPH_C32(0xbf2c0be2) }, 0xd1660000, 0x1bbc0300, 0x9eec0000, 0xf6940000, 0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b },
{ SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000), { 0xd1660000, 0x1bbc0300, 0x9eec0000, 0xf6940000, 0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b, // 21
SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93), 0xa4c20000, 0xd9372400, 0x0a480000, 0x66610000, 0xf87a12c7, 0x86bef75c, 0xa324df94, 0x2ba05a55 },
SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000), { 0x75c90003, 0x0e10c000, 0xd1200000, 0xbaea0000, 0x8bc42f3e, 0x8758b757, 0xbb28761d, 0x00b72e2b,
SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000), 0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000, 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254 },
SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36), { 0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000, 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254,
SPH_C32(0x32219526) }, 0x9b060002, 0x61468000, 0x221e0000, 0x1d740000, 0x36715d27, 0x30495c92, 0xf11336a7, 0xfe1cdc7f },
{ SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000), { 0x86790000, 0x3f390002, 0xe19ae000, 0x98560000, 0x9565670e, 0x4e88c8ea, 0xd3dd4944, 0x161ddab9,
SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae), 0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000, 0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834 },
SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001), { 0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000, 0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834,
SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000), 0xb6ce0000, 0xdae90002, 0x156e8000, 0xda920000, 0xf6dd5a64, 0x36325c8a, 0xf272e8ae, 0xa6b8c28d },
SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f), { 0x14190000, 0x23ca003c, 0x50df0000, 0x44b60000, 0x1b6c67b0, 0x3cf3ac75, 0x61e610b0, 0xdbcadb80,
SPH_C32(0xac8e6c88) }, 0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000, 0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7 },
{ SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000), { 0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000, 0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7,
SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), 0xf75a0000, 0x19840028, 0xa2190000, 0xeef80000, 0xc0722516, 0x19981260, 0x73dba1e6, 0xe1844257 },
SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004), { 0x54500000, 0x0671005c, 0x25ae0000, 0x6a1e0000, 0x2ea54edf, 0x664e8512, 0xbfba18c3, 0x7e715d17, // 28
SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000), 0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000, 0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e },
SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96), { 0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000, 0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e,
SPH_C32(0x7b1bd6b9) }, 0xe8dd0000, 0xfa4a0044, 0x3c2d0000, 0xbb150000, 0x80bd361b, 0x24e81d44, 0xbfa8c2f4, 0x524a0d59 },
{ SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000), { 0x69510000, 0xd4e1009c, 0xc3230000, 0xac2f0000, 0xe4950bae, 0xcea415dc, 0x87ec287c, 0xbce1a3ce,
SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba), 0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000, 0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173 },
SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000), { 0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000, 0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173,
SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000), 0xaf220000, 0x7b6c0090, 0x67e20000, 0x8da20000, 0xc7841e29, 0xb7b744f3, 0x9ac484f4, 0x8b6c72bd },
SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604), { 0xcc140000, 0xa5630000, 0x5ab90780, 0x3b500000, 0x4bd013ff, 0x879b3418, 0x694348c1, 0xca5a87fe,
SPH_C32(0xf746c320) }, 0x819e0000, 0xec570000, 0x66320280, 0x95f30000, 0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa },
{ SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), { 0x819e0000, 0xec570000, 0x66320280, 0x95f30000, 0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa,
SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), 0x4d8a0000, 0x49340000, 0x3c8b0500, 0xaea30000, 0x16793bfd, 0xcf6f08a4, 0x8f19eaec, 0x443d3004 },
SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009), { 0x78230000, 0x12fc0000, 0xa93a0b80, 0x90a50000, 0x713e2879, 0x7ee98924, 0xf08ca062, 0x636f8bab,
SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000), 0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000, 0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b },
SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a), { 0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000, 0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b, // 35
SPH_C32(0x69505b3a) }, 0x7a8c0000, 0xa5d40000, 0x13260880, 0xc63d0000, 0xcbb36daa, 0xfea14f43, 0x59d0b4f8, 0x979961d0 },
{ SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000), { 0xac480000, 0x1ba60000, 0x45fb1380, 0x03430000, 0x5a85316a, 0x1fb250b6, 0xfe72c7fe, 0x91e478f6,
SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25), 0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000, 0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e },
SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050), { 0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000, 0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e,
SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000), 0xb2060000, 0xc5690000, 0x28031200, 0x74670000, 0xb6c236f4, 0xeb1239f8, 0x33d1dfec, 0x094e3198 },
SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2), { 0xaec30000, 0x9c4f0001, 0x79d1e000, 0x2c150000, 0x45cc75b3, 0x6650b736, 0xab92f78f, 0xa312567b,
SPH_C32(0x8a341574) }, 0xdb250000, 0x09290000, 0x49aac000, 0x81e10000, 0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e },
{ SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000), { 0xdb250000, 0x09290000, 0x49aac000, 0x81e10000, 0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e,
SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), 0x75e60000, 0x95660001, 0x307b2000, 0xadf40000, 0x8f321eea, 0x24298307, 0xe8c49cf9, 0x4b7eec55 },
SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0), { 0x58430000, 0x807e0000, 0x78330001, 0xc66b3800, 0xe7375cdc, 0x79ad3fdd, 0xac73fe6f, 0x3a4479b1,
SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000), 0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800, 0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6 },
SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc), { 0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800, 0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6,
SPH_C32(0x450360bf) }, 0x45190000, 0xab0c0000, 0x30be0001, 0x690a2000, 0xc2fc7219, 0xb1d4800d, 0x2dd1fa46, 0x24314f17 },
{ SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000), { 0xa53b0000, 0x14260000, 0x4e30001e, 0x7cae0000, 0x8f9e0dd5, 0x78dfaa3d, 0xf73168d8, 0x0b1b4946, // 42
SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543), 0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000, 0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce },
SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060), { 0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000, 0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce,
SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000), 0xa2d60000, 0xa6760000, 0xc9440014, 0xeba30000, 0xccec2e7b, 0x3018c499, 0x03490afa, 0x9b6ef888 },
SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d), { 0x88980000, 0x1f940000, 0x7fcf002e, 0xfb4e0000, 0xf158079a, 0x61ae9167, 0xa895706c, 0xe6107494,
SPH_C32(0xf3d45758) }, 0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000, 0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463 },
{ SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), { 0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000, 0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463,
SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), 0x835a0000, 0xc4f70000, 0x01470022, 0xeec80000, 0x60a54f69, 0x142f2a24, 0x5cf534f2, 0x3ea660f7 },
SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110), { 0x52500000, 0x29540000, 0x6a61004e, 0xf0ff0000, 0x9a317eec, 0x452341ce, 0xcf568fe5, 0x5303130f,
SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000), 0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000, 0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691 },
SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25), { 0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000, 0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691,
SPH_C32(0x925c44e9) }, 0x01dd0000, 0x80a80000, 0xf4960048, 0xa6000000, 0x90d57ea2, 0xd7e68c37, 0x6612cffd, 0x2c94459e },
{ SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000), { 0xe6280000, 0x4c4b0000, 0xa8550000, 0xd3d002e0, 0xd86130b8, 0x98a7b0da, 0x289506b4, 0xd75a4897,
SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514), 0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0, 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f },
SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000), { 0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0, 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f, // 49
SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000), 0x16ed0000, 0x15680000, 0xedd70000, 0x325d0220, 0xe30c3689, 0x5a4ae643, 0xe375f8a8, 0x81fdf908 },
SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315), { 0xb4310000, 0x77330000, 0xb15d0000, 0x7fd004e0, 0x78a26138, 0xd116c35d, 0xd256d489, 0x4e6f74de,
SPH_C32(0xa123ff9f) }, 0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060, 0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539 },
{ SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000), { 0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060, 0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539,
SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860), 0x57370000, 0xcaf20000, 0x364e0000, 0xc0220480, 0x56186b22, 0x5ca3f40c, 0xa1937f8f, 0x15b961e7 },
SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000), { 0x02f20000, 0xa2810000, 0x873f0000, 0xe36c7800, 0x1e1d74ef, 0x073d2bd6, 0xc4c23237, 0x7f32259e,
SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000), 0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800, 0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0 },
SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e), { 0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800, 0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0,
SPH_C32(0x1568ff0f) }, 0xb82f0000, 0xb12c0000, 0x30d80000, 0x14445000, 0xc15860a2, 0x3127e8ec, 0x2e98bf23, 0x551e3d6e },
{ SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000), { 0x1e6c0000, 0xc4420000, 0x8a2e0000, 0xbcb6b800, 0x2c4413b6, 0x8bfdd3da, 0x6a0c1bc8, 0xb99dc2eb,
SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6), 0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000, 0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f },
SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000), { 0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000, 0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f,
SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000), 0x8c3a0000, 0xda980000, 0x607f0000, 0x54078800, 0x85714513, 0x6006b243, 0xdb50399c, 0x8a58e6a4 },
SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616), { 0x033d0000, 0x08b30000, 0xf33a0000, 0x3ac20007, 0x51298a50, 0x6b6e661f, 0x0ea5cfe3, 0xe6da7ffe, // 56
SPH_C32(0xc5c1eb3e) }, 0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002, 0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000 },
{ SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000), { 0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002, 0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000,
SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145), 0xabe70000, 0x9e0d0000, 0xaf270000, 0x3d180005, 0x2c4f1fd3, 0x74f61695, 0xb5c347eb, 0x3c5dfffe },
SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000), { 0x01930000, 0xe7820000, 0xedfb0000, 0xcf0c000b, 0x8dd08d58, 0xbca3b42e, 0x063661e1, 0x536f9e7b,
SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000), 0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003, 0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7 },
SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6), { 0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003, 0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7,
SPH_C32(0x1af21fe1) }, 0x93bb0000, 0x3b070000, 0xba010000, 0x99d00008, 0x3739ae4e, 0xe64c1722, 0x96f896b3, 0x2879ebac },
{ SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000), { 0x5fa80000, 0x56030000, 0x43ae0000, 0x64f30013, 0x257e86bf, 0x1311944e, 0x541e95bf, 0x8ea4db69,
SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae), 0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001, 0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e },
SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000), { 0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001, 0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e,
SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000), 0x5fec0000, 0x294b0000, 0x99d20000, 0x4ed00012, 0x1ed34f73, 0xbaa708c9, 0x57140bdf, 0x30aebcf7 },
SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17), { 0xee930000, 0xd6070000, 0x92c10000, 0x2b9801e0, 0x9451287c, 0x3b6cfb57, 0x45312374, 0x201f6a64,
SPH_C32(0x857f3c2b) }, 0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0, 0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0 },
{ SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), { 0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0, 0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0,
SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), 0x95bb0000, 0x81450000, 0x3b240000, 0x48db0140, 0x0a8a6c53, 0x56f56eec, 0x62c91877, 0xe7e00a94 }
SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000),
SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000),
SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94),
SPH_C32(0x2ba05a55) },
{ SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000),
SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757),
SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001),
SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000),
SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba),
SPH_C32(0xfeabf254) },
{ SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000),
SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5),
SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002),
SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000),
SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7),
SPH_C32(0xfe1cdc7f) },
{ SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000),
SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea),
SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000),
SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000),
SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea),
SPH_C32(0xb0a51834) },
{ SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000),
SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460),
SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000),
SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000),
SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae),
SPH_C32(0xa6b8c28d) },
{ SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000),
SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75),
SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000),
SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000),
SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156),
SPH_C32(0x3a4e99d7) },
{ SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000),
SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15),
SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000),
SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000),
SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6),
SPH_C32(0xe1844257) },
{ SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000),
SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512),
SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000),
SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000),
SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37),
SPH_C32(0x2c3b504e) },
{ SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000),
SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856),
SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000),
SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000),
SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4),
SPH_C32(0x524a0d59) },
{ SPH_C32(0x69510000), SPH_C32(0xd4e1009c), SPH_C32(0xc3230000),
SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc),
SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000),
SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000),
SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88),
SPH_C32(0x378dd173) },
{ SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000),
SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f),
SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000),
SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000),
SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4),
SPH_C32(0x8b6c72bd) },
{ SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780),
SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418),
SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000),
SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000),
SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d),
SPH_C32(0x8e67b7fa) },
{ SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280),
SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc),
SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000),
SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000),
SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec),
SPH_C32(0x443d3004) },
{ SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80),
SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924),
SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000),
SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000),
SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a),
SPH_C32(0xf4f6ea7b) },
{ SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300),
SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667),
SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000),
SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000),
SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8),
SPH_C32(0x979961d0) },
{ SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380),
SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6),
SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000),
SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000),
SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812),
SPH_C32(0x98aa496e) },
{ SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180),
SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e),
SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000),
SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000),
SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec),
SPH_C32(0x094e3198) },
{ SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000),
SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736),
SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000),
SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000),
SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76),
SPH_C32(0xe86cba2e) },
{ SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000),
SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431),
SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000),
SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000),
SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9),
SPH_C32(0x4b7eec55) },
{ SPH_C32(0x58430000), SPH_C32(0x807e0000), SPH_C32(0x78330001),
SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd),
SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000),
SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800),
SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429),
SPH_C32(0x1e7536a6) },
{ SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000),
SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0),
SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000),
SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000),
SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46),
SPH_C32(0x24314f17) },
{ SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e),
SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d),
SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000),
SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000),
SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222),
SPH_C32(0x9075b1ce) },
{ SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a),
SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4),
SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000),
SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000),
SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa),
SPH_C32(0x9b6ef888) },
{ SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e),
SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167),
SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000),
SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000),
SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e),
SPH_C32(0xd8b61463) },
{ SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c),
SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43),
SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000),
SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000),
SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2),
SPH_C32(0x3ea660f7) },
{ SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e),
SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce),
SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000),
SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000),
SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018),
SPH_C32(0x7f975691) },
{ SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006),
SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9),
SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000),
SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000),
SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd),
SPH_C32(0x2c94459e) },
{ SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000),
SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da),
SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000),
SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0),
SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c),
SPH_C32(0x56a7b19f) },
{ SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000),
SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699),
SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000),
SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220),
SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8),
SPH_C32(0x81fdf908) },
{ SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000),
SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d),
SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000),
SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060),
SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06),
SPH_C32(0x5bd61539) },
{ SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000),
SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751),
SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000),
SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480),
SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f),
SPH_C32(0x15b961e7) },
{ SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000),
SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6),
SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000),
SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800),
SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14),
SPH_C32(0x2a2c18f0) },
{ SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000),
SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a),
SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000),
SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000),
SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23),
SPH_C32(0x551e3d6e) },
{ SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000),
SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da),
SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000),
SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000),
SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254),
SPH_C32(0x33c5244f) },
{ SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000),
SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199),
SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000),
SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800),
SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c),
SPH_C32(0x8a58e6a4) },
{ SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000),
SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f),
SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000),
SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002),
SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808),
SPH_C32(0xda878000) },
{ SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000),
SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a),
SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000),
SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005),
SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb),
SPH_C32(0x3c5dfffe) },
{ SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000),
SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e),
SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000),
SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003),
SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752),
SPH_C32(0x7b1675d7) },
{ SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000),
SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c),
SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000),
SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008),
SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3),
SPH_C32(0x2879ebac) },
{ SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000),
SPH_C32(0x64f30013), SPH_C32(0x257e86bf), SPH_C32(0x1311944e),
SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000),
SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001),
SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60),
SPH_C32(0xbe0a679e) },
{ SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000),
SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87),
SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000),
SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012),
SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf),
SPH_C32(0x30aebcf7) },
{ SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000),
SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57),
SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000),
SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0),
SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03),
SPH_C32(0xc7ff60f0) },
{ SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000),
SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb),
SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000),
SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140),
SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877),
SPH_C32(0xe7e00a94) }
}; };
__global__ __global__
@ -598,12 +328,12 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *
uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3]; uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
unsigned char *h1 = (unsigned char *)Hash; unsigned char *h1 = (unsigned char *)Hash;
uint32_t c0 = SPH_C32(0x73746565), c1 = SPH_C32(0x6c706172), c2 = SPH_C32(0x6b204172), c3 = SPH_C32(0x656e6265); uint32_t c0 = 0x73746565, c1 = 0x6c706172, c2 = 0x6b204172, c3 = 0x656e6265;
uint32_t c4 = SPH_C32(0x72672031), c5 = SPH_C32(0x302c2062), c6 = SPH_C32(0x75732032), c7 = SPH_C32(0x3434362c); uint32_t c4 = 0x72672031, c5 = 0x302c2062, c6 = 0x75732032, c7 = 0x3434362c;
uint32_t c8 = SPH_C32(0x20422d33), c9 = SPH_C32(0x30303120), cA = SPH_C32(0x4c657576), cB = SPH_C32(0x656e2d48); uint32_t c8 = 0x20422d33, c9 = 0x30303120, cA = 0x4c657576, cB = 0x656e2d48;
uint32_t cC = SPH_C32(0x65766572), cD = SPH_C32(0x6c65652c), cE = SPH_C32(0x2042656c), cF = SPH_C32(0x6769756d); uint32_t cC = 0x65766572, cD = 0x6c65652c, cE = 0x2042656c, cF = 0x6769756d;
uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, mD, mE, mF;
uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF }; uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, mD, mE, mF;
uint32_t *tp, db, dm; uint32_t *tp, db, dm;
for(int i = 0; i < 64; i += 8) { for(int i = 0; i < 64; i += 8) {
@ -637,16 +367,16 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *
T_BIG; T_BIG;
} }
// precomputed for 64 bytes blocks ?
tp = &d_T512[0][0] + 112; tp = &d_T512[0][0] + 112;
m0 = tp[ 0]; m1 = tp[ 1];
m0 = *(tp+ 0); m1 = *(tp+ 1); m2 = tp[ 2]; m3 = tp[ 3];
m2 = *(tp+ 2); m3 = *(tp+ 3); m4 = tp[ 4]; m5 = tp[ 5];
m4 = *(tp+ 4); m5 = *(tp+ 5); m6 = tp[ 6]; m7 = tp[ 7];
m6 = *(tp+ 6); m7 = *(tp+ 7); m8 = tp[ 8]; m9 = tp[ 9];
m8 = *(tp+ 8); m9 = *(tp+ 9); mA = tp[10]; mB = tp[11];
mA = *(tp+10); mB = *(tp+11); mC = tp[12]; mD = tp[13];
mC = *(tp+12); mD = *(tp+13); mE = tp[14]; mF = tp[15];
mE = *(tp+14); mF = *(tp+15);
for( int r = 0; r < 6; r += 2 ) { for( int r = 0; r < 6; r += 2 ) {
ROUND_BIG(r, d_alpha_n); ROUND_BIG(r, d_alpha_n);
@ -655,15 +385,14 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *
T_BIG; T_BIG;
tp = &d_T512[0][0] + 784; tp = &d_T512[0][0] + 784;
m0 = tp[ 0]; m1 = tp[ 1];
m0 = *(tp+ 0); m1 = *(tp+ 1); m2 = tp[ 2]; m3 = tp[ 3];
m2 = *(tp+ 2); m3 = *(tp+ 3); m4 = tp[ 4]; m5 = tp[ 5];
m4 = *(tp+ 4); m5 = *(tp+ 5); m6 = tp[ 6]; m7 = tp[ 7];
m6 = *(tp+ 6); m7 = *(tp+ 7); m8 = tp[ 8]; m9 = tp[ 9];
m8 = *(tp+ 8); m9 = *(tp+ 9); mA = tp[10]; mB = tp[11];
mA = *(tp+10); mB = *(tp+11); mC = tp[12]; mD = tp[13];
mC = *(tp+12); mD = *(tp+13); mE = tp[14]; mF = tp[15];
mE = *(tp+14); mF = *(tp+15);
for( int r = 0; r < 12; r += 2 ) { for( int r = 0; r < 12; r += 2 ) {
ROUND_BIG(r, d_alpha_f); ROUND_BIG(r, d_alpha_f);
@ -696,3 +425,127 @@ void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce
x13_hamsi512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector); x13_hamsi512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
//MyStreamSynchronize(NULL, order, thr_id); //MyStreamSynchronize(NULL, order, thr_id);
} }
__constant__ static uint64_t c_PaddedMessage80[10];
__host__
void x16_hamsi512_setBlock_80(void *pdata)
{
cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
}
__global__
void x16_hamsi512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
unsigned char h1[80];
#pragma unroll
for (int i = 0; i < 10; i++)
((uint2*)h1)[i] = ((uint2*)c_PaddedMessage80)[i];
//((uint64_t*)h1)[9] = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(startNonce + thread));
((uint32_t*)h1)[19] = cuda_swab32(startNonce + thread);
uint32_t c0 = 0x73746565, c1 = 0x6c706172, c2 = 0x6b204172, c3 = 0x656e6265;
uint32_t c4 = 0x72672031, c5 = 0x302c2062, c6 = 0x75732032, c7 = 0x3434362c;
uint32_t c8 = 0x20422d33, c9 = 0x30303120, cA = 0x4c657576, cB = 0x656e2d48;
uint32_t cC = 0x65766572, cD = 0x6c65652c, cE = 0x2042656c, cF = 0x6769756d;
uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, mD, mE, mF;
uint32_t *tp, db, dm;
for(int i = 0; i < 80; i += 8)
{
m0 = 0; m1 = 0; m2 = 0; m3 = 0; m4 = 0; m5 = 0; m6 = 0; m7 = 0;
m8 = 0; m9 = 0; mA = 0; mB = 0; mC = 0; mD = 0; mE = 0; mF = 0;
tp = &d_T512[0][0];
#pragma unroll
for (int u = 0; u < 8; u++) {
db = h1[i + u];
#pragma unroll 2
for (int v = 0; v < 8; v++, db >>= 1) {
dm = -(uint32_t)(db & 1);
m0 ^= dm & tp[ 0]; m1 ^= dm & tp[ 1];
m2 ^= dm & tp[ 2]; m3 ^= dm & tp[ 3];
m4 ^= dm & tp[ 4]; m5 ^= dm & tp[ 5];
m6 ^= dm & tp[ 6]; m7 ^= dm & tp[ 7];
m8 ^= dm & tp[ 8]; m9 ^= dm & tp[ 9];
mA ^= dm & tp[10]; mB ^= dm & tp[11];
mC ^= dm & tp[12]; mD ^= dm & tp[13];
mE ^= dm & tp[14]; mF ^= dm & tp[15];
tp += 16;
}
}
#pragma unroll
for (int r = 0; r < 6; r++) {
ROUND_BIG(r, d_alpha_n);
}
T_BIG;
}
#define INPUT_BIG { \
m0 = 0; m1 = 0; m2 = 0; m3 = 0; m4 = 0; m5 = 0; m6 = 0; m7 = 0; \
m8 = 0; m9 = 0; mA = 0; mB = 0; mC = 0; mD = 0; mE = 0; mF = 0; \
tp = &d_T512[0][0]; \
for (int u = 0; u < 8; u++) { \
db = endtag[u]; \
for (int v = 0; v < 8; v++, db >>= 1) { \
dm = -(uint32_t)(db & 1); \
m0 ^= dm & tp[ 0]; m1 ^= dm & tp[ 1]; \
m2 ^= dm & tp[ 2]; m3 ^= dm & tp[ 3]; \
m4 ^= dm & tp[ 4]; m5 ^= dm & tp[ 5]; \
m6 ^= dm & tp[ 6]; m7 ^= dm & tp[ 7]; \
m8 ^= dm & tp[ 8]; m9 ^= dm & tp[ 9]; \
mA ^= dm & tp[10]; mB ^= dm & tp[11]; \
mC ^= dm & tp[12]; mD ^= dm & tp[13]; \
mE ^= dm & tp[14]; mF ^= dm & tp[15]; \
tp += 16; \
} \
} \
}
// close
uint8_t endtag[8] = { 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
INPUT_BIG;
#pragma unroll
for (int r = 0; r < 6; r++) {
ROUND_BIG(r, d_alpha_n);
}
T_BIG;
endtag[0] = endtag[1] = 0x00;
endtag[6] = 0x02;
endtag[7] = 0x80;
INPUT_BIG;
// PF_BIG
#pragma unroll
for(int r = 0; r < 12; r++) {
ROUND_BIG(r, d_alpha_f);
}
T_BIG;
uint64_t hashPosition = thread;
uint32_t *Hash = (uint32_t*)&g_hash[hashPosition << 3];
#pragma unroll 16
for(int i = 0; i < 16; i++)
Hash[i] = cuda_swab32(h[i]);
#undef INPUT_BIG
}
}
__host__
void x16_hamsi512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash)
{
const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
x16_hamsi512_gpu_hash_80 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash);
}

64
x15/cuda_x15_whirlpool_sm3.cu

@ -1998,7 +1998,7 @@ const int i0, const int i1, const int i2, const int i3, const int i4, const int
__global__ __global__
void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash, int swab) void oldwhirlpool_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, void *outputHash, int swab)
{ {
__shared__ uint64_t sharedMemory[2048]; __shared__ uint64_t sharedMemory[2048];
@ -2014,7 +2014,8 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x]; sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x];
#endif #endif
} }
__threadfence_block(); // ensure shared mem is ready //__threadfence_block(); // ensure shared mem is ready
__syncthreads();
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads) if (thread < threads)
@ -2028,7 +2029,8 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
uint64_t state[8]; uint64_t state[8];
#pragma unroll 8 #pragma unroll 8
for (int i=0; i < 8; i++) { for (int i=0; i < 8; i++) {
state[i] = c_PaddedMessage80[i]; //state[i] = c_PaddedMessage80[i];
AS_UINT2(&state[i]) = AS_UINT2(&c_PaddedMessage80[i]);
} }
#else #else
#pragma unroll 8 #pragma unroll 8
@ -2050,6 +2052,7 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
state[i] = xor1(n[i],c_PaddedMessage80[i]); state[i] = xor1(n[i],c_PaddedMessage80[i]);
} }
#endif #endif
/// round 2 /////// /// round 2 ///////
////////////////////////////////// //////////////////////////////////
n[0] = c_PaddedMessage80[8]; //read data n[0] = c_PaddedMessage80[8]; //read data
@ -2331,7 +2334,7 @@ extern uint32_t whirlpool512_finalhash_64(int thr_id, uint32_t threads, uint32_t
} }
__host__ __host__
void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash) void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash)
{ {
dim3 grid((threads + threadsperblock-1) / threadsperblock); dim3 grid((threads + threadsperblock-1) / threadsperblock);
dim3 block(threadsperblock); dim3 block(threadsperblock);
@ -2339,7 +2342,7 @@ void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNounce
if (threads < 256) if (threads < 256)
applog(LOG_WARNING, "whirlpool requires a minimum of 256 threads to fetch constant tables!"); applog(LOG_WARNING, "whirlpool requires a minimum of 256 threads to fetch constant tables!");
oldwhirlpool_gpu_hash_80<<<grid, block>>>(threads, startNounce, d_outputHash, 1); oldwhirlpool_gpu_hash_80<<<grid, block>>>(threads, startNonce, d_outputHash, 1);
} }
extern void whirl_midstate(void *state, const void *input); extern void whirl_midstate(void *state, const void *input);
@ -2363,3 +2366,54 @@ void whirlpool512_setBlock_80_sm3(void *pdata, const void *ptarget)
cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 128, 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 128, 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(pTarget, ptarget, 32, 0, cudaMemcpyHostToDevice); cudaMemcpyToSymbol(pTarget, ptarget, 32, 0, cudaMemcpyHostToDevice);
} }
// ------------------------------------------------------------------------------------------------
__host__
void x16_whirlpool512_init(int thr_id, uint32_t threads)
{
cudaMemcpyToSymbol(InitVector_RC, plain_RC, sizeof(plain_RC), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(mixTob0Tox, plain_T0, sizeof(plain_T0), 0, cudaMemcpyHostToDevice);
#if USE_ALL_TABLES
cudaMemcpyToSymbol(mixTob1Tox, plain_T1, (256 * 8), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(mixTob2Tox, plain_T2, (256 * 8), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(mixTob3Tox, plain_T3, (256 * 8), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(mixTob4Tox, plain_T4, (256 * 8), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(mixTob5Tox, plain_T5, (256 * 8), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(mixTob6Tox, plain_T6, (256 * 8), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(mixTob7Tox, plain_T7, (256 * 8), 0, cudaMemcpyHostToDevice);
#endif
}
extern void whirlpool_midstate(void *state, const void *input);
__host__
void x16_whirlpool512_setBlock_80(void *pdata)
{
unsigned char PaddedMessage[128];
memcpy(PaddedMessage, pdata, 80);
memset(PaddedMessage + 80, 0, 48);
PaddedMessage[80] = 0x80; /* ending */
#if HOST_MIDSTATE
// compute constant first block
unsigned char midstate[64] = { 0 };
whirlpool_midstate(midstate, pdata);
memcpy(PaddedMessage, midstate, 64);
#endif
cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 128, 0, cudaMemcpyHostToDevice);
}
__host__
void x16_whirlpool512_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_outputHash)
{
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
if (threads < 256)
applog(LOG_WARNING, "whirlpool requires a minimum of 256 threads to fetch constant tables!");
oldwhirlpool_gpu_hash_80 <<<grid, block>>> (threads, startNonce, d_outputHash, 1);
}

80
x16/cuda_x16.h

@ -0,0 +1,80 @@
#include "x11/cuda_x11.h"
extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads);
extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x13_fugue512_cpu_free(int thr_id);
extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads);
extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int flag);
extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x15_whirlpool_cpu_free(int thr_id);
extern void x17_sha512_cpu_init(int thr_id, uint32_t threads);
extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash);
extern void x17_haval256_cpu_init(int thr_id, uint32_t threads);
extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, const int outlen);
void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order);
// ---- optimised but non compatible kernels
void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
// ---- 80 bytes kernels
void quark_bmw512_cpu_setBlock_80(void *pdata);
void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order);
void groestl512_setBlock_80(int thr_id, uint32_t *endiandata);
void groestl512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
void skein512_cpu_setBlock_80(void *pdata);
void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int swap);
void qubit_luffa512_cpu_init(int thr_id, uint32_t threads);
void qubit_luffa512_cpu_setBlock_80(void *pdata);
void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order);
void jh512_setBlock_80(int thr_id, uint32_t *endiandata);
void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
void keccak512_setBlock_80(int thr_id, uint32_t *endiandata);
void keccak512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata);
void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
void x11_shavite512_setBlock_80(void *pdata);
void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order);
void x16_shabal512_setBlock_80(void *pdata);
void x16_shabal512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
void x16_simd512_setBlock_80(void *pdata);
void x16_simd512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
void x16_echo512_cuda_init(int thr_id, const uint32_t threads);
void x16_echo512_setBlock_80(void *pdata);
void x16_echo512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
void x16_hamsi512_setBlock_80(void *pdata);
void x16_hamsi512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
void x16_fugue512_cpu_init(int thr_id, uint32_t threads);
void x16_fugue512_cpu_free(int thr_id);
void x16_fugue512_setBlock_80(void *pdata);
void x16_fugue512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
void x16_whirlpool512_init(int thr_id, uint32_t threads);
void x16_whirlpool512_setBlock_80(void* endiandata);
void x16_whirlpool512_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
void x16_sha512_setBlock_80(void *pdata);
void x16_sha512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);

214
x16/cuda_x16_echo512.cu

@ -0,0 +1,214 @@
/**
* echo512-80 cuda kernel for X16R algorithm
*
* tpruvot 2018 - GPL code
*/
#include <stdio.h>
#include <memory.h>
#include "cuda_helper.h"
extern __device__ __device_builtin__ void __threadfence_block(void);
#include "../x11/cuda_x11_aes.cuh"
__device__ __forceinline__ void AES_2ROUND(const uint32_t* __restrict__ sharedMemory,
uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3,
uint32_t &k0)
{
uint32_t y0, y1, y2, y3;
aes_round(sharedMemory,
x0, x1, x2, x3,
k0,
y0, y1, y2, y3);
aes_round(sharedMemory,
y0, y1, y2, y3,
x0, x1, x2, x3);
k0++;
}
__device__
static void echo_round(uint32_t* const sharedMemory, uint32_t *W, uint32_t &k0)
{
// Big Sub Words
#pragma unroll 16
for (int idx = 0; idx < 16; idx++) {
AES_2ROUND(sharedMemory, W[(idx << 2) + 0], W[(idx << 2) + 1], W[(idx << 2) + 2], W[(idx << 2) + 3], k0);
}
// Shift Rows
#pragma unroll 4
for (int i = 0; i < 4; i++)
{
uint32_t t[4];
/// 1, 5, 9, 13
t[0] = W[i + 4];
t[1] = W[i + 8];
t[2] = W[i + 24];
t[3] = W[i + 60];
W[i + 4] = W[i + 20];
W[i + 8] = W[i + 40];
W[i + 24] = W[i + 56];
W[i + 60] = W[i + 44];
W[i + 20] = W[i + 36];
W[i + 40] = t[1];
W[i + 56] = t[2];
W[i + 44] = W[i + 28];
W[i + 28] = W[i + 12];
W[i + 12] = t[3];
W[i + 36] = W[i + 52];
W[i + 52] = t[0];
}
// Mix Columns
#pragma unroll 4
for (int i = 0; i < 4; i++)
{
#pragma unroll 4
for (int idx = 0; idx < 64; idx += 16)
{
uint32_t a[4];
a[0] = W[idx + i];
a[1] = W[idx + i + 4];
a[2] = W[idx + i + 8];
a[3] = W[idx + i + 12];
uint32_t ab = a[0] ^ a[1];
uint32_t bc = a[1] ^ a[2];
uint32_t cd = a[2] ^ a[3];
uint32_t t, t2, t3;
t = (ab & 0x80808080);
t2 = (bc & 0x80808080);
t3 = (cd & 0x80808080);
uint32_t abx = (t >> 7) * 27U ^ ((ab^t) << 1);
uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
W[idx + i] = bc ^ a[3] ^ abx;
W[idx + i + 4] = a[0] ^ cd ^ bcx;
W[idx + i + 8] = ab ^ a[3] ^ cdx;
W[idx + i + 12] = ab ^ a[2] ^ (abx ^ bcx ^ cdx);
}
}
}
__device__ __forceinline__
void cuda_echo_round_80(uint32_t *const __restrict__ sharedMemory, uint32_t *const __restrict__ data, const uint32_t nonce, uint32_t *hash)
{
uint32_t h[29]; // <= 127 bytes input
#pragma unroll 8
for (int i = 0; i < 18; i += 2)
AS_UINT2(&h[i]) = AS_UINT2(&data[i]);
h[18] = data[18];
h[19] = cuda_swab32(nonce);
h[20] = 0x80;
h[21] = h[22] = h[23] = h[24] = h[25] = h[26] = 0;
//((uint8_t*)h)[80] = 0x80;
//((uint8_t*)h)[128-17] = 0x02;
//((uint8_t*)h)[128-16] = 0x80;
//((uint8_t*)h)[128-15] = 0x02;
h[27] = 0x2000000;
h[28] = 0x280;
//h[29] = h[30] = h[31] = 0;
uint32_t k0 = 640; // bitlen
uint32_t W[64];
#pragma unroll 8
for (int i = 0; i < 32; i+=4) {
W[i] = 512; // L
W[i+1] = 0; // H
W[i+2] = 0; // X
W[i+3] = 0;
}
uint32_t Z[16];
#pragma unroll
for (int i = 0; i<16; i++) Z[i] = W[i];
#pragma unroll
for (int i = 32; i<61; i++) W[i] = h[i - 32];
#pragma unroll
for (int i = 61; i<64; i++) W[i] = 0;
for (int i = 0; i < 10; i++)
echo_round(sharedMemory, W, k0);
#pragma unroll 16
for (int i = 0; i < 16; i++) {
Z[i] ^= h[i] ^ W[i] ^ W[i + 32];
}
#pragma unroll 8
for (int i = 0; i < 16; i += 2)
AS_UINT2(&hash[i]) = AS_UINT2(&Z[i]);
}
__device__ __forceinline__
void echo_gpu_init(uint32_t *const __restrict__ sharedMemory)
{
/* each thread startup will fill a uint32 */
if (threadIdx.x < 128) {
sharedMemory[threadIdx.x] = d_AES0[threadIdx.x];
sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x];
sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x];
sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x];
sharedMemory[threadIdx.x + 64 * 2] = d_AES0[threadIdx.x + 64 * 2];
sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2];
sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2];
sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2];
}
}
__host__
void x16_echo512_cuda_init(int thr_id, const uint32_t threads)
{
aes_cpu_init(thr_id);
}
__constant__ static uint32_t c_PaddedMessage80[20];
__host__
void x16_echo512_setBlock_80(void *endiandata)
{
cudaMemcpyToSymbol(c_PaddedMessage80, endiandata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
}
__global__ __launch_bounds__(128, 7) /* will force 72 registers */
void x16_echo512_gpu_hash_80(uint32_t threads, uint32_t startNonce, uint64_t *g_hash)
{
__shared__ uint32_t sharedMemory[1024];
echo_gpu_init(sharedMemory);
__threadfence_block();
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint64_t hashPosition = thread;
uint32_t *pHash = (uint32_t*)&g_hash[hashPosition<<3];
cuda_echo_round_80(sharedMemory, c_PaddedMessage80, startNonce + thread, pHash);
}
}
__host__
void x16_echo512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash)
{
const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
x16_echo512_gpu_hash_80<<<grid, block>>>(threads, startNonce, (uint64_t*)d_hash);
}

248
x16/cuda_x16_echo512_64.cu

@ -0,0 +1,248 @@
/**
* Echo512-64 kernel for maxwell, based on alexis work
*/
#include <cuda_helper.h>
#include <cuda_vector_uint2x4.h>
#include <cuda_vectors.h>
#define INTENSIVE_GMF
#include "tribus/cuda_echo512_aes.cuh"
#ifdef __INTELLISENSE__
#define __byte_perm(x, y, b) x
#define atomicExch(p,y) (*p) = y
#endif
__device__
static void echo_round_alexis(const uint32_t sharedMemory[4][256], uint32_t *W, uint32_t &k0)
{
// Big Sub Words
#pragma unroll 16
for (int idx = 0; idx < 16; idx++)
AES_2ROUND(sharedMemory,W[(idx<<2) + 0], W[(idx<<2) + 1], W[(idx<<2) + 2], W[(idx<<2) + 3], k0);
// Shift Rows
#pragma unroll 4
for (int i = 0; i < 4; i++){
uint32_t t[4];
/// 1, 5, 9, 13
t[0] = W[i+ 4];
t[1] = W[i+ 8];
t[2] = W[i+24];
t[3] = W[i+60];
W[i + 4] = W[i + 20];
W[i + 8] = W[i + 40];
W[i +24] = W[i + 56];
W[i +60] = W[i + 44];
W[i +20] = W[i +36];
W[i +40] = t[1];
W[i +56] = t[2];
W[i +44] = W[i +28];
W[i +28] = W[i +12];
W[i +12] = t[3];
W[i +36] = W[i +52];
W[i +52] = t[0];
}
// Mix Columns
#pragma unroll 4
for (int i = 0; i < 4; i++){ // Schleife über je 2*uint32_t
#pragma unroll 4
for (int idx = 0; idx < 64; idx += 16){ // Schleife über die elemnte
uint32_t a[4];
a[0] = W[idx + i];
a[1] = W[idx + i + 4];
a[2] = W[idx + i + 8];
a[3] = W[idx + i +12];
uint32_t ab = a[0] ^ a[1];
uint32_t bc = a[1] ^ a[2];
uint32_t cd = a[2] ^ a[3];
uint32_t t, t2, t3;
t = (ab & 0x80808080);
t2 = (bc & 0x80808080);
t3 = (cd & 0x80808080);
uint32_t abx = (t >> 7) * 27U ^ ((ab^t) << 1);
uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
W[idx + i] = bc ^ a[3] ^ abx;
W[idx + i + 4] = a[0] ^ cd ^ bcx;
W[idx + i + 8] = ab ^ a[3] ^ cdx;
W[idx + i +12] = ab ^ a[2] ^ (abx ^ bcx ^ cdx);
}
}
}
__global__ __launch_bounds__(128, 5) /* will force 80 registers */
static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash)
{
__shared__ uint32_t sharedMemory[4][256];
aes_gpu_init128(sharedMemory);
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
uint32_t k0;
uint32_t h[16];
uint32_t hash[16];
if (thread < threads)
{
uint32_t *Hash = &g_hash[thread<<4];
*(uint2x4*)&h[ 0] = __ldg4((uint2x4*)&Hash[ 0]);
*(uint2x4*)&h[ 8] = __ldg4((uint2x4*)&Hash[ 8]);
*(uint2x4*)&hash[ 0] = *(uint2x4*)&h[ 0];
*(uint2x4*)&hash[ 8] = *(uint2x4*)&h[ 8];
__syncthreads();
const uint32_t P[48] = {
0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
//8-12
0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
//21-25
0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751, 0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
//34-38
0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7, 0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968,
0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af
//58-61
};
k0 = 520;
#pragma unroll 4
for (uint32_t idx = 0; idx < 16; idx += 4) {
AES_2ROUND(sharedMemory, h[idx + 0], h[idx + 1], h[idx + 2], h[idx + 3], k0);
}
k0 += 4;
uint32_t W[64];
#pragma unroll 4
for (uint32_t i = 0; i < 4; i++)
{
uint32_t a = P[i];
uint32_t b = P[i + 4];
uint32_t c = h[i + 8];
uint32_t d = P[i + 8];
uint32_t ab = a ^ b;
uint32_t bc = b ^ c;
uint32_t cd = c ^ d;
uint32_t t = (ab & 0x80808080);
uint32_t t2 = (bc & 0x80808080);
uint32_t t3 = (cd & 0x80808080);
uint32_t abx = (t >> 7) * 27U ^ ((ab^t) << 1);
uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
W[i] = abx ^ bc ^ d;
W[i + 4] = bcx ^ a ^ cd;
W[i + 8] = cdx ^ ab ^ d;
W[i +12] = abx ^ bcx ^ cdx ^ ab ^ c;
a = P[i +12];
b = h[i + 4];
c = P[i +16];
d = P[i +20];
ab = a ^ b;
bc = b ^ c;
cd = c ^ d;
t = (ab & 0x80808080);
t2 = (bc & 0x80808080);
t3 = (cd & 0x80808080);
abx = (t >> 7) * 27U ^ ((ab^t) << 1);
bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
W[16 + i] = bc ^ d ^ abx;
W[16 + i + 4] = a ^ cd ^ bcx;
W[16 + i + 8] = d ^ ab ^ cdx;
W[16 + i + 12] = c ^ ab ^ abx ^ bcx ^ cdx;
a = h[i];
b = P[24 + i + 0];
c = P[24 + i + 4];
d = P[24 + i + 8];
ab = a ^ b;
bc = b ^ c;
cd = c ^ d;
t = (ab & 0x80808080);
t2 = (bc & 0x80808080);
t3 = (cd & 0x80808080);
abx = (t >> 7) * 27U ^ ((ab^t) << 1);
bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
W[32 + i] = bc ^ d ^ abx;
W[32 + i + 4] = a ^ cd ^ bcx;
W[32 + i + 8] = d ^ ab ^ cdx;
W[32 + i + 12] = c ^ ab ^ abx ^ bcx ^ cdx;
a = P[36 + i ];
b = P[36 + i + 4];
c = P[36 + i + 8];
d = h[i + 12];
ab = a ^ b;
bc = b ^ c;
cd = c ^ d;
t = (ab & 0x80808080);
t2 = (bc & 0x80808080);
t3 = (cd & 0x80808080);
abx = (t >> 7) * 27U ^ ((ab^t) << 1);
bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
W[48 + i] = bc ^ d ^ abx;
W[48 + i + 4] = a ^ cd ^ bcx;
W[48 + i + 8] = d ^ ab ^ cdx;
W[48 + i + 12] = c ^ ab ^ abx ^ bcx ^ cdx;
}
for (int k = 1; k < 10; k++)
echo_round_alexis(sharedMemory,W,k0);
#pragma unroll 4
for (int i = 0; i < 16; i += 4)
{
W[i] ^= W[32 + i] ^ 512;
W[i + 1] ^= W[32 + i + 1];
W[i + 2] ^= W[32 + i + 2];
W[i + 3] ^= W[32 + i + 3];
}
*(uint2x4*)&Hash[ 0] = *(uint2x4*)&hash[ 0] ^ *(uint2x4*)&W[ 0];
*(uint2x4*)&Hash[ 8] = *(uint2x4*)&hash[ 8] ^ *(uint2x4*)&W[ 8];
}
}
__host__
void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash){
const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
x16_echo512_gpu_hash_64<<<grid, block>>>(threads, d_hash);
}

467
x16/cuda_x16_fugue512.cu

@ -0,0 +1,467 @@
#include <stdio.h>
#include <cuda_helper.h>
#define TPB 256
/*
* fugue512-80 x16r kernel implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2018 tpruvot
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*/
#ifdef __INTELLISENSE__
#define __byte_perm(x, y, m) (x|y)
#define tex1Dfetch(t, n) (n)
#define __CUDACC__
#include <cuda_texture_types.h>
#endif
// store allocated textures device addresses
static unsigned int* d_textures[MAX_GPUS][1];
#define mixtab0(x) mixtabs[(x)]
#define mixtab1(x) mixtabs[(x)+256]
#define mixtab2(x) mixtabs[(x)+512]
#define mixtab3(x) mixtabs[(x)+768]
static texture<unsigned int, 1, cudaReadModeElementType> mixTab0Tex;
static const uint32_t mixtab0[] = {
0x63633297, 0x7c7c6feb, 0x77775ec7, 0x7b7b7af7, 0xf2f2e8e5, 0x6b6b0ab7, 0x6f6f16a7, 0xc5c56d39,
0x303090c0, 0x01010704, 0x67672e87, 0x2b2bd1ac, 0xfefeccd5, 0xd7d71371, 0xabab7c9a, 0x767659c3,
0xcaca4005, 0x8282a33e, 0xc9c94909, 0x7d7d68ef, 0xfafad0c5, 0x5959947f, 0x4747ce07, 0xf0f0e6ed,
0xadad6e82, 0xd4d41a7d, 0xa2a243be, 0xafaf608a, 0x9c9cf946, 0xa4a451a6, 0x727245d3, 0xc0c0762d,
0xb7b728ea, 0xfdfdc5d9, 0x9393d47a, 0x2626f298, 0x363682d8, 0x3f3fbdfc, 0xf7f7f3f1, 0xcccc521d,
0x34348cd0, 0xa5a556a2, 0xe5e58db9, 0xf1f1e1e9, 0x71714cdf, 0xd8d83e4d, 0x313197c4, 0x15156b54,
0x04041c10, 0xc7c76331, 0x2323e98c, 0xc3c37f21, 0x18184860, 0x9696cf6e, 0x05051b14, 0x9a9aeb5e,
0x0707151c, 0x12127e48, 0x8080ad36, 0xe2e298a5, 0xebeba781, 0x2727f59c, 0xb2b233fe, 0x757550cf,
0x09093f24, 0x8383a43a, 0x2c2cc4b0, 0x1a1a4668, 0x1b1b416c, 0x6e6e11a3, 0x5a5a9d73, 0xa0a04db6,
0x5252a553, 0x3b3ba1ec, 0xd6d61475, 0xb3b334fa, 0x2929dfa4, 0xe3e39fa1, 0x2f2fcdbc, 0x8484b126,
0x5353a257, 0xd1d10169, 0x00000000, 0xededb599, 0x2020e080, 0xfcfcc2dd, 0xb1b13af2, 0x5b5b9a77,
0x6a6a0db3, 0xcbcb4701, 0xbebe17ce, 0x3939afe4, 0x4a4aed33, 0x4c4cff2b, 0x5858937b, 0xcfcf5b11,
0xd0d0066d, 0xefefbb91, 0xaaaa7b9e, 0xfbfbd7c1, 0x4343d217, 0x4d4df82f, 0x333399cc, 0x8585b622,
0x4545c00f, 0xf9f9d9c9, 0x02020e08, 0x7f7f66e7, 0x5050ab5b, 0x3c3cb4f0, 0x9f9ff04a, 0xa8a87596,
0x5151ac5f, 0xa3a344ba, 0x4040db1b, 0x8f8f800a, 0x9292d37e, 0x9d9dfe42, 0x3838a8e0, 0xf5f5fdf9,
0xbcbc19c6, 0xb6b62fee, 0xdada3045, 0x2121e784, 0x10107040, 0xffffcbd1, 0xf3f3efe1, 0xd2d20865,
0xcdcd5519, 0x0c0c2430, 0x1313794c, 0xececb29d, 0x5f5f8667, 0x9797c86a, 0x4444c70b, 0x1717655c,
0xc4c46a3d, 0xa7a758aa, 0x7e7e61e3, 0x3d3db3f4, 0x6464278b, 0x5d5d886f, 0x19194f64, 0x737342d7,
0x60603b9b, 0x8181aa32, 0x4f4ff627, 0xdcdc225d, 0x2222ee88, 0x2a2ad6a8, 0x9090dd76, 0x88889516,
0x4646c903, 0xeeeebc95, 0xb8b805d6, 0x14146c50, 0xdede2c55, 0x5e5e8163, 0x0b0b312c, 0xdbdb3741,
0xe0e096ad, 0x32329ec8, 0x3a3aa6e8, 0x0a0a3628, 0x4949e43f, 0x06061218, 0x2424fc90, 0x5c5c8f6b,
0xc2c27825, 0xd3d30f61, 0xacac6986, 0x62623593, 0x9191da72, 0x9595c662, 0xe4e48abd, 0x797974ff,
0xe7e783b1, 0xc8c84e0d, 0x373785dc, 0x6d6d18af, 0x8d8d8e02, 0xd5d51d79, 0x4e4ef123, 0xa9a97292,
0x6c6c1fab, 0x5656b943, 0xf4f4fafd, 0xeaeaa085, 0x6565208f, 0x7a7a7df3, 0xaeae678e, 0x08083820,
0xbaba0bde, 0x787873fb, 0x2525fb94, 0x2e2ecab8, 0x1c1c5470, 0xa6a65fae, 0xb4b421e6, 0xc6c66435,
0xe8e8ae8d, 0xdddd2559, 0x747457cb, 0x1f1f5d7c, 0x4b4bea37, 0xbdbd1ec2, 0x8b8b9c1a, 0x8a8a9b1e,
0x70704bdb, 0x3e3ebaf8, 0xb5b526e2, 0x66662983, 0x4848e33b, 0x0303090c, 0xf6f6f4f5, 0x0e0e2a38,
0x61613c9f, 0x35358bd4, 0x5757be47, 0xb9b902d2, 0x8686bf2e, 0xc1c17129, 0x1d1d5374, 0x9e9ef74e,
0xe1e191a9, 0xf8f8decd, 0x9898e556, 0x11117744, 0x696904bf, 0xd9d93949, 0x8e8e870e, 0x9494c166,
0x9b9bec5a, 0x1e1e5a78, 0x8787b82a, 0xe9e9a989, 0xcece5c15, 0x5555b04f, 0x2828d8a0, 0xdfdf2b51,
0x8c8c8906, 0xa1a14ab2, 0x89899212, 0x0d0d2334, 0xbfbf10ca, 0xe6e684b5, 0x4242d513, 0x686803bb,
0x4141dc1f, 0x9999e252, 0x2d2dc3b4, 0x0f0f2d3c, 0xb0b03df6, 0x5454b74b, 0xbbbb0cda, 0x16166258
};
#define TIX4(q, x00, x01, x04, x07, x08, x22, x24, x27, x30) { \
x22 ^= x00; \
x00 = (q); \
x08 ^= x00; \
x01 ^= x24; \
x04 ^= x27; \
x07 ^= x30; \
}
#define CMIX36(x00, x01, x02, x04, x05, x06, x18, x19, x20) { \
x00 ^= x04; \
x01 ^= x05; \
x02 ^= x06; \
x18 ^= x04; \
x19 ^= x05; \
x20 ^= x06; \
}
#define SMIX(x0, x1, x2, x3) { \
uint32_t tmp; \
uint32_t r0 = 0; \
uint32_t r1 = 0; \
uint32_t r2 = 0; \
uint32_t r3 = 0; \
uint32_t c0 = mixtab0(x0 >> 24); \
tmp = mixtab1((x0 >> 16) & 0xFF); \
c0 ^= tmp; \
r1 ^= tmp; \
tmp = mixtab2((x0 >> 8) & 0xFF); \
c0 ^= tmp; \
r2 ^= tmp; \
tmp = mixtab3(x0 & 0xFF); \
c0 ^= tmp; \
r3 ^= tmp; \
tmp = mixtab0(x1 >> 24); \
uint32_t c1 = tmp; \
r0 ^= tmp; \
tmp = mixtab1((x1 >> 16) & 0xFF); \
c1 ^= tmp; \
tmp = mixtab2((x1 >> 8) & 0xFF); \
c1 ^= tmp; \
r2 ^= tmp; \
tmp = mixtab3(x1 & 0xFF); \
c1 ^= tmp; \
r3 ^= tmp; \
tmp = mixtab0(x2 >> 24); \
uint32_t c2 = tmp; \
r0 ^= tmp; \
tmp = mixtab1((x2 >> 16) & 0xFF); \
c2 ^= tmp; \
r1 ^= tmp; \
tmp = mixtab2((x2 >> 8) & 0xFF); \
c2 ^= tmp; \
tmp = mixtab3(x2 & 0xFF); \
c2 ^= tmp; \
r3 ^= tmp; \
tmp = mixtab0(x3 >> 24); \
uint32_t c3 = tmp; \
r0 ^= tmp; \
tmp = mixtab1((x3 >> 16) & 0xFF); \
c3 ^= tmp; \
r1 ^= tmp; \
tmp = mixtab2((x3 >> 8) & 0xFF); \
c3 ^= tmp; \
r2 ^= tmp; \
tmp = mixtab3(x3 & 0xFF); \
c3 ^= tmp; \
x0 = ((c0 ^ r0) & 0xFF000000) | ((c1 ^ r1) & 0x00FF0000) \
| ((c2 ^ r2) & 0x0000FF00) | ((c3 ^ r3) & 0x000000FF); \
x1 = ((c1 ^ (r0 << 8)) & 0xFF000000) | ((c2 ^ (r1 << 8)) & 0x00FF0000) \
| ((c3 ^ (r2 << 8)) & 0x0000FF00) | ((c0 ^ (r3 >> 24)) & 0x000000FF); \
x2 = ((c2 ^ (r0 << 16)) & 0xFF000000) | ((c3 ^ (r1 << 16)) & 0x00FF0000) \
| ((c0 ^ (r2 >> 16)) & 0x0000FF00) | ((c1 ^ (r3 >> 16)) & 0x000000FF); \
x3 = ((c3 ^ (r0 << 24)) & 0xFF000000) | ((c0 ^ (r1 >> 8)) & 0x00FF0000) \
| ((c1 ^ (r2 >> 8)) & 0x0000FF00) | ((c2 ^ (r3 >> 8)) & 0x000000FF); \
}
#define SUB_ROR3 { \
B33 = S33, B34 = S34, B35 = S35; \
S35 = S32; S34 = S31; S33 = S30; S32 = S29; S31 = S28; S30 = S27; S29 = S26; S28 = S25; S27 = S24; \
S26 = S23; S25 = S22; S24 = S21; S23 = S20; S22 = S19; S21 = S18; S20 = S17; S19 = S16; S18 = S15; \
S17 = S14; S16 = S13; S15 = S12; S14 = S11; S13 = S10; S12 = S09; S11 = S08; S10 = S07; S09 = S06; \
S08 = S05; S07 = S04; S06 = S03; S05 = S02; S04 = S01; S03 = S00; S02 = B35; S01 = B34; S00 = B33; \
}
#define SUB_ROR8 { \
B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \
S35 = S27; S34 = S26; S33 = S25; S32 = S24; S31 = S23; S30 = S22; S29 = S21; S28 = S20; S27 = S19; \
S26 = S18; S25 = S17; S24 = S16; S23 = S15; S22 = S14; S21 = S13; S20 = S12; S19 = S11; S18 = S10; \
S17 = S09; S16 = S08; S15 = S07; S14 = S06; S13 = S05; S12 = S04; S11 = S03; S10 = S02; S09 = S01; \
S08 = S00; S07 = B35; S06 = B34; S05 = B33; S04 = B32; S03 = B31; S02 = B30; S01 = B29; S00 = B28; \
}
#define SUB_ROR9 { \
B27 = S27, B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \
S35 = S26; S34 = S25; S33 = S24; S32 = S23; S31 = S22; S30 = S21; S29 = S20; S28 = S19; S27 = S18; \
S26 = S17; S25 = S16; S24 = S15; S23 = S14; S22 = S13; S21 = S12; S20 = S11; S19 = S10; S18 = S09; \
S17 = S08; S16 = S07; S15 = S06; S14 = S05; S13 = S04; S12 = S03; S11 = S02; S10 = S01; S09 = S00; \
S08 = B35; S07 = B34; S06 = B33; S05 = B32; S04 = B31; S03 = B30; S02 = B29; S01 = B28; S00 = B27; \
}
#define SUB_ROR9_3 { \
SUB_ROR3; SUB_ROR3; SUB_ROR3; \
}
#define SUB_ROR12 { /* to fix */ \
B24 = S00; B25 = S01; B26 = S02; B27 = S03; B28 = S04; B29 = S05; B30 = S06; B31 = S07; B32 = S08; B33 = S09; B34 = S10; B35 = S11; \
S00 = S12; S01 = S13; S02 = S14; S03 = S15; S04 = S16; S05 = S17; S06 = S18; S07 = S19; S08 = S20; S09 = S21; S10 = S22; S11 = S23; \
S12 = S24; S13 = S25; S14 = S26; S15 = S27; S16 = S28; S17 = S29; S18 = S30; S19 = S31; S20 = S32; S21 = S33; S22 = S34; S23 = S35; \
S24 = B24; S25 = B25; S26 = B26; S27 = B27; S28 = B28; S29 = B29; S30 = B30; S31 = B31; S32 = B32; S33 = B33; S34 = B34; S35 = B35; \
}
#define FUGUE512_3(x, y, z) { \
TIX4(x, S00, S01, S04, S07, S08, S22, S24, S27, S30); \
CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \
SMIX(S33, S34, S35, S00); \
CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \
SMIX(S30, S31, S32, S33); \
CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \
SMIX(S27, S28, S29, S30); \
CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \
SMIX(S24, S25, S26, S27); \
\
TIX4(y, S24, S25, S28, S31, S32, S10, S12, S15, S18); \
CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); \
SMIX(S21, S22, S23, S24); \
CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); \
SMIX(S18, S19, S20, S21); \
CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); \
SMIX(S15, S16, S17, S18); \
CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); \
SMIX(S12, S13, S14, S15); \
\
TIX4(z, S12, S13, S16, S19, S20, S34, S00, S03, S06); \
CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); \
SMIX(S09, S10, S11, S12); \
CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); \
SMIX(S06, S07, S08, S09); \
CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); \
SMIX(S03, S04, S05, S06); \
CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); \
SMIX(S00, S01, S02, S03); \
}
#define FUGUE512_F(w, x, y, z) { \
TIX4(w, S00, S01, S04, S07, S08, S22, S24, S27, S30); \
CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \
SMIX(S33, S34, S35, S00); \
CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \
SMIX(S30, S31, S32, S33); \
CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \
SMIX(S27, S28, S29, S30); \
CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \
SMIX(S24, S25, S26, S27); \
\
TIX4(x, S24, S25, S28, S31, S32, S10, S12, S15, S18); \
CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); \
SMIX(S21, S22, S23, S24); \
CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); \
SMIX(S18, S19, S20, S21); \
CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); \
SMIX(S15, S16, S17, S18); \
CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); \
SMIX(S12, S13, S14, S15); \
\
TIX4(y, S12, S13, S16, S19, S20, S34, S00, S03, S06); \
CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); \
SMIX(S09, S10, S11, S12); \
CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); \
SMIX(S06, S07, S08, S09); \
CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); \
SMIX(S03, S04, S05, S06); \
CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); \
SMIX(S00, S01, S02, S03); \
\
TIX4(z, S00, S01, S04, S07, S08, S22, S24, S27, S30); \
CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \
SMIX(S33, S34, S35, S00); \
CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \
SMIX(S30, S31, S32, S33); \
CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \
SMIX(S27, S28, S29, S30); \
CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \
SMIX(S24, S25, S26, S27); \
}
#undef ROL8
#ifdef __CUDA_ARCH__
__device__ __forceinline__
uint32_t ROL8(const uint32_t a) {
return __byte_perm(a, 0, 0x2103);
}
__device__ __forceinline__
uint32_t ROR8(const uint32_t a) {
return __byte_perm(a, 0, 0x0321);
}
__device__ __forceinline__
uint32_t ROL16(const uint32_t a) {
return __byte_perm(a, 0, 0x1032);
}
#else
#define ROL8(u) ROTL32(u, 8)
#define ROR8(u) ROTR32(u, 8)
#define ROL16(u) ROTL32(u,16)
#endif
//#define AS_UINT4(addr) *((uint4*)(addr))
__constant__ static uint64_t c_PaddedMessage80[10];
__host__
void x16_fugue512_setBlock_80(void *pdata)
{
cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
}
/***************************************************/
__global__
__launch_bounds__(TPB)
void x16_fugue512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash)
{
__shared__ uint32_t mixtabs[1024];
// load shared mem (with 256 threads)
const uint32_t thr = threadIdx.x & 0xFF;
const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr);
mixtabs[thr] = tmp;
mixtabs[thr+256] = ROR8(tmp);
mixtabs[thr+512] = ROL16(tmp);
mixtabs[thr+768] = ROL8(tmp);
#if TPB <= 256
if (blockDim.x < 256) {
const uint32_t thr = (threadIdx.x + 0x80) & 0xFF;
const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr);
mixtabs[thr] = tmp;
mixtabs[thr + 256] = ROR8(tmp);
mixtabs[thr + 512] = ROL16(tmp);
mixtabs[thr + 768] = ROL8(tmp);
}
#endif
__syncthreads();
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t Data[20];
#pragma unroll
for(int i = 0; i < 10; i++)
AS_UINT2(&Data[i * 2]) = AS_UINT2(&c_PaddedMessage80[i]);
Data[19] = (startNonce + thread);
uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09, S10, S11;
uint32_t S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22, S23;
uint32_t S24, S25, S26, S27, S28, S29, S30, S31, S32, S33, S34, S35;
//uint32_t B24, B25, B26,
uint32_t B27, B28, B29, B30, B31, B32, B33, B34, B35;
//const uint64_t bc = 640 bits to hash
//const uint32_t bclo = (uint32_t)(bc);
//const uint32_t bchi = (uint32_t)(bc >> 32);
S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = 0;
S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0;
S20 = 0x8807a57e; S21 = 0xe616af75; S22 = 0xc5d3e4db; S23 = 0xac9ab027;
S24 = 0xd915f117; S25 = 0xb6eecc54; S26 = 0x06e8020b; S27 = 0x4a92efd1;
S28 = 0xaac6e2c9; S29 = 0xddb21398; S30 = 0xcae65838; S31 = 0x437f203f;
S32 = 0x25ea78e7; S33 = 0x951fddd6; S34 = 0xda6ed11d; S35 = 0xe13e3567;
FUGUE512_3((Data[ 0]), (Data[ 1]), (Data[ 2]));
FUGUE512_3((Data[ 3]), (Data[ 4]), (Data[ 5]));
FUGUE512_3((Data[ 6]), (Data[ 7]), (Data[ 8]));
FUGUE512_3((Data[ 9]), (Data[10]), (Data[11]));
FUGUE512_3((Data[12]), (Data[13]), (Data[14]));
FUGUE512_3((Data[15]), (Data[16]), (Data[17]));
FUGUE512_F((Data[18]), (Data[19]), 0/*bchi*/, (80*8)/*bclo*/);
// rotate right state by 3 dwords (S00 = S33, S03 = S00)
SUB_ROR3;
SUB_ROR9;
#pragma unroll 32
for (int i = 0; i < 32; i++) {
SUB_ROR3;
CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20);
SMIX(S00, S01, S02, S03);
}
#pragma unroll 13
for (int i = 0; i < 13; i++) {
S04 ^= S00;
S09 ^= S00;
S18 ^= S00;
S27 ^= S00;
SUB_ROR9;
SMIX(S00, S01, S02, S03);
S04 ^= S00;
S10 ^= S00;
S18 ^= S00;
S27 ^= S00;
SUB_ROR9;
SMIX(S00, S01, S02, S03);
S04 ^= S00;
S10 ^= S00;
S19 ^= S00;
S27 ^= S00;
SUB_ROR9;
SMIX(S00, S01, S02, S03);
S04 ^= S00;
S10 ^= S00;
S19 ^= S00;
S28 ^= S00;
SUB_ROR8;
SMIX(S00, S01, S02, S03);
}
S04 ^= S00;
S09 ^= S00;
S18 ^= S00;
S27 ^= S00;
Data[ 0] = cuda_swab32(S01);
Data[ 1] = cuda_swab32(S02);
Data[ 2] = cuda_swab32(S03);
Data[ 3] = cuda_swab32(S04);
Data[ 4] = cuda_swab32(S09);
Data[ 5] = cuda_swab32(S10);
Data[ 6] = cuda_swab32(S11);
Data[ 7] = cuda_swab32(S12);
Data[ 8] = cuda_swab32(S18);
Data[ 9] = cuda_swab32(S19);
Data[10] = cuda_swab32(S20);
Data[11] = cuda_swab32(S21);
Data[12] = cuda_swab32(S27);
Data[13] = cuda_swab32(S28);
Data[14] = cuda_swab32(S29);
Data[15] = cuda_swab32(S30);
const size_t hashPosition = thread;
uint64_t* pHash = &g_hash[hashPosition << 3];
#pragma unroll 4
for(int i = 0; i < 4; i++)
AS_UINT4(&pHash[i * 2]) = AS_UINT4(&Data[i * 4]);
}
}
#define texDef(id, texname, texmem, texsource, texsize) { \
unsigned int *texmem; \
cudaMalloc(&texmem, texsize); \
d_textures[thr_id][id] = texmem; \
cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
texname.normalized = 0; \
texname.filterMode = cudaFilterModePoint; \
texname.addressMode[0] = cudaAddressModeClamp; \
{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \
} \
}
__host__
void x16_fugue512_cpu_init(int thr_id, uint32_t threads)
{
texDef(0, mixTab0Tex, mixTab0m, mixtab0, sizeof(uint32_t)*256);
}
__host__
void x16_fugue512_cpu_free(int thr_id)
{
cudaFree(d_textures[thr_id][0]);
}
__host__
void x16_fugue512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash)
{
const uint32_t threadsperblock = TPB;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
x16_fugue512_gpu_hash_80 <<<grid, block>>> (threads, startNonce, (uint64_t*)d_hash);
}

350
x16/cuda_x16_shabal512.cu

@ -0,0 +1,350 @@
/*
* Shabal-512 for X16R
* tpruvot 2018, based on alexis x14 and xevan kernlx code
*/
#include <cuda_helper.h>
#include <cuda_vectors.h>
#include <cuda_vector_uint2x4.h>
typedef uint32_t sph_u32;
#define C32(x) (x)
#define T32(x) (x)
#define INPUT_BLOCK_ADD do { \
B0 = T32(B0 + M0); \
B1 = T32(B1 + M1); \
B2 = T32(B2 + M2); \
B3 = T32(B3 + M3); \
B4 = T32(B4 + M4); \
B5 = T32(B5 + M5); \
B6 = T32(B6 + M6); \
B7 = T32(B7 + M7); \
B8 = T32(B8 + M8); \
B9 = T32(B9 + M9); \
BA = T32(BA + MA); \
BB = T32(BB + MB); \
BC = T32(BC + MC); \
BD = T32(BD + MD); \
BE = T32(BE + ME); \
BF = T32(BF + MF); \
} while (0)
#define INPUT_BLOCK_SUB do { \
C0 = T32(C0 - M0); \
C1 = T32(C1 - M1); \
C2 = T32(C2 - M2); \
C3 = T32(C3 - M3); \
C4 = T32(C4 - M4); \
C5 = T32(C5 - M5); \
C6 = T32(C6 - M6); \
C7 = T32(C7 - M7); \
C8 = T32(C8 - M8); \
C9 = T32(C9 - M9); \
CA = T32(CA - MA); \
CB = T32(CB - MB); \
CC = T32(CC - MC); \
CD = T32(CD - MD); \
CE = T32(CE - ME); \
CF = T32(CF - MF); \
} while (0)
#define XOR_W do { \
A00 ^= Wlow; \
A01 ^= Whigh; \
} while (0)
#define SWAP(v1, v2) do { \
sph_u32 tmp = (v1); \
(v1) = (v2); \
(v2) = tmp; \
} while (0)
#define SWAP_BC do { \
SWAP(B0, C0); \
SWAP(B1, C1); \
SWAP(B2, C2); \
SWAP(B3, C3); \
SWAP(B4, C4); \
SWAP(B5, C5); \
SWAP(B6, C6); \
SWAP(B7, C7); \
SWAP(B8, C8); \
SWAP(B9, C9); \
SWAP(BA, CA); \
SWAP(BB, CB); \
SWAP(BC, CC); \
SWAP(BD, CD); \
SWAP(BE, CE); \
SWAP(BF, CF); \
} while (0)
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) do { \
xa0 = T32((xa0 \
^ (((xa1 << 15) | (xa1 >> 17)) * 5U) \
^ xc) * 3U) \
^ xb1 ^ (xb2 & ~xb3) ^ xm; \
xb0 = T32(~(((xb0 << 1) | (xb0 >> 31)) ^ xa0)); \
} while (0)
#define PERM_STEP_0 do { \
PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \
} while (0)
#define PERM_STEP_1 do { \
PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \
} while (0)
#define PERM_STEP_2 do { \
PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \
} while (0)
#define APPLY_P do { \
B0 = T32(B0 << 17) | (B0 >> 15); \
B1 = T32(B1 << 17) | (B1 >> 15); \
B2 = T32(B2 << 17) | (B2 >> 15); \
B3 = T32(B3 << 17) | (B3 >> 15); \
B4 = T32(B4 << 17) | (B4 >> 15); \
B5 = T32(B5 << 17) | (B5 >> 15); \
B6 = T32(B6 << 17) | (B6 >> 15); \
B7 = T32(B7 << 17) | (B7 >> 15); \
B8 = T32(B8 << 17) | (B8 >> 15); \
B9 = T32(B9 << 17) | (B9 >> 15); \
BA = T32(BA << 17) | (BA >> 15); \
BB = T32(BB << 17) | (BB >> 15); \
BC = T32(BC << 17) | (BC >> 15); \
BD = T32(BD << 17) | (BD >> 15); \
BE = T32(BE << 17) | (BE >> 15); \
BF = T32(BF << 17) | (BF >> 15); \
PERM_STEP_0; \
PERM_STEP_1; \
PERM_STEP_2; \
A0B = T32(A0B + C6); \
A0A = T32(A0A + C5); \
A09 = T32(A09 + C4); \
A08 = T32(A08 + C3); \
A07 = T32(A07 + C2); \
A06 = T32(A06 + C1); \
A05 = T32(A05 + C0); \
A04 = T32(A04 + CF); \
A03 = T32(A03 + CE); \
A02 = T32(A02 + CD); \
A01 = T32(A01 + CC); \
A00 = T32(A00 + CB); \
A0B = T32(A0B + CA); \
A0A = T32(A0A + C9); \
A09 = T32(A09 + C8); \
A08 = T32(A08 + C7); \
A07 = T32(A07 + C6); \
A06 = T32(A06 + C5); \
A05 = T32(A05 + C4); \
A04 = T32(A04 + C3); \
A03 = T32(A03 + C2); \
A02 = T32(A02 + C1); \
A01 = T32(A01 + C0); \
A00 = T32(A00 + CF); \
A0B = T32(A0B + CE); \
A0A = T32(A0A + CD); \
A09 = T32(A09 + CC); \
A08 = T32(A08 + CB); \
A07 = T32(A07 + CA); \
A06 = T32(A06 + C9); \
A05 = T32(A05 + C8); \
A04 = T32(A04 + C7); \
A03 = T32(A03 + C6); \
A02 = T32(A02 + C5); \
A01 = T32(A01 + C4); \
A00 = T32(A00 + C3); \
} while (0)
#define INCR_W do { \
if ((Wlow = T32(Wlow + 1)) == 0) \
Whigh = T32(Whigh + 1); \
} while (0)
__constant__ static const sph_u32 A_init_512[] = {
C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632),
C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B),
C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F)
};
__constant__ static const sph_u32 B_init_512[] = {
C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640),
C32(0xEB6F56C7), C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08),
C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E),
C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B)
};
__constant__ static const sph_u32 C_init_512[] = {
C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359),
C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780),
C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A),
C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969)
};
__constant__ static uint32_t c_PaddedMessage80[20];
__host__
void x16_shabal512_setBlock_80(void *pdata)
{
cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
}
#define TPB_SHABAL 256
__global__ __launch_bounds__(TPB_SHABAL, 2)
void x16_shabal512_gpu_hash_80(uint32_t threads, const uint32_t startNonce, uint32_t *g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
uint32_t B[] = {
0xC1099CB7, 0x07B385F3, 0xE7442C26, 0xCC8AD640, 0xEB6F56C7, 0x1EA81AA9, 0x73B9D314, 0x1DE85D08,
0x48910A5A, 0x893B22DB, 0xC5A0DF44, 0xBBC4324E, 0x72D2F240, 0x75941D99, 0x6D8BDE82, 0xA1A7502B
};
uint32_t M[16];
if (thread < threads)
{
// todo: try __ldc
*(uint2x4*)&M[0] = *(uint2x4*)&c_PaddedMessage80[0];
*(uint2x4*)&M[8] = *(uint2x4*)&c_PaddedMessage80[8];
sph_u32 A00 = A_init_512[0], A01 = A_init_512[1], A02 = A_init_512[ 2], A03 = A_init_512[ 3];
sph_u32 A04 = A_init_512[4], A05 = A_init_512[5], A06 = A_init_512[ 6], A07 = A_init_512[ 7];
sph_u32 A08 = A_init_512[8], A09 = A_init_512[9], A0A = A_init_512[10], A0B = A_init_512[11];
sph_u32 B0 = B_init_512[ 0], B1 = B_init_512[ 1], B2 = B_init_512[ 2], B3 = B_init_512 [3];
sph_u32 B4 = B_init_512[ 4], B5 = B_init_512[ 5], B6 = B_init_512[ 6], B7 = B_init_512[ 7];
sph_u32 B8 = B_init_512[ 8], B9 = B_init_512[ 9], BA = B_init_512[10], BB = B_init_512[11];
sph_u32 BC = B_init_512[12], BD = B_init_512[13], BE = B_init_512[14], BF = B_init_512[15];
sph_u32 C0 = C_init_512[ 0], C1 = C_init_512[ 1], C2 = C_init_512[ 2], C3 = C_init_512[ 3];
sph_u32 C4 = C_init_512[ 4], C5 = C_init_512[ 5], C6 = C_init_512[ 6], C7 = C_init_512[ 7];
sph_u32 C8 = C_init_512[ 8], C9 = C_init_512[ 9], CA = C_init_512[10], CB = C_init_512[11];
sph_u32 CC = C_init_512[12], CD = C_init_512[13], CE = C_init_512[14], CF = C_init_512[15];
sph_u32 M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
sph_u32 Wlow = 1, Whigh = 0;
M0 = M[ 0];
M1 = M[ 1];
M2 = M[ 2];
M3 = M[ 3];
M4 = M[ 4];
M5 = M[ 5];
M6 = M[ 6];
M7 = M[ 7];
M8 = M[ 8];
M9 = M[ 9];
MA = M[10];
MB = M[11];
MC = M[12];
MD = M[13];
ME = M[14];
MF = M[15];
INPUT_BLOCK_ADD;
XOR_W;
APPLY_P;
INPUT_BLOCK_SUB;
SWAP_BC;
INCR_W;
M0 = c_PaddedMessage80[16];
M1 = c_PaddedMessage80[17];
M2 = c_PaddedMessage80[18];
M3 = cuda_swab32(startNonce + thread);
M4 = 0x80;
M5 = M6 = M7 = M8 = M9 = MA = MB = MC = MD = ME = MF = 0;
INPUT_BLOCK_ADD;
XOR_W;
APPLY_P;
for (unsigned i = 0; i < 3; i++) {
SWAP_BC;
XOR_W;
APPLY_P;
}
B[ 0] = B0;
B[ 1] = B1;
B[ 2] = B2;
B[ 3] = B3;
B[ 4] = B4;
B[ 5] = B5;
B[ 6] = B6;
B[ 7] = B7;
B[ 8] = B8;
B[ 9] = B9;
B[10] = BA;
B[11] = BB;
B[12] = BC;
B[13] = BD;
B[14] = BE;
B[15] = BF;
// output
uint64_t hashPosition = thread;
uint32_t *Hash = &g_hash[hashPosition << 4];
*(uint2x4*)&Hash[0] = *(uint2x4*)&B[0];
*(uint2x4*)&Hash[8] = *(uint2x4*)&B[8];
}
}
__host__
void x16_shabal512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash)
{
const uint32_t threadsperblock = TPB_SHABAL;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
x16_shabal512_gpu_hash_80 <<<grid, block >>>(threads, startNonce, d_hash);
}

1836
x16/cuda_x16_simd512_80.cu

File diff suppressed because it is too large Load Diff

622
x16/x16r.cu

@ -0,0 +1,622 @@
/**
* X16R algorithm (X16 with Randomized chain order)
*
* tpruvot 2018 - GPL code
*/
#include <stdio.h>
#include <memory.h>
#include <unistd.h>
extern "C" {
#include "sph/sph_blake.h"
#include "sph/sph_bmw.h"
#include "sph/sph_groestl.h"
#include "sph/sph_skein.h"
#include "sph/sph_jh.h"
#include "sph/sph_keccak.h"
#include "sph/sph_luffa.h"
#include "sph/sph_cubehash.h"
#include "sph/sph_shavite.h"
#include "sph/sph_simd.h"
#include "sph/sph_echo.h"
#include "sph/sph_hamsi.h"
#include "sph/sph_fugue.h"
#include "sph/sph_shabal.h"
#include "sph/sph_whirlpool.h"
#include "sph/sph_sha2.h"
}
#include "miner.h"
#include "cuda_helper.h"
#include "cuda_x16.h"
static uint32_t *d_hash[MAX_GPUS];
enum Algo {
BLAKE = 0,
BMW,
GROESTL,
JH,
KECCAK,
SKEIN,
LUFFA,
CUBEHASH,
SHAVITE,
SIMD,
ECHO,
HAMSI,
FUGUE,
SHABAL,
WHIRLPOOL,
SHA512,
HASH_FUNC_COUNT
};
static const char* algo_strings[] = {
"blake",
"bmw512",
"groestl",
"jh512",
"keccak",
"skein",
"luffa",
"cube",
"shavite",
"simd",
"echo",
"hamsi",
"fugue",
"shabal",
"whirlpool",
"sha512",
NULL
};
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[HASH_FUNC_COUNT + 1] = { 0 };
static void getAlgoString(const uint32_t* prevblock, char *output)
{
char *sptr = output;
uint8_t* data = (uint8_t*)prevblock;
for (uint8_t j = 0; j < HASH_FUNC_COUNT; j++) {
uint8_t b = (15 - j) >> 1; // 16 ascii hex chars, reversed
uint8_t algoDigit = (j & 1) ? data[b] & 0xF : data[b] >> 4;
if (algoDigit >= 10)
sprintf(sptr, "%c", 'A' + (algoDigit - 10));
else
sprintf(sptr, "%u", (uint32_t) algoDigit);
sptr++;
}
*sptr = '\0';
}
// X16R CPU Hash (Validation)
extern "C" void x16r_hash(void *output, const void *input)
{
unsigned char _ALIGN(64) hash[128];
sph_blake512_context ctx_blake;
sph_bmw512_context ctx_bmw;
sph_groestl512_context ctx_groestl;
sph_jh512_context ctx_jh;
sph_keccak512_context ctx_keccak;
sph_skein512_context ctx_skein;
sph_luffa512_context ctx_luffa;
sph_cubehash512_context ctx_cubehash;
sph_shavite512_context ctx_shavite;
sph_simd512_context ctx_simd;
sph_echo512_context ctx_echo;
sph_hamsi512_context ctx_hamsi;
sph_fugue512_context ctx_fugue;
sph_shabal512_context ctx_shabal;
sph_whirlpool_context ctx_whirlpool;
sph_sha512_context ctx_sha512;
void *in = (void*) input;
int size = 80;
uint32_t *in32 = (uint32_t*) input;
getAlgoString(&in32[1], hashOrder);
for (int i = 0; i < 16; i++)
{
const char elem = hashOrder[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch (algo) {
case BLAKE:
sph_blake512_init(&ctx_blake);
sph_blake512(&ctx_blake, in, size);
sph_blake512_close(&ctx_blake, hash);
break;
case BMW:
sph_bmw512_init(&ctx_bmw);
sph_bmw512(&ctx_bmw, in, size);
sph_bmw512_close(&ctx_bmw, hash);
break;
case GROESTL:
sph_groestl512_init(&ctx_groestl);
sph_groestl512(&ctx_groestl, in, size);
sph_groestl512_close(&ctx_groestl, hash);
break;
case SKEIN:
sph_skein512_init(&ctx_skein);
sph_skein512(&ctx_skein, in, size);
sph_skein512_close(&ctx_skein, hash);
break;
case JH:
sph_jh512_init(&ctx_jh);
sph_jh512(&ctx_jh, in, size);
sph_jh512_close(&ctx_jh, hash);
break;
case KECCAK:
sph_keccak512_init(&ctx_keccak);
sph_keccak512(&ctx_keccak, in, size);
sph_keccak512_close(&ctx_keccak, hash);
break;
case LUFFA:
sph_luffa512_init(&ctx_luffa);
sph_luffa512(&ctx_luffa, in, size);
sph_luffa512_close(&ctx_luffa, hash);
break;
case CUBEHASH:
sph_cubehash512_init(&ctx_cubehash);
sph_cubehash512(&ctx_cubehash, in, size);
sph_cubehash512_close(&ctx_cubehash, hash);
break;
case SHAVITE:
sph_shavite512_init(&ctx_shavite);
sph_shavite512(&ctx_shavite, in, size);
sph_shavite512_close(&ctx_shavite, hash);
break;
case SIMD:
sph_simd512_init(&ctx_simd);
sph_simd512(&ctx_simd, in, size);
sph_simd512_close(&ctx_simd, hash);
break;
case ECHO:
sph_echo512_init(&ctx_echo);
sph_echo512(&ctx_echo, in, size);
sph_echo512_close(&ctx_echo, hash);
break;
case HAMSI:
sph_hamsi512_init(&ctx_hamsi);
sph_hamsi512(&ctx_hamsi, in, size);
sph_hamsi512_close(&ctx_hamsi, hash);
break;
case FUGUE:
sph_fugue512_init(&ctx_fugue);
sph_fugue512(&ctx_fugue, in, size);
sph_fugue512_close(&ctx_fugue, hash);
break;
case SHABAL:
sph_shabal512_init(&ctx_shabal);
sph_shabal512(&ctx_shabal, in, size);
sph_shabal512_close(&ctx_shabal, hash);
break;
case WHIRLPOOL:
sph_whirlpool_init(&ctx_whirlpool);
sph_whirlpool(&ctx_whirlpool, in, size);
sph_whirlpool_close(&ctx_whirlpool, hash);
break;
case SHA512:
sph_sha512_init(&ctx_sha512);
sph_sha512(&ctx_sha512,(const void*) in, size);
sph_sha512_close(&ctx_sha512,(void*) hash);
break;
}
in = (void*) hash;
size = 64;
}
memcpy(output, hash, 32);
}
void whirlpool_midstate(void *state, const void *input)
{
sph_whirlpool_context ctx;
sph_whirlpool_init(&ctx);
sph_whirlpool(&ctx, input, 64);
memcpy(state, ctx.state, 64);
}
static bool init[MAX_GPUS] = { 0 };
static bool use_compat_kernels[MAX_GPUS] = { 0 };
//#define _DEBUG
#define _DEBUG_PREFIX "x16r-"
#include "cuda_debug.cuh"
//static int algo80_tests[HASH_FUNC_COUNT] = { 0 };
//static int algo64_tests[HASH_FUNC_COUNT] = { 0 };
static int algo80_fails[HASH_FUNC_COUNT] = { 0 };
extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const int dev_id = device_map[thr_id];
int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 19;
if (strstr(device_name[dev_id], "GTX 1080")) intensity = 20;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
cuda_get_arch(thr_id);
use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
if (use_compat_kernels[thr_id])
x11_echo512_cpu_init(thr_id, throughput);
quark_blake512_cpu_init(thr_id, throughput);
quark_bmw512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);
quark_jh512_cpu_init(thr_id, throughput);
quark_keccak512_cpu_init(thr_id, throughput);
qubit_luffa512_cpu_init(thr_id, throughput);
x11_luffa512_cpu_init(thr_id, throughput); // 64
x11_shavite512_cpu_init(thr_id, throughput);
x11_simd512_cpu_init(thr_id, throughput); // 64
x16_echo512_cuda_init(thr_id, throughput);
x13_hamsi512_cpu_init(thr_id, throughput);
x13_fugue512_cpu_init(thr_id, throughput);
x16_fugue512_cpu_init(thr_id, throughput);
x14_shabal512_cpu_init(thr_id, throughput);
x15_whirlpool_cpu_init(thr_id, throughput, 0);
x16_whirlpool512_init(thr_id, throughput);
x17_sha512_cpu_init(thr_id, throughput);
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true;
}
if (opt_benchmark) {
((uint32_t*)ptarget)[7] = 0x003f;
//((uint8_t*)pdata)[8] = 0x90; // hashOrder[0] = '9'; for simd 80 + blake512 64
((uint8_t*)pdata)[8] = 0xAA; // hashOrder[0] = 'A'; for echo 80 + 64
//((uint8_t*)pdata)[8] = 0xB0; // hashOrder[0] = 'B'; for hamsi 80 + blake512 64
//((uint8_t*)pdata)[8] = 0xC0; // hashOrder[0] = 'C'; for fugue 80 + blake512 64
//((uint8_t*)pdata)[8] = 0xE0; // hashOrder[0] = 'E'; for whirlpool 80 + blake512 64
}
uint32_t _ALIGN(64) endiandata[20];
for (int k=0; k < 19; k++)
be32enc(&endiandata[k], pdata[k]);
uint32_t ntime = swab32(pdata[17]);
if (s_ntime != ntime) {
getAlgoString(&endiandata[1], hashOrder);
s_ntime = ntime;
if (opt_debug && !thr_id) applog(LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime);
}
cuda_check_cpu_setTarget(ptarget);
char elem = hashOrder[0];
const uint8_t algo80 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch (algo80) {
case BLAKE:
quark_blake512_cpu_setBlock_80(thr_id, endiandata);
break;
case BMW:
quark_bmw512_cpu_setBlock_80(endiandata);
break;
case GROESTL:
groestl512_setBlock_80(thr_id, endiandata);
break;
case JH:
jh512_setBlock_80(thr_id, endiandata);
break;
case KECCAK:
keccak512_setBlock_80(thr_id, endiandata);
break;
case SKEIN:
skein512_cpu_setBlock_80((void*)endiandata);
break;
case LUFFA:
qubit_luffa512_cpu_setBlock_80((void*)endiandata);
break;
case CUBEHASH:
cubehash512_setBlock_80(thr_id, endiandata);
break;
case SHAVITE:
x11_shavite512_setBlock_80((void*)endiandata);
break;
case SIMD:
x16_simd512_setBlock_80((void*)endiandata);
break;
case ECHO:
x16_echo512_setBlock_80((void*)endiandata);
break;
case HAMSI:
x16_hamsi512_setBlock_80((void*)endiandata);
break;
case FUGUE:
x16_fugue512_setBlock_80((void*)pdata);
break;
case SHABAL:
x16_shabal512_setBlock_80((void*)endiandata);
break;
case WHIRLPOOL:
x16_whirlpool512_setBlock_80((void*)endiandata);
break;
case SHA512:
x16_sha512_setBlock_80(endiandata);
break;
default: {
return -1;
}
}
int warn = 0;
do {
int order = 0;
// Hash with CUDA
switch (algo80) {
case BLAKE:
quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("blake80:");
break;
case BMW:
quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
TRACE("bmw80 :");
break;
case GROESTL:
groestl512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("grstl80:");
break;
case JH:
jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("jh51280:");
break;
case KECCAK:
keccak512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("kecck80:");
break;
case SKEIN:
skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++;
TRACE("skein80:");
break;
case LUFFA:
qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
TRACE("luffa80:");
break;
case CUBEHASH:
cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("cube 80:");
break;
case SHAVITE:
x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
TRACE("shavite:");
break;
case SIMD:
x16_simd512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("simd512:");
break;
case ECHO:
x16_echo512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("echo :");
break;
case HAMSI:
x16_hamsi512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("hamsi :");
break;
case FUGUE:
x16_fugue512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("fugue :");
break;
case SHABAL:
x16_shabal512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("shabal :");
break;
case WHIRLPOOL:
x16_whirlpool512_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("whirl :");
break;
case SHA512:
x16_sha512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("sha512 :");
break;
}
for (int i = 1; i < 16; i++)
{
const char elem = hashOrder[i];
const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch (algo64) {
case BLAKE:
quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("blake :");
break;
case BMW:
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("bmw :");
break;
case GROESTL:
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("groestl:");
break;
case JH:
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("jh512 :");
break;
case KECCAK:
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("keccak :");
break;
case SKEIN:
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("skein :");
break;
case LUFFA:
x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("luffa :");
break;
case CUBEHASH:
x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("cube :");
break;
case SHAVITE:
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("shavite:");
break;
case SIMD:
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("simd :");
break;
case ECHO:
if (use_compat_kernels[thr_id])
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
else
x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++;
TRACE("echo :");
break;
case HAMSI:
x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("hamsi :");
break;
case FUGUE:
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("fugue :");
break;
case SHABAL:
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("shabal :");
break;
case WHIRLPOOL:
x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("shabal :");
break;
case SHA512:
x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("sha512 :");
break;
}
}
*hashes_done = pdata[19] - first_nonce + throughput;
work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
#ifdef _DEBUG
uint32_t _ALIGN(64) dhash[8];
be32enc(&endiandata[19], pdata[19]);
x16r_hash(dhash, endiandata);
applog_hash(dhash);
return -1;
#endif
if (work->nonces[0] != UINT32_MAX)
{
const uint32_t Htarg = ptarget[7];
uint32_t _ALIGN(64) vhash[8];
be32enc(&endiandata[19], work->nonces[0]);
x16r_hash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
work->valid_nonces = 1;
work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
work_set_target_ratio(work, vhash);
if (work->nonces[1] != 0) {
be32enc(&endiandata[19], work->nonces[1]);
x16r_hash(vhash, endiandata);
bn_set_target_ratio(work, vhash, 1);
work->valid_nonces++;
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
} else {
pdata[19] = work->nonces[0] + 1; // cursor
}
#if 0
gpulog(LOG_INFO, thr_id, "hash found with %s 80!", algo_strings[algo80]);
algo80_tests[algo80] += work->valid_nonces;
char oks64[128] = { 0 };
char oks80[128] = { 0 };
char fails[128] = { 0 };
for (int a = 0; a < HASH_FUNC_COUNT; a++) {
const char elem = hashOrder[a];
const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
if (a > 0) algo64_tests[algo64] += work->valid_nonces;
sprintf(&oks64[strlen(oks64)], "|%X:%2d", a, algo64_tests[a] < 100 ? algo64_tests[a] : 99);
sprintf(&oks80[strlen(oks80)], "|%X:%2d", a, algo80_tests[a] < 100 ? algo80_tests[a] : 99);
sprintf(&fails[strlen(fails)], "|%X:%2d", a, algo80_fails[a] < 100 ? algo80_fails[a] : 99);
}
applog(LOG_INFO, "K64: %s", oks64);
applog(LOG_INFO, "K80: %s", oks80);
applog(LOG_ERR, "F80: %s", fails);
#endif
return work->valid_nonces;
}
else if (vhash[7] > Htarg) {
// x11+ coins could do some random error, but not on retry
gpu_increment_reject(thr_id);
algo80_fails[algo80]++;
if (!warn) {
warn++;
pdata[19] = work->nonces[0] + 1;
continue;
} else {
if (!opt_quiet) gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU! %s %s",
work->nonces[0], algo_strings[algo80], hashOrder);
warn = 0;
}
}
}
if ((uint64_t)throughput + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}
pdata[19] += throughput;
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce;
return 0;
}
// cleanup
extern "C" void free_x16r(int thr_id)
{
if (!init[thr_id])
return;
cudaThreadSynchronize();
cudaFree(d_hash[thr_id]);
quark_blake512_cpu_free(thr_id);
quark_groestl512_cpu_free(thr_id);
x11_simd512_cpu_free(thr_id);
x13_fugue512_cpu_free(thr_id);
x16_fugue512_cpu_free(thr_id); // to merge with x13_fugue512 ?
x15_whirlpool_cpu_free(thr_id);
cuda_check_cpu_free(thr_id);
cudaDeviceSynchronize();
init[thr_id] = false;
}

601
x16/x16s.cu

@ -0,0 +1,601 @@
/**
* X16S algorithm (X16 with Shuffled chain order)
*
* tpruvot 2018 - GPL code
*/
#include <stdio.h>
#include <memory.h>
#include <unistd.h>
extern "C" {
#include "sph/sph_blake.h"
#include "sph/sph_bmw.h"
#include "sph/sph_groestl.h"
#include "sph/sph_skein.h"
#include "sph/sph_jh.h"
#include "sph/sph_keccak.h"
#include "sph/sph_luffa.h"
#include "sph/sph_cubehash.h"
#include "sph/sph_shavite.h"
#include "sph/sph_simd.h"
#include "sph/sph_echo.h"
#include "sph/sph_hamsi.h"
#include "sph/sph_fugue.h"
#include "sph/sph_shabal.h"
#include "sph/sph_whirlpool.h"
#include "sph/sph_sha2.h"
}
#include "miner.h"
#include "cuda_helper.h"
#include "cuda_x16.h"
static uint32_t *d_hash[MAX_GPUS];
enum Algo {
BLAKE = 0,
BMW,
GROESTL,
JH,
KECCAK,
SKEIN,
LUFFA,
CUBEHASH,
SHAVITE,
SIMD,
ECHO,
HAMSI,
FUGUE,
SHABAL,
WHIRLPOOL,
SHA512,
HASH_FUNC_COUNT
};
static const char* algo_strings[] = {
"blake",
"bmw512",
"groestl",
"jh512",
"keccak",
"skein",
"luffa",
"cube",
"shavite",
"simd",
"echo",
"hamsi",
"fugue",
"shabal",
"whirlpool",
"sha512",
NULL
};
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread char hashOrder[HASH_FUNC_COUNT + 1] = { 0 };
static void getAlgoString(const uint32_t* prevblock, char *output)
{
uint8_t* data = (uint8_t*)prevblock;
strcpy(output, "0123456789ABCDEF");
for (uint8_t i = 0; i < HASH_FUNC_COUNT; i++) {
uint8_t b = (15 - i) >> 1; // 16 ascii hex chars, reversed
uint8_t algoDigit = (i & 1) ? data[b] & 0xF : data[b] >> 4;
int offset = (int) algoDigit;
char oldVal = output[offset];
for(int j=offset; j-->0;)
output[j+1] = output[j];
output[0] = oldVal;
}
}
// X16S CPU Hash (Validation)
extern "C" void x16s_hash(void *output, const void *input)
{
unsigned char _ALIGN(64) hash[128];
sph_blake512_context ctx_blake;
sph_bmw512_context ctx_bmw;
sph_groestl512_context ctx_groestl;
sph_jh512_context ctx_jh;
sph_keccak512_context ctx_keccak;
sph_skein512_context ctx_skein;
sph_luffa512_context ctx_luffa;
sph_cubehash512_context ctx_cubehash;
sph_shavite512_context ctx_shavite;
sph_simd512_context ctx_simd;
sph_echo512_context ctx_echo;
sph_hamsi512_context ctx_hamsi;
sph_fugue512_context ctx_fugue;
sph_shabal512_context ctx_shabal;
sph_whirlpool_context ctx_whirlpool;
sph_sha512_context ctx_sha512;
void *in = (void*) input;
int size = 80;
uint32_t *in32 = (uint32_t*) input;
getAlgoString(&in32[1], hashOrder);
for (int i = 0; i < 16; i++)
{
const char elem = hashOrder[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch (algo) {
case BLAKE:
sph_blake512_init(&ctx_blake);
sph_blake512(&ctx_blake, in, size);
sph_blake512_close(&ctx_blake, hash);
break;
case BMW:
sph_bmw512_init(&ctx_bmw);
sph_bmw512(&ctx_bmw, in, size);
sph_bmw512_close(&ctx_bmw, hash);
break;
case GROESTL:
sph_groestl512_init(&ctx_groestl);
sph_groestl512(&ctx_groestl, in, size);
sph_groestl512_close(&ctx_groestl, hash);
break;
case SKEIN:
sph_skein512_init(&ctx_skein);
sph_skein512(&ctx_skein, in, size);
sph_skein512_close(&ctx_skein, hash);
break;
case JH:
sph_jh512_init(&ctx_jh);
sph_jh512(&ctx_jh, in, size);
sph_jh512_close(&ctx_jh, hash);
break;
case KECCAK:
sph_keccak512_init(&ctx_keccak);
sph_keccak512(&ctx_keccak, in, size);
sph_keccak512_close(&ctx_keccak, hash);
break;
case LUFFA:
sph_luffa512_init(&ctx_luffa);
sph_luffa512(&ctx_luffa, in, size);
sph_luffa512_close(&ctx_luffa, hash);
break;
case CUBEHASH:
sph_cubehash512_init(&ctx_cubehash);
sph_cubehash512(&ctx_cubehash, in, size);
sph_cubehash512_close(&ctx_cubehash, hash);
break;
case SHAVITE:
sph_shavite512_init(&ctx_shavite);
sph_shavite512(&ctx_shavite, in, size);
sph_shavite512_close(&ctx_shavite, hash);
break;
case SIMD:
sph_simd512_init(&ctx_simd);
sph_simd512(&ctx_simd, in, size);
sph_simd512_close(&ctx_simd, hash);
break;
case ECHO:
sph_echo512_init(&ctx_echo);
sph_echo512(&ctx_echo, in, size);
sph_echo512_close(&ctx_echo, hash);
break;
case HAMSI:
sph_hamsi512_init(&ctx_hamsi);
sph_hamsi512(&ctx_hamsi, in, size);
sph_hamsi512_close(&ctx_hamsi, hash);
break;
case FUGUE:
sph_fugue512_init(&ctx_fugue);
sph_fugue512(&ctx_fugue, in, size);
sph_fugue512_close(&ctx_fugue, hash);
break;
case SHABAL:
sph_shabal512_init(&ctx_shabal);
sph_shabal512(&ctx_shabal, in, size);
sph_shabal512_close(&ctx_shabal, hash);
break;
case WHIRLPOOL:
sph_whirlpool_init(&ctx_whirlpool);
sph_whirlpool(&ctx_whirlpool, in, size);
sph_whirlpool_close(&ctx_whirlpool, hash);
break;
case SHA512:
sph_sha512_init(&ctx_sha512);
sph_sha512(&ctx_sha512,(const void*) in, size);
sph_sha512_close(&ctx_sha512,(void*) hash);
break;
}
in = (void*) hash;
size = 64;
}
memcpy(output, hash, 32);
}
#if 0 /* in x16r */
void whirlpool_midstate(void *state, const void *input)
{
sph_whirlpool_context ctx;
sph_whirlpool_init(&ctx);
sph_whirlpool(&ctx, input, 64);
memcpy(state, ctx.state, 64);
}
#endif
static bool init[MAX_GPUS] = { 0 };
static bool use_compat_kernels[MAX_GPUS] = { 0 };
//#define _DEBUG
#define _DEBUG_PREFIX "x16s-"
#include "cuda_debug.cuh"
extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const int dev_id = device_map[thr_id];
int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 19;
if (strstr(device_name[dev_id], "GTX 1080")) intensity = 20;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
cuda_get_arch(thr_id);
use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
if (use_compat_kernels[thr_id])
x11_echo512_cpu_init(thr_id, throughput);
quark_blake512_cpu_init(thr_id, throughput);
quark_bmw512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);
quark_jh512_cpu_init(thr_id, throughput);
quark_keccak512_cpu_init(thr_id, throughput);
qubit_luffa512_cpu_init(thr_id, throughput);
x11_luffa512_cpu_init(thr_id, throughput); // 64
x11_shavite512_cpu_init(thr_id, throughput);
x11_simd512_cpu_init(thr_id, throughput); // 64
x16_echo512_cuda_init(thr_id, throughput);
x13_hamsi512_cpu_init(thr_id, throughput);
x13_fugue512_cpu_init(thr_id, throughput);
x16_fugue512_cpu_init(thr_id, throughput);
x14_shabal512_cpu_init(thr_id, throughput);
x15_whirlpool_cpu_init(thr_id, throughput, 0);
x16_whirlpool512_init(thr_id, throughput);
x17_sha512_cpu_init(thr_id, throughput);
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true;
}
if (opt_benchmark) {
((uint32_t*)ptarget)[7] = 0x003f;
//((uint8_t*)pdata)[8] = 0x90; // hashOrder[0] = '9'; for simd 80 + blake512 64
//((uint8_t*)pdata)[8] = 0x0A; // hashOrder[0] = 'A'; for echo 64
//((uint8_t*)pdata)[8] = 0xB0; // hashOrder[0] = 'B'; for hamsi 80 + blake512 64
//((uint8_t*)pdata)[8] = 0xC0; // hashOrder[0] = 'C'; for fugue 80 + blake512 64
//((uint8_t*)pdata)[8] = 0xE0; // hashOrder[0] = 'E'; for whirlpool 80 + blake512 64
}
uint32_t _ALIGN(64) endiandata[20];
for (int k=0; k < 19; k++)
be32enc(&endiandata[k], pdata[k]);
uint32_t ntime = swab32(pdata[17]);
if (s_ntime != ntime) {
getAlgoString(&endiandata[1], hashOrder);
s_ntime = ntime;
if (opt_debug && !thr_id) applog(LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime);
}
cuda_check_cpu_setTarget(ptarget);
char elem = hashOrder[0];
const uint8_t algo80 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch (algo80) {
case BLAKE:
quark_blake512_cpu_setBlock_80(thr_id, endiandata);
break;
case BMW:
quark_bmw512_cpu_setBlock_80(endiandata);
break;
case GROESTL:
groestl512_setBlock_80(thr_id, endiandata);
break;
case JH:
jh512_setBlock_80(thr_id, endiandata);
break;
case KECCAK:
keccak512_setBlock_80(thr_id, endiandata);
break;
case SKEIN:
skein512_cpu_setBlock_80((void*)endiandata);
break;
case LUFFA:
qubit_luffa512_cpu_setBlock_80((void*)endiandata);
break;
case CUBEHASH:
cubehash512_setBlock_80(thr_id, endiandata);
break;
case SHAVITE:
x11_shavite512_setBlock_80((void*)endiandata);
break;
case SIMD:
x16_simd512_setBlock_80((void*)endiandata);
break;
case ECHO:
x16_echo512_setBlock_80((void*)endiandata);
break;
case HAMSI:
x16_hamsi512_setBlock_80((void*)endiandata);
break;
case FUGUE:
x16_fugue512_setBlock_80((void*)pdata);
break;
case SHABAL:
x16_shabal512_setBlock_80((void*)endiandata);
break;
case WHIRLPOOL:
x16_whirlpool512_setBlock_80((void*)endiandata);
break;
case SHA512:
x16_sha512_setBlock_80(endiandata);
break;
default: {
return -1;
}
}
int warn = 0;
do {
int order = 0;
// Hash with CUDA
switch (algo80) {
case BLAKE:
quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("blake80:");
break;
case BMW:
quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
TRACE("bmw80 :");
break;
case GROESTL:
groestl512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("grstl80:");
break;
case JH:
jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("jh51280:");
break;
case KECCAK:
keccak512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("kecck80:");
break;
case SKEIN:
skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++;
TRACE("skein80:");
break;
case LUFFA:
qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
TRACE("luffa80:");
break;
case CUBEHASH:
cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("cube 80:");
break;
case SHAVITE:
x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
TRACE("shavite:");
break;
case SIMD:
x16_simd512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("simd512:");
break;
case ECHO:
x16_echo512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("echo :");
break;
case HAMSI:
x16_hamsi512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("hamsi :");
break;
case FUGUE:
x16_fugue512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("fugue :");
break;
case SHABAL:
x16_shabal512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("shabal :");
break;
case WHIRLPOOL:
x16_whirlpool512_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("whirl :");
break;
case SHA512:
x16_sha512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("sha512 :");
break;
}
for (int i = 1; i < 16; i++)
{
const char elem = hashOrder[i];
const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch (algo64) {
case BLAKE:
quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("blake :");
break;
case BMW:
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("bmw :");
break;
case GROESTL:
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("groestl:");
break;
case JH:
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("jh512 :");
break;
case KECCAK:
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("keccak :");
break;
case SKEIN:
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("skein :");
break;
case LUFFA:
x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("luffa :");
break;
case CUBEHASH:
x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("cube :");
break;
case SHAVITE:
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("shavite:");
break;
case SIMD:
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("simd :");
break;
case ECHO:
if (use_compat_kernels[thr_id])
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
else
x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++;
TRACE("echo :");
break;
case HAMSI:
x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("hamsi :");
break;
case FUGUE:
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("fugue :");
break;
case SHABAL:
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("shabal :");
break;
case WHIRLPOOL:
x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("shabal :");
break;
case SHA512:
x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("sha512 :");
break;
}
}
*hashes_done = pdata[19] - first_nonce + throughput;
work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
#ifdef _DEBUG
uint32_t _ALIGN(64) dhash[8];
be32enc(&endiandata[19], pdata[19]);
x16s_hash(dhash, endiandata);
applog_hash(dhash);
return -1;
#endif
if (work->nonces[0] != UINT32_MAX)
{
const uint32_t Htarg = ptarget[7];
uint32_t _ALIGN(64) vhash[8];
be32enc(&endiandata[19], work->nonces[0]);
x16s_hash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
work->valid_nonces = 1;
work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
work_set_target_ratio(work, vhash);
if (work->nonces[1] != 0) {
be32enc(&endiandata[19], work->nonces[1]);
x16s_hash(vhash, endiandata);
bn_set_target_ratio(work, vhash, 1);
work->valid_nonces++;
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
} else {
pdata[19] = work->nonces[0] + 1; // cursor
}
//gpulog(LOG_INFO, thr_id, "hash found with %s 80 (%s)!", algo_strings[algo80], hashOrder);
return work->valid_nonces;
}
else if (vhash[7] > Htarg) {
// x11+ coins could do some random error, but not on retry
gpu_increment_reject(thr_id);
if (!warn) {
warn++;
pdata[19] = work->nonces[0] + 1;
continue;
} else {
if (!opt_quiet) gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU! %s %s",
work->nonces[0], algo_strings[algo80], hashOrder);
warn = 0;
}
}
}
if ((uint64_t)throughput + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}
pdata[19] += throughput;
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce;
return 0;
}
// cleanup
extern "C" void free_x16s(int thr_id)
{
if (!init[thr_id])
return;
cudaThreadSynchronize();
cudaFree(d_hash[thr_id]);
quark_blake512_cpu_free(thr_id);
quark_groestl512_cpu_free(thr_id);
x11_simd512_cpu_free(thr_id);
x13_fugue512_cpu_free(thr_id);
x16_fugue512_cpu_free(thr_id); // to merge with x13_fugue512 ?
x15_whirlpool_cpu_free(thr_id);
cuda_check_cpu_free(thr_id);
cudaDeviceSynchronize();
init[thr_id] = false;
}

77
x17/cuda_x17_sha512.cu

@ -169,3 +169,80 @@ void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce,
x17_sha512_gpu_hash_64 <<<grid, block>>> (threads, (uint64_t*)d_hash); x17_sha512_gpu_hash_64 <<<grid, block>>> (threads, (uint64_t*)d_hash);
} }
__constant__
static uint64_t c_PaddedMessage80[10];
__global__
/*__launch_bounds__(256, 4)*/
void x16_sha512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint64_t W[80];
#pragma unroll
for (int i = 0; i < 9; i ++) {
W[i] = SWAP64(c_PaddedMessage80[i]);
}
const uint32_t nonce = startNonce + thread;
//((uint32_t*)W)[19] = cuda_swab32(nonce);
W[9] = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(nonce));
W[9] = cuda_swab64(W[9]);
W[10] = 0x8000000000000000;
#pragma unroll
for (int i = 11; i<15; i++) {
W[i] = 0U;
}
W[15] = 0x0000000000000280;
#pragma unroll 64
for (int i = 16; i < 80; i ++) {
W[i] = SSG5_1(W[i-2]) + W[i-7];
W[i] += SSG5_0(W[i-15]) + W[i-16];
}
const uint64_t IV512[8] = {
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
uint64_t r[8];
#pragma unroll
for (int i = 0; i < 8; i++) {
r[i] = IV512[i];
}
#pragma unroll
for (int i = 0; i < 80; i++) {
SHA3_STEP(c_WB, r, W, i&7, i);
}
const uint64_t hashPosition = thread;
uint64_t *pHash = &g_hash[hashPosition << 3];
#pragma unroll
for (int u = 0; u < 8; u ++) {
pHash[u] = SWAP64(r[u] + IV512[u]);
}
}
}
__host__
void x16_sha512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash)
{
const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
x16_sha512_gpu_hash_80 <<<grid, block >>> (threads, startNounce, (uint64_t*)d_hash);
}
__host__
void x16_sha512_setBlock_80(void *pdata)
{
cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
}
Loading…
Cancel
Save