Browse Source

x16r algo and new kernels

Was a very long work but finally working,
and unlike xevan these new kernels are reusable..

Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
pull/4/head
Tanguy Pruvot 6 years ago
parent
commit
78dad7dd65
  1. 2
      Makefile.am
  2. 6
      README.txt
  3. 2
      algos.h
  4. 1
      bench.cpp
  5. 5
      ccminer.cpp
  6. 12
      ccminer.vcxproj
  7. 26
      ccminer.vcxproj.filters
  8. 2
      compat/ccminer-config.h
  9. 3
      miner.h
  10. 8
      res/ccminer.rc
  11. 3
      util.cpp
  12. 715
      x13/cuda_x13_hamsi512.cu
  13. 64
      x15/cuda_x15_whirlpool_sm3.cu
  14. 214
      x16r/cuda_x16_echo512.cu
  15. 467
      x16r/cuda_x16_fugue512.cu
  16. 350
      x16r/cuda_x16_shabal512.cu
  17. 1836
      x16r/cuda_x16_simd512_80.cu
  18. 75
      x16r/cuda_x16r.h
  19. 625
      x16r/x16r.cu
  20. 77
      x17/cuda_x17_sha512.cu

2
Makefile.am

@ -76,6 +76,8 @@ ccminer_SOURCES = elist.h miner.h compat.h \ @@ -76,6 +76,8 @@ ccminer_SOURCES = elist.h miner.h compat.h \
x13/hsr.cu x13/cuda_hsr_sm3.cu x13/sm3.c \
x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \
x15/whirlpool.cu x15/cuda_x15_whirlpool_sm3.cu \
x16r/x16r.cu x16r/cuda_x16_echo512.cu x16r/cuda_x16_fugue512.cu \
x16r/cuda_x16_shabal512.cu x16r/cuda_x16_simd512_80.cu \
x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \
x11/phi.cu x11/cuda_streebog_maxwell.cu \
x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu

6
README.txt

@ -1,5 +1,5 @@ @@ -1,5 +1,5 @@
ccminer 2.2.4 (Jan. 2018) "lyra2v2 and keccak improvements"
ccminer 2.2.5 (Feb 2018) "x16r algo"
---------------------------------------------------------------
***************************************************************
@ -122,6 +122,7 @@ its command line interface and options. @@ -122,6 +122,7 @@ its command line interface and options.
x11 use to mine DarkCoin
x14 use to mine X14Coin
x15 use to mine Halcyon
x16r use to mine Raven
x17 use to mine X17
vanilla use to mine Vanilla (Blake256)
veltor use to mine VeltorCoin
@ -277,6 +278,9 @@ so we can more efficiently implement new algorithms using the latest hardware @@ -277,6 +278,9 @@ so we can more efficiently implement new algorithms using the latest hardware
features.
>>> RELEASE HISTORY <<<
Feb. 2017 v2.2.5
New x16r algo
Jan. 04th 2017 v2.2.4
Improve lyra2v2
Higher keccak default intensity

2
algos.h

@ -59,6 +59,7 @@ enum sha_algos { @@ -59,6 +59,7 @@ enum sha_algos {
ALGO_X13,
ALGO_X14,
ALGO_X15,
ALGO_X16R,
ALGO_X17,
ALGO_VANILLA,
ALGO_VELTOR,
@ -128,6 +129,7 @@ static const char *algo_names[] = { @@ -128,6 +129,7 @@ static const char *algo_names[] = {
"x13",
"x14",
"x15",
"x16r",
"x17",
"vanilla",
"veltor",

1
bench.cpp

@ -102,6 +102,7 @@ void algo_free_all(int thr_id) @@ -102,6 +102,7 @@ void algo_free_all(int thr_id)
free_x13(thr_id);
free_x14(thr_id);
free_x15(thr_id);
free_x16r(thr_id);
free_x17(thr_id);
free_zr5(thr_id);
free_scrypt(thr_id);

5
ccminer.cpp

@ -293,6 +293,7 @@ Options:\n\ @@ -293,6 +293,7 @@ Options:\n\
x13 X13 (MaruCoin)\n\
x14 X14\n\
x15 X15\n\
x16r X16R (Raven)\n\
x17 X17\n\
wildkeccak Boolberry\n\
zr5 ZR5 (ZiftrCoin)\n\
@ -1705,6 +1706,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work) @@ -1705,6 +1706,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
case ALGO_LYRA2Z:
case ALGO_TIMETRAVEL:
case ALGO_BITCORE:
case ALGO_X16R:
work_set_target(work, sctx->job.diff / (256.0 * opt_difficulty));
break;
case ALGO_KECCAK:
@ -2499,6 +2501,9 @@ static void *miner_thread(void *userdata) @@ -2499,6 +2501,9 @@ static void *miner_thread(void *userdata)
case ALGO_X15:
rc = scanhash_x15(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_X16R:
rc = scanhash_x16r(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_X17:
rc = scanhash_x17(thr_id, &work, max_nonce, &hashes_done);
break;

12
ccminer.vcxproj

@ -269,6 +269,7 @@ @@ -269,6 +269,7 @@
<ClCompile Include="neoscrypt\neoscrypt-cpu.c" />
<ClInclude Include="neoscrypt\cuda_vectors.h" />
<ClInclude Include="x11\cuda_x11_simd512_sm2.cuh" />
<ClInclude Include="x16r\cuda_x16r.h" />
<CudaCompile Include="Algo256\bmw.cu" />
<CudaCompile Include="Algo256\cuda_bmw.cu">
<MaxRegCount>76</MaxRegCount>
@ -434,6 +435,12 @@ @@ -434,6 +435,12 @@
<CudaCompile Include="sha256\sha256d.cu" />
<CudaCompile Include="sha256\cuda_sha256t.cu" />
<CudaCompile Include="sha256\sha256t.cu" />
<CudaCompile Include="x15\cuda_x15_whirlpool_sm3.cu" />
<CudaCompile Include="x16r\x16r.cu" />
<CudaCompile Include="x16r\cuda_x16_echo512.cu" />
<CudaCompile Include="x16r\cuda_x16_fugue512.cu" />
<CudaCompile Include="x16r\cuda_x16_shabal512.cu" />
<CudaCompile Include="x16r\cuda_x16_simd512_80.cu" />
<CudaCompile Include="zr5.cu" />
<CudaCompile Include="heavy\cuda_blake512.cu">
</CudaCompile>
@ -587,8 +594,7 @@ @@ -587,8 +594,7 @@
<CudaCompile Include="x17\hmq17.cu" />
<CudaCompile Include="x15\x15.cu" />
<CudaCompile Include="x15\whirlpool.cu" />
<CudaCompile Include="x17\x17.cu">
</CudaCompile>
<CudaCompile Include="x17\x17.cu" />
<CudaCompile Include="x17\cuda_x17_haval256.cu">
</CudaCompile>
<CudaCompile Include="x17\cuda_x17_sha512.cu">
@ -615,4 +621,4 @@ @@ -615,4 +621,4 @@
<Target Name="AfterClean">
<Delete Files="@(FilesToCopy->'$(OutDir)%(Filename)%(Extension)')" TreatErrorsAsWarnings="true" />
</Target>
</Project>
</Project>

26
ccminer.vcxproj.filters

@ -58,6 +58,9 @@ @@ -58,6 +58,9 @@
<Filter Include="Source Files\CUDA\x15">
<UniqueIdentifier>{a2403c22-6777-46ab-a55a-3fcc7386c974}</UniqueIdentifier>
</Filter>
<Filter Include="Source Files\CUDA\x16r">
<UniqueIdentifier>{55dfae6a-66ba-43e2-8ceb-98ee70cbdf16}</UniqueIdentifier>
</Filter>
<Filter Include="Source Files\CUDA\x17">
<UniqueIdentifier>{85dfae6a-66ca-4332-8cec-98ee70cbdf2f}</UniqueIdentifier>
</Filter>
@ -596,6 +599,9 @@ @@ -596,6 +599,9 @@
<ClInclude Include="equi\equihash.h">
<Filter>Source Files\equi</Filter>
</ClInclude>
<ClInclude Include="x16r\cuda_x16r.h">
<Filter>Header Files\CUDA</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<CudaCompile Include="cuda.cpp">
@ -967,6 +973,24 @@ @@ -967,6 +973,24 @@
<CudaCompile Include="equi\cuda_equi.cu">
<Filter>Source Files\equi</Filter>
</CudaCompile>
<CudaCompile Include="x15\cuda_x15_whirlpool_sm3.cu">
<Filter>Source Files\CUDA\x15</Filter>
</CudaCompile>
<CudaCompile Include="x16r\cuda_x16_echo512.cu">
<Filter>Source Files\CUDA\x16r</Filter>
</CudaCompile>
<CudaCompile Include="x16r\cuda_x16_fugue512.cu">
<Filter>Source Files\CUDA\x16r</Filter>
</CudaCompile>
<CudaCompile Include="x16r\cuda_x16_shabal512.cu">
<Filter>Source Files\CUDA\x16r</Filter>
</CudaCompile>
<CudaCompile Include="x16r\cuda_x16_simd512_80.cu">
<Filter>Source Files\CUDA\x16r</Filter>
</CudaCompile>
<CudaCompile Include="x16r\x16r.cu">
<Filter>Source Files\CUDA\x16r</Filter>
</CudaCompile>
</ItemGroup>
<ItemGroup>
<Image Include="res\ccminer.ico">
@ -983,4 +1007,4 @@ @@ -983,4 +1007,4 @@
<Filter>Ressources</Filter>
</Text>
</ItemGroup>
</Project>
</Project>

2
compat/ccminer-config.h

@ -164,7 +164,7 @@ @@ -164,7 +164,7 @@
#define PACKAGE_URL "http://github.com/tpruvot/ccminer"
/* Define to the version of this package. */
#define PACKAGE_VERSION "2.2.4"
#define PACKAGE_VERSION "2.2.5"
/* If using the C implementation of alloca, define if you know the
direction of stack growth for your system; otherwise it will be

3
miner.h

@ -325,6 +325,7 @@ extern int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, unsig @@ -325,6 +325,7 @@ extern int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, unsig
extern int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_x14(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_x15(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_zr5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
@ -389,6 +390,7 @@ extern void free_x11(int thr_id); @@ -389,6 +390,7 @@ extern void free_x11(int thr_id);
extern void free_x13(int thr_id);
extern void free_x14(int thr_id);
extern void free_x15(int thr_id);
extern void free_x16r(int thr_id);
extern void free_x17(int thr_id);
extern void free_zr5(int thr_id);
//extern void free_sha256d(int thr_id);
@ -933,6 +935,7 @@ void x11hash(void *output, const void *input); @@ -933,6 +935,7 @@ void x11hash(void *output, const void *input);
void x13hash(void *output, const void *input);
void x14hash(void *output, const void *input);
void x15hash(void *output, const void *input);
void x16r_hash(void *output, const void *input);
void x17hash(void *output, const void *input);
void wildkeccak_hash(void *output, const void *input, uint64_t* scratchpad, uint64_t ssize);
void zr5hash(void *output, const void *input);

8
res/ccminer.rc

@ -60,8 +60,8 @@ IDI_ICON1 ICON "ccminer.ico" @@ -60,8 +60,8 @@ IDI_ICON1 ICON "ccminer.ico"
//
VS_VERSION_INFO VERSIONINFO
FILEVERSION 2,2,4,0
PRODUCTVERSION 2,2,4,0
FILEVERSION 2,2,5,0
PRODUCTVERSION 2,2,5,0
FILEFLAGSMASK 0x3fL
#ifdef _DEBUG
FILEFLAGS 0x21L
@ -76,10 +76,10 @@ BEGIN @@ -76,10 +76,10 @@ BEGIN
BEGIN
BLOCK "040904e4"
BEGIN
VALUE "FileVersion", "2.2.4"
VALUE "FileVersion", "2.2.5"
VALUE "LegalCopyright", "Copyright (C) 2018"
VALUE "ProductName", "ccminer"
VALUE "ProductVersion", "2.2.4"
VALUE "ProductVersion", "2.2.5"
END
END
BLOCK "VarFileInfo"

3
util.cpp

@ -2325,6 +2325,9 @@ void print_hash_tests(void) @@ -2325,6 +2325,9 @@ void print_hash_tests(void)
x15hash(&hash[0], &buf[0]);
printpfx("X15", hash);
x16r_hash(&hash[0], &buf[0]);
printpfx("X16r", hash);
x17hash(&hash[0], &buf[0]);
printpfx("X17", hash);

715
x13/cuda_x13_hamsi512.cu

@ -1,6 +1,6 @@ @@ -1,6 +1,6 @@
/*
* Quick Hamsi-512 for X13
* by tsiv - 2014
* Quick Hamsi-512 for X13 by tsiv - 2014
* + Hamsi-512 80 by tpruvot - 2018
*/
#include <stdio.h>
@ -16,31 +16,17 @@ static __constant__ uint32_t d_alpha_f[32]; @@ -16,31 +16,17 @@ static __constant__ uint32_t d_alpha_f[32];
static __constant__ uint32_t d_T512[64][16];
static const uint32_t alpha_n[] = {
SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00),
SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc),
SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0),
SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0),
SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0),
SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00),
SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc),
SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0),
SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0),
SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0)
0xff00f0f0, 0xccccaaaa, 0xf0f0cccc, 0xff00aaaa, 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, 0xf0f0ff00,
0xf0f0cccc, 0xaaaaff00, 0xccccff00, 0xaaaaf0f0, 0xaaaaf0f0, 0xff00cccc, 0xccccf0f0, 0xff00aaaa,
0xccccaaaa, 0xff00f0f0, 0xff00aaaa, 0xf0f0cccc, 0xf0f0ff00, 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc,
0xaaaaff00, 0xf0f0cccc, 0xaaaaf0f0, 0xccccff00, 0xff00cccc, 0xaaaaf0f0, 0xff00aaaa, 0xccccf0f0
};
static const uint32_t alpha_f[] = {
SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0),
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9),
SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0),
SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c),
SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c),
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c),
SPH_C32(0xcaf9f9c0), SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9),
SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0),
SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c),
SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c),
SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c)
0xcaf9639c, 0x0ff0f9c0, 0x639c0ff0, 0xcaf9f9c0, 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0, 0x639ccaf9,
0x639c0ff0, 0xf9c0caf9, 0x0ff0caf9, 0xf9c0639c, 0xf9c0639c, 0xcaf90ff0, 0x0ff0639c, 0xcaf9f9c0,
0x0ff0f9c0, 0xcaf9639c, 0xcaf9f9c0, 0x639c0ff0, 0x639ccaf9, 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0,
0xf9c0caf9, 0x639c0ff0, 0xf9c0639c, 0x0ff0caf9, 0xcaf90ff0, 0xf9c0639c, 0xcaf9f9c0, 0x0ff0639c
};
#define hamsi_s00 m0
@ -200,390 +186,134 @@ static const uint32_t alpha_f[] = { @@ -200,390 +186,134 @@ static const uint32_t alpha_f[] = {
static const uint32_t T512[64][16] = {
{ SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000),
SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9),
SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030),
SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000),
SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984),
SPH_C32(0x9e69af68) },
{ SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000),
SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137),
SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240),
SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000),
SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5),
SPH_C32(0x0c26f262) },
{ SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000),
SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78),
SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400),
SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000),
SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f),
SPH_C32(0xdc24e61f) },
{ SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000),
SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549),
SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800),
SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000),
SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f),
SPH_C32(0x3daac2da) },
{ SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000),
SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1),
SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800),
SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000),
SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da),
SPH_C32(0x78cace29) },
{ SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000),
SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4),
SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400),
SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000),
SPH_C32(0xea982435), SPH_C32(0x75b11115), SPH_C32(0x28b67247),
SPH_C32(0x2dd1f9ab) },
{ SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000),
SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745),
SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00),
SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000),
SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f),
SPH_C32(0xbf2c0be2) },
{ SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000),
SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93),
SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000),
SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000),
SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36),
SPH_C32(0x32219526) },
{ SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000),
SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae),
SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001),
SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000),
SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f),
SPH_C32(0xac8e6c88) },
{ SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000),
SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3),
SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004),
SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000),
SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96),
SPH_C32(0x7b1bd6b9) },
{ SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000),
SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba),
SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000),
SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000),
SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604),
SPH_C32(0xf746c320) },
{ SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000),
SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c),
SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009),
SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000),
SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a),
SPH_C32(0x69505b3a) },
{ SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000),
SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25),
SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050),
SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000),
SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2),
SPH_C32(0x8a341574) },
{ SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000),
SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19),
SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0),
SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000),
SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc),
SPH_C32(0x450360bf) },
{ SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000),
SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543),
SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060),
SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000),
SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d),
SPH_C32(0xf3d45758) },
{ SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000),
SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825),
SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110),
SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000),
SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25),
SPH_C32(0x925c44e9) },
{ SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000),
SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514),
SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000),
SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000),
SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315),
SPH_C32(0xa123ff9f) },
{ SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000),
SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860),
SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000),
SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000),
SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e),
SPH_C32(0x1568ff0f) },
{ SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000),
SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6),
SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000),
SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000),
SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616),
SPH_C32(0xc5c1eb3e) },
{ SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000),
SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145),
SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000),
SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000),
SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6),
SPH_C32(0x1af21fe1) },
{ SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000),
SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae),
SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000),
SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000),
SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17),
SPH_C32(0x857f3c2b) },
{ SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000),
SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2),
SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000),
SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000),
SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94),
SPH_C32(0x2ba05a55) },
{ SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000),
SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757),
SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001),
SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000),
SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba),
SPH_C32(0xfeabf254) },
{ SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000),
SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5),
SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002),
SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000),
SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7),
SPH_C32(0xfe1cdc7f) },
{ SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000),
SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea),
SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000),
SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000),
SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea),
SPH_C32(0xb0a51834) },
{ SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000),
SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460),
SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000),
SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000),
SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae),
SPH_C32(0xa6b8c28d) },
{ SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000),
SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75),
SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000),
SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000),
SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156),
SPH_C32(0x3a4e99d7) },
{ SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000),
SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15),
SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000),
SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000),
SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6),
SPH_C32(0xe1844257) },
{ SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000),
SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512),
SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000),
SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000),
SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37),
SPH_C32(0x2c3b504e) },
{ SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000),
SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856),
SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000),
SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000),
SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4),
SPH_C32(0x524a0d59) },
{ SPH_C32(0x69510000), SPH_C32(0xd4e1009c), SPH_C32(0xc3230000),
SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc),
SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000),
SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000),
SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88),
SPH_C32(0x378dd173) },
{ SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000),
SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f),
SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000),
SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000),
SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4),
SPH_C32(0x8b6c72bd) },
{ SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780),
SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418),
SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000),
SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000),
SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d),
SPH_C32(0x8e67b7fa) },
{ SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280),
SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc),
SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000),
SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000),
SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec),
SPH_C32(0x443d3004) },
{ SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80),
SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924),
SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000),
SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000),
SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a),
SPH_C32(0xf4f6ea7b) },
{ SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300),
SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667),
SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000),
SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000),
SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8),
SPH_C32(0x979961d0) },
{ SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380),
SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6),
SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000),
SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000),
SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812),
SPH_C32(0x98aa496e) },
{ SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180),
SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e),
SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000),
SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000),
SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec),
SPH_C32(0x094e3198) },
{ SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000),
SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736),
SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000),
SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000),
SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76),
SPH_C32(0xe86cba2e) },
{ SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000),
SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431),
SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000),
SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000),
SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9),
SPH_C32(0x4b7eec55) },
{ SPH_C32(0x58430000), SPH_C32(0x807e0000), SPH_C32(0x78330001),
SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd),
SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000),
SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800),
SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429),
SPH_C32(0x1e7536a6) },
{ SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000),
SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0),
SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000),
SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000),
SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46),
SPH_C32(0x24314f17) },
{ SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e),
SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d),
SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000),
SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000),
SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222),
SPH_C32(0x9075b1ce) },
{ SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a),
SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4),
SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000),
SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000),
SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa),
SPH_C32(0x9b6ef888) },
{ SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e),
SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167),
SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000),
SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000),
SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e),
SPH_C32(0xd8b61463) },
{ SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c),
SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43),
SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000),
SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000),
SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2),
SPH_C32(0x3ea660f7) },
{ SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e),
SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce),
SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000),
SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000),
SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018),
SPH_C32(0x7f975691) },
{ SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006),
SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9),
SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000),
SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000),
SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd),
SPH_C32(0x2c94459e) },
{ SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000),
SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da),
SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000),
SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0),
SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c),
SPH_C32(0x56a7b19f) },
{ SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000),
SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699),
SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000),
SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220),
SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8),
SPH_C32(0x81fdf908) },
{ SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000),
SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d),
SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000),
SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060),
SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06),
SPH_C32(0x5bd61539) },
{ SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000),
SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751),
SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000),
SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480),
SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f),
SPH_C32(0x15b961e7) },
{ SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000),
SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6),
SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000),
SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800),
SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14),
SPH_C32(0x2a2c18f0) },
{ SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000),
SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a),
SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000),
SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000),
SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23),
SPH_C32(0x551e3d6e) },
{ SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000),
SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da),
SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000),
SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000),
SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254),
SPH_C32(0x33c5244f) },
{ SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000),
SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199),
SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000),
SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800),
SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c),
SPH_C32(0x8a58e6a4) },
{ SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000),
SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f),
SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000),
SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002),
SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808),
SPH_C32(0xda878000) },
{ SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000),
SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a),
SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000),
SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005),
SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb),
SPH_C32(0x3c5dfffe) },
{ SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000),
SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e),
SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000),
SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003),
SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752),
SPH_C32(0x7b1675d7) },
{ SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000),
SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c),
SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000),
SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008),
SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3),
SPH_C32(0x2879ebac) },
{ SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000),
SPH_C32(0x64f30013), SPH_C32(0x257e86bf), SPH_C32(0x1311944e),
SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000),
SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001),
SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60),
SPH_C32(0xbe0a679e) },
{ SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000),
SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87),
SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000),
SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012),
SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf),
SPH_C32(0x30aebcf7) },
{ SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000),
SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57),
SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000),
SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0),
SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03),
SPH_C32(0xc7ff60f0) },
{ SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000),
SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb),
SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000),
SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140),
SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877),
SPH_C32(0xe7e00a94) }
{ 0xef0b0270, 0x3afd0000, 0x5dae0000, 0x69490000, 0x9b0f3c06, 0x4405b5f9, 0x66140a51, 0x924f5d0a, // 0
0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000, 0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68 },
{ 0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000, 0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68,
0x26600240, 0xddd80000, 0x722a0000, 0x4f060000, 0x936667ff, 0x29f944ce, 0x368b63d5, 0x0c26f262 },
{ 0x145a3c00, 0xb9e90000, 0x61270000, 0xf1610000, 0xce613d6c, 0xb0493d78, 0x47a96720, 0xe18e24c5,
0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000, 0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f },
{ 0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000, 0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f,
0x373d2800, 0x71500000, 0x95e00000, 0x0a140000, 0xbdac1909, 0x48ef9831, 0x456d6d1f, 0x3daac2da },
{ 0x54285c00, 0xeaed0000, 0xc5d60000, 0xa1c50000, 0xb3a26770, 0x94a5c4e1, 0x6bb0419d, 0x551b3782,
0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000, 0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29 },
{ 0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000, 0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29,
0xc8934400, 0x5a3e0000, 0x57870000, 0x4c560000, 0xea982435, 0x75b11115, 0x28b67247, 0x2dd1f9ab },
{ 0x29449c00, 0x64e70000, 0xf24b0000, 0xc2f30000, 0x0ede4e8f, 0x56c23745, 0xf3e04259, 0x8d0d9ec4,
0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000, 0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2 },
{ 0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000, 0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2, // 7
0x6f299000, 0x6c850000, 0x2f160000, 0x782e0000, 0x644c37cd, 0x12dd1cd6, 0xd26a8c36, 0x32219526 },
{ 0xf6800005, 0x3443c000, 0x24070000, 0x8f3d0000, 0x21373bfb, 0x0ab8d5ae, 0xcdc58b19, 0xd795ba31,
0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000, 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88 },
{ 0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000, 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88,
0x50ff0004, 0x45744000, 0x3dfb0000, 0x19e60000, 0x1bbc5606, 0xe1727b5d, 0xe1a8cc96, 0x7b1bd6b9 },
{ 0xf7750009, 0xcf3cc000, 0xc3d60000, 0x04920000, 0x029519a9, 0xf8e836ba, 0x7a87f14e, 0x9e16981a,
0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000, 0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320 },
{ 0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000, 0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320,
0x231f0009, 0x42f40000, 0x66790000, 0x4ebb0000, 0xfedb5bd3, 0x315cb0d6, 0xe2b1674a, 0x69505b3a },
{ 0x774400f0, 0xf15a0000, 0xf5b20000, 0x34140000, 0x89377e8c, 0x5a8bec25, 0x0bc3cd1e, 0xcf3775cb,
0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000, 0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574 },
{ 0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000, 0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574,
0x832800a0, 0x67420000, 0xe1170000, 0x370b0000, 0xcba30034, 0x3c34923c, 0x9767bdcc, 0x450360bf },
{ 0xe8870170, 0x9d720000, 0x12db0000, 0xd4220000, 0xf2886b27, 0xa921e543, 0x4ef8b518, 0x618813b1, // 14
0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000, 0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758 },
{ 0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000, 0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758,
0x5cb00110, 0x913e0000, 0x44190000, 0x888c0000, 0x66dc7418, 0x921f1d66, 0x55ceea25, 0x925c44e9 },
{ 0x0c720000, 0x49e50f00, 0x42790000, 0x5cea0000, 0x33aa301a, 0x15822514, 0x95a34b7b, 0xb44b0090,
0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000, 0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f },
{ 0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000, 0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f,
0xf2500000, 0xeebd0a00, 0x67a80000, 0xab8a0000, 0xba9b48c0, 0x0a56dd74, 0xdb73e86e, 0x1568ff0f },
{ 0x45180000, 0xa5b51700, 0xf96a0000, 0x3b480000, 0x1ecc142c, 0x231395d6, 0x16bca6b0, 0xdf33f4df,
0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000, 0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e },
{ 0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000, 0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e,
0xfd250000, 0xb3c41100, 0xcef00000, 0xcef90000, 0x3c4d7580, 0x8d5b6493, 0x7098b0a6, 0x1af21fe1 },
{ 0x75a40000, 0xc28b2700, 0x94a40000, 0x90f50000, 0xfb7857e0, 0x49ce0bae, 0x1767c483, 0xaedf667e,
0xd1660000, 0x1bbc0300, 0x9eec0000, 0xf6940000, 0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b },
{ 0xd1660000, 0x1bbc0300, 0x9eec0000, 0xf6940000, 0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b, // 21
0xa4c20000, 0xd9372400, 0x0a480000, 0x66610000, 0xf87a12c7, 0x86bef75c, 0xa324df94, 0x2ba05a55 },
{ 0x75c90003, 0x0e10c000, 0xd1200000, 0xbaea0000, 0x8bc42f3e, 0x8758b757, 0xbb28761d, 0x00b72e2b,
0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000, 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254 },
{ 0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000, 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254,
0x9b060002, 0x61468000, 0x221e0000, 0x1d740000, 0x36715d27, 0x30495c92, 0xf11336a7, 0xfe1cdc7f },
{ 0x86790000, 0x3f390002, 0xe19ae000, 0x98560000, 0x9565670e, 0x4e88c8ea, 0xd3dd4944, 0x161ddab9,
0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000, 0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834 },
{ 0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000, 0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834,
0xb6ce0000, 0xdae90002, 0x156e8000, 0xda920000, 0xf6dd5a64, 0x36325c8a, 0xf272e8ae, 0xa6b8c28d },
{ 0x14190000, 0x23ca003c, 0x50df0000, 0x44b60000, 0x1b6c67b0, 0x3cf3ac75, 0x61e610b0, 0xdbcadb80,
0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000, 0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7 },
{ 0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000, 0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7,
0xf75a0000, 0x19840028, 0xa2190000, 0xeef80000, 0xc0722516, 0x19981260, 0x73dba1e6, 0xe1844257 },
{ 0x54500000, 0x0671005c, 0x25ae0000, 0x6a1e0000, 0x2ea54edf, 0x664e8512, 0xbfba18c3, 0x7e715d17, // 28
0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000, 0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e },
{ 0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000, 0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e,
0xe8dd0000, 0xfa4a0044, 0x3c2d0000, 0xbb150000, 0x80bd361b, 0x24e81d44, 0xbfa8c2f4, 0x524a0d59 },
{ 0x69510000, 0xd4e1009c, 0xc3230000, 0xac2f0000, 0xe4950bae, 0xcea415dc, 0x87ec287c, 0xbce1a3ce,
0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000, 0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173 },
{ 0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000, 0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173,
0xaf220000, 0x7b6c0090, 0x67e20000, 0x8da20000, 0xc7841e29, 0xb7b744f3, 0x9ac484f4, 0x8b6c72bd },
{ 0xcc140000, 0xa5630000, 0x5ab90780, 0x3b500000, 0x4bd013ff, 0x879b3418, 0x694348c1, 0xca5a87fe,
0x819e0000, 0xec570000, 0x66320280, 0x95f30000, 0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa },
{ 0x819e0000, 0xec570000, 0x66320280, 0x95f30000, 0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa,
0x4d8a0000, 0x49340000, 0x3c8b0500, 0xaea30000, 0x16793bfd, 0xcf6f08a4, 0x8f19eaec, 0x443d3004 },
{ 0x78230000, 0x12fc0000, 0xa93a0b80, 0x90a50000, 0x713e2879, 0x7ee98924, 0xf08ca062, 0x636f8bab,
0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000, 0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b },
{ 0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000, 0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b, // 35
0x7a8c0000, 0xa5d40000, 0x13260880, 0xc63d0000, 0xcbb36daa, 0xfea14f43, 0x59d0b4f8, 0x979961d0 },
{ 0xac480000, 0x1ba60000, 0x45fb1380, 0x03430000, 0x5a85316a, 0x1fb250b6, 0xfe72c7fe, 0x91e478f6,
0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000, 0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e },
{ 0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000, 0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e,
0xb2060000, 0xc5690000, 0x28031200, 0x74670000, 0xb6c236f4, 0xeb1239f8, 0x33d1dfec, 0x094e3198 },
{ 0xaec30000, 0x9c4f0001, 0x79d1e000, 0x2c150000, 0x45cc75b3, 0x6650b736, 0xab92f78f, 0xa312567b,
0xdb250000, 0x09290000, 0x49aac000, 0x81e10000, 0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e },
{ 0xdb250000, 0x09290000, 0x49aac000, 0x81e10000, 0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e,
0x75e60000, 0x95660001, 0x307b2000, 0xadf40000, 0x8f321eea, 0x24298307, 0xe8c49cf9, 0x4b7eec55 },
{ 0x58430000, 0x807e0000, 0x78330001, 0xc66b3800, 0xe7375cdc, 0x79ad3fdd, 0xac73fe6f, 0x3a4479b1,
0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800, 0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6 },
{ 0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800, 0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6,
0x45190000, 0xab0c0000, 0x30be0001, 0x690a2000, 0xc2fc7219, 0xb1d4800d, 0x2dd1fa46, 0x24314f17 },
{ 0xa53b0000, 0x14260000, 0x4e30001e, 0x7cae0000, 0x8f9e0dd5, 0x78dfaa3d, 0xf73168d8, 0x0b1b4946, // 42
0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000, 0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce },
{ 0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000, 0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce,
0xa2d60000, 0xa6760000, 0xc9440014, 0xeba30000, 0xccec2e7b, 0x3018c499, 0x03490afa, 0x9b6ef888 },
{ 0x88980000, 0x1f940000, 0x7fcf002e, 0xfb4e0000, 0xf158079a, 0x61ae9167, 0xa895706c, 0xe6107494,
0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000, 0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463 },
{ 0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000, 0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463,
0x835a0000, 0xc4f70000, 0x01470022, 0xeec80000, 0x60a54f69, 0x142f2a24, 0x5cf534f2, 0x3ea660f7 },
{ 0x52500000, 0x29540000, 0x6a61004e, 0xf0ff0000, 0x9a317eec, 0x452341ce, 0xcf568fe5, 0x5303130f,
0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000, 0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691 },
{ 0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000, 0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691,
0x01dd0000, 0x80a80000, 0xf4960048, 0xa6000000, 0x90d57ea2, 0xd7e68c37, 0x6612cffd, 0x2c94459e },
{ 0xe6280000, 0x4c4b0000, 0xa8550000, 0xd3d002e0, 0xd86130b8, 0x98a7b0da, 0x289506b4, 0xd75a4897,
0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0, 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f },
{ 0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0, 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f, // 49
0x16ed0000, 0x15680000, 0xedd70000, 0x325d0220, 0xe30c3689, 0x5a4ae643, 0xe375f8a8, 0x81fdf908 },
{ 0xb4310000, 0x77330000, 0xb15d0000, 0x7fd004e0, 0x78a26138, 0xd116c35d, 0xd256d489, 0x4e6f74de,
0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060, 0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539 },
{ 0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060, 0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539,
0x57370000, 0xcaf20000, 0x364e0000, 0xc0220480, 0x56186b22, 0x5ca3f40c, 0xa1937f8f, 0x15b961e7 },
{ 0x02f20000, 0xa2810000, 0x873f0000, 0xe36c7800, 0x1e1d74ef, 0x073d2bd6, 0xc4c23237, 0x7f32259e,
0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800, 0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0 },
{ 0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800, 0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0,
0xb82f0000, 0xb12c0000, 0x30d80000, 0x14445000, 0xc15860a2, 0x3127e8ec, 0x2e98bf23, 0x551e3d6e },
{ 0x1e6c0000, 0xc4420000, 0x8a2e0000, 0xbcb6b800, 0x2c4413b6, 0x8bfdd3da, 0x6a0c1bc8, 0xb99dc2eb,
0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000, 0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f },
{ 0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000, 0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f,
0x8c3a0000, 0xda980000, 0x607f0000, 0x54078800, 0x85714513, 0x6006b243, 0xdb50399c, 0x8a58e6a4 },
{ 0x033d0000, 0x08b30000, 0xf33a0000, 0x3ac20007, 0x51298a50, 0x6b6e661f, 0x0ea5cfe3, 0xe6da7ffe, // 56
0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002, 0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000 },
{ 0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002, 0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000,
0xabe70000, 0x9e0d0000, 0xaf270000, 0x3d180005, 0x2c4f1fd3, 0x74f61695, 0xb5c347eb, 0x3c5dfffe },
{ 0x01930000, 0xe7820000, 0xedfb0000, 0xcf0c000b, 0x8dd08d58, 0xbca3b42e, 0x063661e1, 0x536f9e7b,
0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003, 0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7 },
{ 0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003, 0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7,
0x93bb0000, 0x3b070000, 0xba010000, 0x99d00008, 0x3739ae4e, 0xe64c1722, 0x96f896b3, 0x2879ebac },
{ 0x5fa80000, 0x56030000, 0x43ae0000, 0x64f30013, 0x257e86bf, 0x1311944e, 0x541e95bf, 0x8ea4db69,
0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001, 0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e },
{ 0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001, 0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e,
0x5fec0000, 0x294b0000, 0x99d20000, 0x4ed00012, 0x1ed34f73, 0xbaa708c9, 0x57140bdf, 0x30aebcf7 },
{ 0xee930000, 0xd6070000, 0x92c10000, 0x2b9801e0, 0x9451287c, 0x3b6cfb57, 0x45312374, 0x201f6a64,
0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0, 0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0 },
{ 0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0, 0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0,
0x95bb0000, 0x81450000, 0x3b240000, 0x48db0140, 0x0a8a6c53, 0x56f56eec, 0x62c91877, 0xe7e00a94 }
};
__global__
@ -598,12 +328,12 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t * @@ -598,12 +328,12 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *
uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
unsigned char *h1 = (unsigned char *)Hash;
uint32_t c0 = SPH_C32(0x73746565), c1 = SPH_C32(0x6c706172), c2 = SPH_C32(0x6b204172), c3 = SPH_C32(0x656e6265);
uint32_t c4 = SPH_C32(0x72672031), c5 = SPH_C32(0x302c2062), c6 = SPH_C32(0x75732032), c7 = SPH_C32(0x3434362c);
uint32_t c8 = SPH_C32(0x20422d33), c9 = SPH_C32(0x30303120), cA = SPH_C32(0x4c657576), cB = SPH_C32(0x656e2d48);
uint32_t cC = SPH_C32(0x65766572), cD = SPH_C32(0x6c65652c), cE = SPH_C32(0x2042656c), cF = SPH_C32(0x6769756d);
uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, mD, mE, mF;
uint32_t c0 = 0x73746565, c1 = 0x6c706172, c2 = 0x6b204172, c3 = 0x656e6265;
uint32_t c4 = 0x72672031, c5 = 0x302c2062, c6 = 0x75732032, c7 = 0x3434362c;
uint32_t c8 = 0x20422d33, c9 = 0x30303120, cA = 0x4c657576, cB = 0x656e2d48;
uint32_t cC = 0x65766572, cD = 0x6c65652c, cE = 0x2042656c, cF = 0x6769756d;
uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, mD, mE, mF;
uint32_t *tp, db, dm;
for(int i = 0; i < 64; i += 8) {
@ -637,16 +367,16 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t * @@ -637,16 +367,16 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *
T_BIG;
}
// precomputed for 64 bytes blocks ?
tp = &d_T512[0][0] + 112;
m0 = *(tp+ 0); m1 = *(tp+ 1);
m2 = *(tp+ 2); m3 = *(tp+ 3);
m4 = *(tp+ 4); m5 = *(tp+ 5);
m6 = *(tp+ 6); m7 = *(tp+ 7);
m8 = *(tp+ 8); m9 = *(tp+ 9);
mA = *(tp+10); mB = *(tp+11);
mC = *(tp+12); mD = *(tp+13);
mE = *(tp+14); mF = *(tp+15);
m0 = tp[ 0]; m1 = tp[ 1];
m2 = tp[ 2]; m3 = tp[ 3];
m4 = tp[ 4]; m5 = tp[ 5];
m6 = tp[ 6]; m7 = tp[ 7];
m8 = tp[ 8]; m9 = tp[ 9];
mA = tp[10]; mB = tp[11];
mC = tp[12]; mD = tp[13];
mE = tp[14]; mF = tp[15];
for( int r = 0; r < 6; r += 2 ) {
ROUND_BIG(r, d_alpha_n);
@ -655,15 +385,14 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t * @@ -655,15 +385,14 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *
T_BIG;
tp = &d_T512[0][0] + 784;
m0 = *(tp+ 0); m1 = *(tp+ 1);
m2 = *(tp+ 2); m3 = *(tp+ 3);
m4 = *(tp+ 4); m5 = *(tp+ 5);
m6 = *(tp+ 6); m7 = *(tp+ 7);
m8 = *(tp+ 8); m9 = *(tp+ 9);
mA = *(tp+10); mB = *(tp+11);
mC = *(tp+12); mD = *(tp+13);
mE = *(tp+14); mF = *(tp+15);
m0 = tp[ 0]; m1 = tp[ 1];
m2 = tp[ 2]; m3 = tp[ 3];
m4 = tp[ 4]; m5 = tp[ 5];
m6 = tp[ 6]; m7 = tp[ 7];
m8 = tp[ 8]; m9 = tp[ 9];
mA = tp[10]; mB = tp[11];
mC = tp[12]; mD = tp[13];
mE = tp[14]; mF = tp[15];
for( int r = 0; r < 12; r += 2 ) {
ROUND_BIG(r, d_alpha_f);
@ -696,3 +425,127 @@ void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce @@ -696,3 +425,127 @@ void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce
x13_hamsi512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
//MyStreamSynchronize(NULL, order, thr_id);
}
__constant__ static uint64_t c_PaddedMessage80[10];
__host__
void x16_hamsi512_setBlock_80(void *pdata)
{
cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
}
__global__
void x16_hamsi512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
unsigned char h1[80];
#pragma unroll
for (int i = 0; i < 10; i++)
((uint2*)h1)[i] = ((uint2*)c_PaddedMessage80)[i];
//((uint64_t*)h1)[9] = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(startNonce + thread));
((uint32_t*)h1)[19] = cuda_swab32(startNonce + thread);
uint32_t c0 = 0x73746565, c1 = 0x6c706172, c2 = 0x6b204172, c3 = 0x656e6265;
uint32_t c4 = 0x72672031, c5 = 0x302c2062, c6 = 0x75732032, c7 = 0x3434362c;
uint32_t c8 = 0x20422d33, c9 = 0x30303120, cA = 0x4c657576, cB = 0x656e2d48;
uint32_t cC = 0x65766572, cD = 0x6c65652c, cE = 0x2042656c, cF = 0x6769756d;
uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, mD, mE, mF;
uint32_t *tp, db, dm;
for(int i = 0; i < 80; i += 8)
{
m0 = 0; m1 = 0; m2 = 0; m3 = 0; m4 = 0; m5 = 0; m6 = 0; m7 = 0;
m8 = 0; m9 = 0; mA = 0; mB = 0; mC = 0; mD = 0; mE = 0; mF = 0;
tp = &d_T512[0][0];
#pragma unroll
for (int u = 0; u < 8; u++) {
db = h1[i + u];
#pragma unroll 2
for (int v = 0; v < 8; v++, db >>= 1) {
dm = -(uint32_t)(db & 1);
m0 ^= dm & tp[ 0]; m1 ^= dm & tp[ 1];
m2 ^= dm & tp[ 2]; m3 ^= dm & tp[ 3];
m4 ^= dm & tp[ 4]; m5 ^= dm & tp[ 5];
m6 ^= dm & tp[ 6]; m7 ^= dm & tp[ 7];
m8 ^= dm & tp[ 8]; m9 ^= dm & tp[ 9];
mA ^= dm & tp[10]; mB ^= dm & tp[11];
mC ^= dm & tp[12]; mD ^= dm & tp[13];
mE ^= dm & tp[14]; mF ^= dm & tp[15];
tp += 16;
}
}
#pragma unroll
for (int r = 0; r < 6; r++) {
ROUND_BIG(r, d_alpha_n);
}
T_BIG;
}
#define INPUT_BIG { \
m0 = 0; m1 = 0; m2 = 0; m3 = 0; m4 = 0; m5 = 0; m6 = 0; m7 = 0; \
m8 = 0; m9 = 0; mA = 0; mB = 0; mC = 0; mD = 0; mE = 0; mF = 0; \
tp = &d_T512[0][0]; \
for (int u = 0; u < 8; u++) { \
db = endtag[u]; \
for (int v = 0; v < 8; v++, db >>= 1) { \
dm = -(uint32_t)(db & 1); \
m0 ^= dm & tp[ 0]; m1 ^= dm & tp[ 1]; \
m2 ^= dm & tp[ 2]; m3 ^= dm & tp[ 3]; \
m4 ^= dm & tp[ 4]; m5 ^= dm & tp[ 5]; \
m6 ^= dm & tp[ 6]; m7 ^= dm & tp[ 7]; \
m8 ^= dm & tp[ 8]; m9 ^= dm & tp[ 9]; \
mA ^= dm & tp[10]; mB ^= dm & tp[11]; \
mC ^= dm & tp[12]; mD ^= dm & tp[13]; \
mE ^= dm & tp[14]; mF ^= dm & tp[15]; \
tp += 16; \
} \
} \
}
// close
uint8_t endtag[8] = { 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
INPUT_BIG;
#pragma unroll
for (int r = 0; r < 6; r++) {
ROUND_BIG(r, d_alpha_n);
}
T_BIG;
endtag[0] = endtag[1] = 0x00;
endtag[6] = 0x02;
endtag[7] = 0x80;
INPUT_BIG;
// PF_BIG
#pragma unroll
for(int r = 0; r < 12; r++) {
ROUND_BIG(r, d_alpha_f);
}
T_BIG;
uint64_t hashPosition = thread;
uint32_t *Hash = (uint32_t*)&g_hash[hashPosition << 3];
#pragma unroll 16
for(int i = 0; i < 16; i++)
Hash[i] = cuda_swab32(h[i]);
#undef INPUT_BIG
}
}
__host__
void x16_hamsi512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash)
{
const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
x16_hamsi512_gpu_hash_80 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash);
}

64
x15/cuda_x15_whirlpool_sm3.cu

@ -1998,7 +1998,7 @@ const int i0, const int i1, const int i2, const int i3, const int i4, const int @@ -1998,7 +1998,7 @@ const int i0, const int i1, const int i2, const int i3, const int i4, const int
__global__
void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash, int swab)
void oldwhirlpool_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, void *outputHash, int swab)
{
__shared__ uint64_t sharedMemory[2048];
@ -2014,7 +2014,8 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp @@ -2014,7 +2014,8 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x];
#endif
}
__threadfence_block(); // ensure shared mem is ready
//__threadfence_block(); // ensure shared mem is ready
__syncthreads();
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
@ -2028,7 +2029,8 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp @@ -2028,7 +2029,8 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
uint64_t state[8];
#pragma unroll 8
for (int i=0; i < 8; i++) {
state[i] = c_PaddedMessage80[i];
//state[i] = c_PaddedMessage80[i];
AS_UINT2(&state[i]) = AS_UINT2(&c_PaddedMessage80[i]);
}
#else
#pragma unroll 8
@ -2050,6 +2052,7 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp @@ -2050,6 +2052,7 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
state[i] = xor1(n[i],c_PaddedMessage80[i]);
}
#endif
/// round 2 ///////
//////////////////////////////////
n[0] = c_PaddedMessage80[8]; //read data
@ -2331,7 +2334,7 @@ extern uint32_t whirlpool512_finalhash_64(int thr_id, uint32_t threads, uint32_t @@ -2331,7 +2334,7 @@ extern uint32_t whirlpool512_finalhash_64(int thr_id, uint32_t threads, uint32_t
}
__host__
void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash)
void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash)
{
dim3 grid((threads + threadsperblock-1) / threadsperblock);
dim3 block(threadsperblock);
@ -2339,7 +2342,7 @@ void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNounce @@ -2339,7 +2342,7 @@ void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNounce
if (threads < 256)
applog(LOG_WARNING, "whirlpool requires a minimum of 256 threads to fetch constant tables!");
oldwhirlpool_gpu_hash_80<<<grid, block>>>(threads, startNounce, d_outputHash, 1);
oldwhirlpool_gpu_hash_80<<<grid, block>>>(threads, startNonce, d_outputHash, 1);
}
extern void whirl_midstate(void *state, const void *input);
@ -2363,3 +2366,54 @@ void whirlpool512_setBlock_80_sm3(void *pdata, const void *ptarget) @@ -2363,3 +2366,54 @@ void whirlpool512_setBlock_80_sm3(void *pdata, const void *ptarget)
cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 128, 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(pTarget, ptarget, 32, 0, cudaMemcpyHostToDevice);
}
// ------------------------------------------------------------------------------------------------
__host__
void x16_whirlpool512_init(int thr_id, uint32_t threads)
{
cudaMemcpyToSymbol(InitVector_RC, plain_RC, sizeof(plain_RC), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(mixTob0Tox, plain_T0, sizeof(plain_T0), 0, cudaMemcpyHostToDevice);
#if USE_ALL_TABLES
cudaMemcpyToSymbol(mixTob1Tox, plain_T1, (256 * 8), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(mixTob2Tox, plain_T2, (256 * 8), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(mixTob3Tox, plain_T3, (256 * 8), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(mixTob4Tox, plain_T4, (256 * 8), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(mixTob5Tox, plain_T5, (256 * 8), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(mixTob6Tox, plain_T6, (256 * 8), 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(mixTob7Tox, plain_T7, (256 * 8), 0, cudaMemcpyHostToDevice);
#endif
}
extern void whirlpool_midstate(void *state, const void *input);
__host__
void x16_whirlpool512_setBlock_80(void *pdata)
{
unsigned char PaddedMessage[128];
memcpy(PaddedMessage, pdata, 80);
memset(PaddedMessage + 80, 0, 48);
PaddedMessage[80] = 0x80; /* ending */
#if HOST_MIDSTATE
// compute constant first block
unsigned char midstate[64] = { 0 };
whirlpool_midstate(midstate, pdata);
memcpy(PaddedMessage, midstate, 64);
#endif
cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 128, 0, cudaMemcpyHostToDevice);
}
__host__
void x16_whirlpool512_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_outputHash)
{
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
if (threads < 256)
applog(LOG_WARNING, "whirlpool requires a minimum of 256 threads to fetch constant tables!");
oldwhirlpool_gpu_hash_80 <<<grid, block>>> (threads, startNonce, d_outputHash, 1);
}

214
x16r/cuda_x16_echo512.cu

@ -0,0 +1,214 @@ @@ -0,0 +1,214 @@
/**
* echo512-80 cuda kernel for X16R algorithm
*
* tpruvot 2018 - GPL code
*/
#include <stdio.h>
#include <memory.h>
#include "cuda_helper.h"
extern __device__ __device_builtin__ void __threadfence_block(void);
#include "../x11/cuda_x11_aes.cuh"
__device__ __forceinline__ void AES_2ROUND(const uint32_t* __restrict__ sharedMemory,
uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3,
uint32_t &k0)
{
uint32_t y0, y1, y2, y3;
aes_round(sharedMemory,
x0, x1, x2, x3,
k0,
y0, y1, y2, y3);
aes_round(sharedMemory,
y0, y1, y2, y3,
x0, x1, x2, x3);
k0++;
}
__device__
static void echo_round(uint32_t* const sharedMemory, uint32_t *W, uint32_t &k0)
{
// Big Sub Words
#pragma unroll 16
for (int idx = 0; idx < 16; idx++) {
AES_2ROUND(sharedMemory, W[(idx << 2) + 0], W[(idx << 2) + 1], W[(idx << 2) + 2], W[(idx << 2) + 3], k0);
}
// Shift Rows
#pragma unroll 4
for (int i = 0; i < 4; i++)
{
uint32_t t[4];
/// 1, 5, 9, 13
t[0] = W[i + 4];
t[1] = W[i + 8];
t[2] = W[i + 24];
t[3] = W[i + 60];
W[i + 4] = W[i + 20];
W[i + 8] = W[i + 40];
W[i + 24] = W[i + 56];
W[i + 60] = W[i + 44];
W[i + 20] = W[i + 36];
W[i + 40] = t[1];
W[i + 56] = t[2];
W[i + 44] = W[i + 28];
W[i + 28] = W[i + 12];
W[i + 12] = t[3];
W[i + 36] = W[i + 52];
W[i + 52] = t[0];
}
// Mix Columns
#pragma unroll 4
for (int i = 0; i < 4; i++)
{
#pragma unroll 4
for (int idx = 0; idx < 64; idx += 16)
{
uint32_t a[4];
a[0] = W[idx + i];
a[1] = W[idx + i + 4];
a[2] = W[idx + i + 8];
a[3] = W[idx + i + 12];
uint32_t ab = a[0] ^ a[1];
uint32_t bc = a[1] ^ a[2];
uint32_t cd = a[2] ^ a[3];
uint32_t t, t2, t3;
t = (ab & 0x80808080);
t2 = (bc & 0x80808080);
t3 = (cd & 0x80808080);
uint32_t abx = (t >> 7) * 27U ^ ((ab^t) << 1);
uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
W[idx + i] = bc ^ a[3] ^ abx;
W[idx + i + 4] = a[0] ^ cd ^ bcx;
W[idx + i + 8] = ab ^ a[3] ^ cdx;
W[idx + i + 12] = ab ^ a[2] ^ (abx ^ bcx ^ cdx);
}
}
}
__device__ __forceinline__
void cuda_echo_round_80(uint32_t *const __restrict__ sharedMemory, uint32_t *const __restrict__ data, const uint32_t nonce, uint32_t *hash)
{
uint32_t h[29]; // <= 127 bytes input
#pragma unroll 8
for (int i = 0; i < 18; i += 2)
AS_UINT2(&h[i]) = AS_UINT2(&data[i]);
h[18] = data[18];
h[19] = cuda_swab32(nonce);
h[20] = 0x80;
h[21] = h[22] = h[23] = h[24] = h[25] = h[26] = 0;
//((uint8_t*)h)[80] = 0x80;
//((uint8_t*)h)[128-17] = 0x02;
//((uint8_t*)h)[128-16] = 0x80;
//((uint8_t*)h)[128-15] = 0x02;
h[27] = 0x2000000;
h[28] = 0x280;
//h[29] = h[30] = h[31] = 0;
uint32_t k0 = 640; // bitlen
uint32_t W[64];
#pragma unroll 8
for (int i = 0; i < 32; i+=4) {
W[i] = 512; // L
W[i+1] = 0; // H
W[i+2] = 0; // X
W[i+3] = 0;
}
uint32_t Z[16];
#pragma unroll
for (int i = 0; i<16; i++) Z[i] = W[i];
#pragma unroll
for (int i = 32; i<61; i++) W[i] = h[i - 32];
#pragma unroll
for (int i = 61; i<64; i++) W[i] = 0;
for (int i = 0; i < 10; i++)
echo_round(sharedMemory, W, k0);
#pragma unroll 16
for (int i = 0; i < 16; i++) {
Z[i] ^= h[i] ^ W[i] ^ W[i + 32];
}
#pragma unroll 8
for (int i = 0; i < 16; i += 2)
AS_UINT2(&hash[i]) = AS_UINT2(&Z[i]);
}
__device__ __forceinline__
void echo_gpu_init(uint32_t *const __restrict__ sharedMemory)
{
/* each thread startup will fill a uint32 */
if (threadIdx.x < 128) {
sharedMemory[threadIdx.x] = d_AES0[threadIdx.x];
sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x];
sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x];
sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x];
sharedMemory[threadIdx.x + 64 * 2] = d_AES0[threadIdx.x + 64 * 2];
sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2];
sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2];
sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2];
}
}
__host__
void x16_echo512_cuda_init(int thr_id, const uint32_t threads)
{
aes_cpu_init(thr_id);
}
__constant__ static uint32_t c_PaddedMessage80[20];
__host__
void x16_echo512_setBlock_80(void *endiandata)
{
cudaMemcpyToSymbol(c_PaddedMessage80, endiandata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
}
__global__ __launch_bounds__(128, 7) /* will force 72 registers */
void x16_echo512_gpu_hash_80(uint32_t threads, uint32_t startNonce, uint64_t *g_hash)
{
__shared__ uint32_t sharedMemory[1024];
echo_gpu_init(sharedMemory);
__threadfence_block();
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint64_t hashPosition = thread;
uint32_t *pHash = (uint32_t*)&g_hash[hashPosition<<3];
cuda_echo_round_80(sharedMemory, c_PaddedMessage80, startNonce + thread, pHash);
}
}
__host__
void x16_echo512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash)
{
const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
x16_echo512_gpu_hash_80<<<grid, block>>>(threads, startNonce, (uint64_t*)d_hash);
}

467
x16r/cuda_x16_fugue512.cu

@ -0,0 +1,467 @@ @@ -0,0 +1,467 @@
#include <stdio.h>
#include <cuda_helper.h>
#define TPB 256
/*
* fugue512-80 x16r kernel implementation.
*
* ==========================(LICENSE BEGIN)============================
*
* Copyright (c) 2018 tpruvot
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* ===========================(LICENSE END)=============================
*/
#ifdef __INTELLISENSE__
#define __byte_perm(x, y, m) (x|y)
#define tex1Dfetch(t, n) (n)
#define __CUDACC__
#include <cuda_texture_types.h>
#endif
// store allocated textures device addresses
static unsigned int* d_textures[MAX_GPUS][1];
#define mixtab0(x) mixtabs[(x)]
#define mixtab1(x) mixtabs[(x)+256]
#define mixtab2(x) mixtabs[(x)+512]
#define mixtab3(x) mixtabs[(x)+768]
static texture<unsigned int, 1, cudaReadModeElementType> mixTab0Tex;
static const uint32_t mixtab0[] = {
0x63633297, 0x7c7c6feb, 0x77775ec7, 0x7b7b7af7, 0xf2f2e8e5, 0x6b6b0ab7, 0x6f6f16a7, 0xc5c56d39,
0x303090c0, 0x01010704, 0x67672e87, 0x2b2bd1ac, 0xfefeccd5, 0xd7d71371, 0xabab7c9a, 0x767659c3,
0xcaca4005, 0x8282a33e, 0xc9c94909, 0x7d7d68ef, 0xfafad0c5, 0x5959947f, 0x4747ce07, 0xf0f0e6ed,
0xadad6e82, 0xd4d41a7d, 0xa2a243be, 0xafaf608a, 0x9c9cf946, 0xa4a451a6, 0x727245d3, 0xc0c0762d,
0xb7b728ea, 0xfdfdc5d9, 0x9393d47a, 0x2626f298, 0x363682d8, 0x3f3fbdfc, 0xf7f7f3f1, 0xcccc521d,
0x34348cd0, 0xa5a556a2, 0xe5e58db9, 0xf1f1e1e9, 0x71714cdf, 0xd8d83e4d, 0x313197c4, 0x15156b54,
0x04041c10, 0xc7c76331, 0x2323e98c, 0xc3c37f21, 0x18184860, 0x9696cf6e, 0x05051b14, 0x9a9aeb5e,
0x0707151c, 0x12127e48, 0x8080ad36, 0xe2e298a5, 0xebeba781, 0x2727f59c, 0xb2b233fe, 0x757550cf,
0x09093f24, 0x8383a43a, 0x2c2cc4b0, 0x1a1a4668, 0x1b1b416c, 0x6e6e11a3, 0x5a5a9d73, 0xa0a04db6,
0x5252a553, 0x3b3ba1ec, 0xd6d61475, 0xb3b334fa, 0x2929dfa4, 0xe3e39fa1, 0x2f2fcdbc, 0x8484b126,
0x5353a257, 0xd1d10169, 0x00000000, 0xededb599, 0x2020e080, 0xfcfcc2dd, 0xb1b13af2, 0x5b5b9a77,
0x6a6a0db3, 0xcbcb4701, 0xbebe17ce, 0x3939afe4, 0x4a4aed33, 0x4c4cff2b, 0x5858937b, 0xcfcf5b11,
0xd0d0066d, 0xefefbb91, 0xaaaa7b9e, 0xfbfbd7c1, 0x4343d217, 0x4d4df82f, 0x333399cc, 0x8585b622,
0x4545c00f, 0xf9f9d9c9, 0x02020e08, 0x7f7f66e7, 0x5050ab5b, 0x3c3cb4f0, 0x9f9ff04a, 0xa8a87596,
0x5151ac5f, 0xa3a344ba, 0x4040db1b, 0x8f8f800a, 0x9292d37e, 0x9d9dfe42, 0x3838a8e0, 0xf5f5fdf9,
0xbcbc19c6, 0xb6b62fee, 0xdada3045, 0x2121e784, 0x10107040, 0xffffcbd1, 0xf3f3efe1, 0xd2d20865,
0xcdcd5519, 0x0c0c2430, 0x1313794c, 0xececb29d, 0x5f5f8667, 0x9797c86a, 0x4444c70b, 0x1717655c,
0xc4c46a3d, 0xa7a758aa, 0x7e7e61e3, 0x3d3db3f4, 0x6464278b, 0x5d5d886f, 0x19194f64, 0x737342d7,
0x60603b9b, 0x8181aa32, 0x4f4ff627, 0xdcdc225d, 0x2222ee88, 0x2a2ad6a8, 0x9090dd76, 0x88889516,
0x4646c903, 0xeeeebc95, 0xb8b805d6, 0x14146c50, 0xdede2c55, 0x5e5e8163, 0x0b0b312c, 0xdbdb3741,
0xe0e096ad, 0x32329ec8, 0x3a3aa6e8, 0x0a0a3628, 0x4949e43f, 0x06061218, 0x2424fc90, 0x5c5c8f6b,
0xc2c27825, 0xd3d30f61, 0xacac6986, 0x62623593, 0x9191da72, 0x9595c662, 0xe4e48abd, 0x797974ff,
0xe7e783b1, 0xc8c84e0d, 0x373785dc, 0x6d6d18af, 0x8d8d8e02, 0xd5d51d79, 0x4e4ef123, 0xa9a97292,
0x6c6c1fab, 0x5656b943, 0xf4f4fafd, 0xeaeaa085, 0x6565208f, 0x7a7a7df3, 0xaeae678e, 0x08083820,
0xbaba0bde, 0x787873fb, 0x2525fb94, 0x2e2ecab8, 0x1c1c5470, 0xa6a65fae, 0xb4b421e6, 0xc6c66435,
0xe8e8ae8d, 0xdddd2559, 0x747457cb, 0x1f1f5d7c, 0x4b4bea37, 0xbdbd1ec2, 0x8b8b9c1a, 0x8a8a9b1e,
0x70704bdb, 0x3e3ebaf8, 0xb5b526e2, 0x66662983, 0x4848e33b, 0x0303090c, 0xf6f6f4f5, 0x0e0e2a38,
0x61613c9f, 0x35358bd4, 0x5757be47, 0xb9b902d2, 0x8686bf2e, 0xc1c17129, 0x1d1d5374, 0x9e9ef74e,
0xe1e191a9, 0xf8f8decd, 0x9898e556, 0x11117744, 0x696904bf, 0xd9d93949, 0x8e8e870e, 0x9494c166,
0x9b9bec5a, 0x1e1e5a78, 0x8787b82a, 0xe9e9a989, 0xcece5c15, 0x5555b04f, 0x2828d8a0, 0xdfdf2b51,
0x8c8c8906, 0xa1a14ab2, 0x89899212, 0x0d0d2334, 0xbfbf10ca, 0xe6e684b5, 0x4242d513, 0x686803bb,
0x4141dc1f, 0x9999e252, 0x2d2dc3b4, 0x0f0f2d3c, 0xb0b03df6, 0x5454b74b, 0xbbbb0cda, 0x16166258
};
#define TIX4(q, x00, x01, x04, x07, x08, x22, x24, x27, x30) { \
x22 ^= x00; \
x00 = (q); \
x08 ^= x00; \
x01 ^= x24; \
x04 ^= x27; \
x07 ^= x30; \
}
#define CMIX36(x00, x01, x02, x04, x05, x06, x18, x19, x20) { \
x00 ^= x04; \
x01 ^= x05; \
x02 ^= x06; \
x18 ^= x04; \
x19 ^= x05; \
x20 ^= x06; \
}
#define SMIX(x0, x1, x2, x3) { \
uint32_t tmp; \
uint32_t r0 = 0; \
uint32_t r1 = 0; \
uint32_t r2 = 0; \
uint32_t r3 = 0; \
uint32_t c0 = mixtab0(x0 >> 24); \
tmp = mixtab1((x0 >> 16) & 0xFF); \
c0 ^= tmp; \
r1 ^= tmp; \
tmp = mixtab2((x0 >> 8) & 0xFF); \
c0 ^= tmp; \
r2 ^= tmp; \
tmp = mixtab3(x0 & 0xFF); \
c0 ^= tmp; \
r3 ^= tmp; \
tmp = mixtab0(x1 >> 24); \
uint32_t c1 = tmp; \
r0 ^= tmp; \
tmp = mixtab1((x1 >> 16) & 0xFF); \
c1 ^= tmp; \
tmp = mixtab2((x1 >> 8) & 0xFF); \
c1 ^= tmp; \
r2 ^= tmp; \
tmp = mixtab3(x1 & 0xFF); \
c1 ^= tmp; \
r3 ^= tmp; \
tmp = mixtab0(x2 >> 24); \
uint32_t c2 = tmp; \
r0 ^= tmp; \
tmp = mixtab1((x2 >> 16) & 0xFF); \
c2 ^= tmp; \
r1 ^= tmp; \
tmp = mixtab2((x2 >> 8) & 0xFF); \
c2 ^= tmp; \
tmp = mixtab3(x2 & 0xFF); \
c2 ^= tmp; \
r3 ^= tmp; \
tmp = mixtab0(x3 >> 24); \
uint32_t c3 = tmp; \
r0 ^= tmp; \
tmp = mixtab1((x3 >> 16) & 0xFF); \
c3 ^= tmp; \
r1 ^= tmp; \
tmp = mixtab2((x3 >> 8) & 0xFF); \
c3 ^= tmp; \
r2 ^= tmp; \
tmp = mixtab3(x3 & 0xFF); \
c3 ^= tmp; \
x0 = ((c0 ^ r0) & 0xFF000000) | ((c1 ^ r1) & 0x00FF0000) \
| ((c2 ^ r2) & 0x0000FF00) | ((c3 ^ r3) & 0x000000FF); \
x1 = ((c1 ^ (r0 << 8)) & 0xFF000000) | ((c2 ^ (r1 << 8)) & 0x00FF0000) \
| ((c3 ^ (r2 << 8)) & 0x0000FF00) | ((c0 ^ (r3 >> 24)) & 0x000000FF); \
x2 = ((c2 ^ (r0 << 16)) & 0xFF000000) | ((c3 ^ (r1 << 16)) & 0x00FF0000) \
| ((c0 ^ (r2 >> 16)) & 0x0000FF00) | ((c1 ^ (r3 >> 16)) & 0x000000FF); \
x3 = ((c3 ^ (r0 << 24)) & 0xFF000000) | ((c0 ^ (r1 >> 8)) & 0x00FF0000) \
| ((c1 ^ (r2 >> 8)) & 0x0000FF00) | ((c2 ^ (r3 >> 8)) & 0x000000FF); \
}
#define SUB_ROR3 { \
B33 = S33, B34 = S34, B35 = S35; \
S35 = S32; S34 = S31; S33 = S30; S32 = S29; S31 = S28; S30 = S27; S29 = S26; S28 = S25; S27 = S24; \
S26 = S23; S25 = S22; S24 = S21; S23 = S20; S22 = S19; S21 = S18; S20 = S17; S19 = S16; S18 = S15; \
S17 = S14; S16 = S13; S15 = S12; S14 = S11; S13 = S10; S12 = S09; S11 = S08; S10 = S07; S09 = S06; \
S08 = S05; S07 = S04; S06 = S03; S05 = S02; S04 = S01; S03 = S00; S02 = B35; S01 = B34; S00 = B33; \
}
#define SUB_ROR8 { \
B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \
S35 = S27; S34 = S26; S33 = S25; S32 = S24; S31 = S23; S30 = S22; S29 = S21; S28 = S20; S27 = S19; \
S26 = S18; S25 = S17; S24 = S16; S23 = S15; S22 = S14; S21 = S13; S20 = S12; S19 = S11; S18 = S10; \
S17 = S09; S16 = S08; S15 = S07; S14 = S06; S13 = S05; S12 = S04; S11 = S03; S10 = S02; S09 = S01; \
S08 = S00; S07 = B35; S06 = B34; S05 = B33; S04 = B32; S03 = B31; S02 = B30; S01 = B29; S00 = B28; \
}
#define SUB_ROR9 { \
B27 = S27, B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \
S35 = S26; S34 = S25; S33 = S24; S32 = S23; S31 = S22; S30 = S21; S29 = S20; S28 = S19; S27 = S18; \
S26 = S17; S25 = S16; S24 = S15; S23 = S14; S22 = S13; S21 = S12; S20 = S11; S19 = S10; S18 = S09; \
S17 = S08; S16 = S07; S15 = S06; S14 = S05; S13 = S04; S12 = S03; S11 = S02; S10 = S01; S09 = S00; \
S08 = B35; S07 = B34; S06 = B33; S05 = B32; S04 = B31; S03 = B30; S02 = B29; S01 = B28; S00 = B27; \
}
#define SUB_ROR9_3 { \
SUB_ROR3; SUB_ROR3; SUB_ROR3; \
}
#define SUB_ROR12 { /* to fix */ \
B24 = S00; B25 = S01; B26 = S02; B27 = S03; B28 = S04; B29 = S05; B30 = S06; B31 = S07; B32 = S08; B33 = S09; B34 = S10; B35 = S11; \
S00 = S12; S01 = S13; S02 = S14; S03 = S15; S04 = S16; S05 = S17; S06 = S18; S07 = S19; S08 = S20; S09 = S21; S10 = S22; S11 = S23; \
S12 = S24; S13 = S25; S14 = S26; S15 = S27; S16 = S28; S17 = S29; S18 = S30; S19 = S31; S20 = S32; S21 = S33; S22 = S34; S23 = S35; \
S24 = B24; S25 = B25; S26 = B26; S27 = B27; S28 = B28; S29 = B29; S30 = B30; S31 = B31; S32 = B32; S33 = B33; S34 = B34; S35 = B35; \
}
#define FUGUE512_3(x, y, z) { \
TIX4(x, S00, S01, S04, S07, S08, S22, S24, S27, S30); \
CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \
SMIX(S33, S34, S35, S00); \
CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \
SMIX(S30, S31, S32, S33); \
CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \
SMIX(S27, S28, S29, S30); \
CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \
SMIX(S24, S25, S26, S27); \
\
TIX4(y, S24, S25, S28, S31, S32, S10, S12, S15, S18); \
CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); \
SMIX(S21, S22, S23, S24); \
CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); \
SMIX(S18, S19, S20, S21); \
CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); \
SMIX(S15, S16, S17, S18); \
CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); \
SMIX(S12, S13, S14, S15); \
\
TIX4(z, S12, S13, S16, S19, S20, S34, S00, S03, S06); \
CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); \
SMIX(S09, S10, S11, S12); \
CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); \
SMIX(S06, S07, S08, S09); \
CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); \
SMIX(S03, S04, S05, S06); \
CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); \
SMIX(S00, S01, S02, S03); \
}
#define FUGUE512_F(w, x, y, z) { \
TIX4(w, S00, S01, S04, S07, S08, S22, S24, S27, S30); \
CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \
SMIX(S33, S34, S35, S00); \
CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \
SMIX(S30, S31, S32, S33); \
CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \
SMIX(S27, S28, S29, S30); \
CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \
SMIX(S24, S25, S26, S27); \
\
TIX4(x, S24, S25, S28, S31, S32, S10, S12, S15, S18); \
CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); \
SMIX(S21, S22, S23, S24); \
CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); \
SMIX(S18, S19, S20, S21); \
CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); \
SMIX(S15, S16, S17, S18); \
CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); \
SMIX(S12, S13, S14, S15); \
\
TIX4(y, S12, S13, S16, S19, S20, S34, S00, S03, S06); \
CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); \
SMIX(S09, S10, S11, S12); \
CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); \
SMIX(S06, S07, S08, S09); \
CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); \
SMIX(S03, S04, S05, S06); \
CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); \
SMIX(S00, S01, S02, S03); \
\
TIX4(z, S00, S01, S04, S07, S08, S22, S24, S27, S30); \
CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \
SMIX(S33, S34, S35, S00); \
CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \
SMIX(S30, S31, S32, S33); \
CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \
SMIX(S27, S28, S29, S30); \
CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \
SMIX(S24, S25, S26, S27); \
}
#undef ROL8
#ifdef __CUDA_ARCH__
__device__ __forceinline__
uint32_t ROL8(const uint32_t a) {
return __byte_perm(a, 0, 0x2103);
}
__device__ __forceinline__
uint32_t ROR8(const uint32_t a) {
return __byte_perm(a, 0, 0x0321);
}
__device__ __forceinline__
uint32_t ROL16(const uint32_t a) {
return __byte_perm(a, 0, 0x1032);
}
#else
#define ROL8(u) ROTL32(u, 8)
#define ROR8(u) ROTR32(u, 8)
#define ROL16(u) ROTL32(u,16)
#endif
//#define AS_UINT4(addr) *((uint4*)(addr))
__constant__ static uint64_t c_PaddedMessage80[10];
__host__
void x16_fugue512_setBlock_80(void *pdata)
{
cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
}
/***************************************************/
__global__
__launch_bounds__(TPB)
void x16_fugue512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash)
{
__shared__ uint32_t mixtabs[1024];
// load shared mem (with 256 threads)
const uint32_t thr = threadIdx.x & 0xFF;
const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr);
mixtabs[thr] = tmp;
mixtabs[thr+256] = ROR8(tmp);
mixtabs[thr+512] = ROL16(tmp);
mixtabs[thr+768] = ROL8(tmp);
#if TPB <= 256
if (blockDim.x < 256) {
const uint32_t thr = (threadIdx.x + 0x80) & 0xFF;
const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr);
mixtabs[thr] = tmp;
mixtabs[thr + 256] = ROR8(tmp);
mixtabs[thr + 512] = ROL16(tmp);
mixtabs[thr + 768] = ROL8(tmp);
}
#endif
__syncthreads();
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint32_t Data[20];
#pragma unroll
for(int i = 0; i < 10; i++)
AS_UINT2(&Data[i * 2]) = AS_UINT2(&c_PaddedMessage80[i]);
Data[19] = (startNonce + thread);
uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09, S10, S11;
uint32_t S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22, S23;
uint32_t S24, S25, S26, S27, S28, S29, S30, S31, S32, S33, S34, S35;
//uint32_t B24, B25, B26,
uint32_t B27, B28, B29, B30, B31, B32, B33, B34, B35;
//const uint64_t bc = 640 bits to hash
//const uint32_t bclo = (uint32_t)(bc);
//const uint32_t bchi = (uint32_t)(bc >> 32);
S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = 0;
S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0;
S20 = 0x8807a57e; S21 = 0xe616af75; S22 = 0xc5d3e4db; S23 = 0xac9ab027;
S24 = 0xd915f117; S25 = 0xb6eecc54; S26 = 0x06e8020b; S27 = 0x4a92efd1;
S28 = 0xaac6e2c9; S29 = 0xddb21398; S30 = 0xcae65838; S31 = 0x437f203f;
S32 = 0x25ea78e7; S33 = 0x951fddd6; S34 = 0xda6ed11d; S35 = 0xe13e3567;
FUGUE512_3((Data[ 0]), (Data[ 1]), (Data[ 2]));
FUGUE512_3((Data[ 3]), (Data[ 4]), (Data[ 5]));
FUGUE512_3((Data[ 6]), (Data[ 7]), (Data[ 8]));
FUGUE512_3((Data[ 9]), (Data[10]), (Data[11]));
FUGUE512_3((Data[12]), (Data[13]), (Data[14]));
FUGUE512_3((Data[15]), (Data[16]), (Data[17]));
FUGUE512_F((Data[18]), (Data[19]), 0/*bchi*/, (80*8)/*bclo*/);
// rotate right state by 3 dwords (S00 = S33, S03 = S00)
SUB_ROR3;
SUB_ROR9;
#pragma unroll 32
for (int i = 0; i < 32; i++) {
SUB_ROR3;
CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20);
SMIX(S00, S01, S02, S03);
}
#pragma unroll 13
for (int i = 0; i < 13; i++) {
S04 ^= S00;
S09 ^= S00;
S18 ^= S00;
S27 ^= S00;
SUB_ROR9;
SMIX(S00, S01, S02, S03);
S04 ^= S00;
S10 ^= S00;
S18 ^= S00;
S27 ^= S00;
SUB_ROR9;
SMIX(S00, S01, S02, S03);
S04 ^= S00;
S10 ^= S00;
S19 ^= S00;
S27 ^= S00;
SUB_ROR9;
SMIX(S00, S01, S02, S03);
S04 ^= S00;
S10 ^= S00;
S19 ^= S00;
S28 ^= S00;
SUB_ROR8;
SMIX(S00, S01, S02, S03);
}
S04 ^= S00;
S09 ^= S00;
S18 ^= S00;
S27 ^= S00;
Data[ 0] = cuda_swab32(S01);
Data[ 1] = cuda_swab32(S02);
Data[ 2] = cuda_swab32(S03);
Data[ 3] = cuda_swab32(S04);
Data[ 4] = cuda_swab32(S09);
Data[ 5] = cuda_swab32(S10);
Data[ 6] = cuda_swab32(S11);
Data[ 7] = cuda_swab32(S12);
Data[ 8] = cuda_swab32(S18);
Data[ 9] = cuda_swab32(S19);
Data[10] = cuda_swab32(S20);
Data[11] = cuda_swab32(S21);
Data[12] = cuda_swab32(S27);
Data[13] = cuda_swab32(S28);
Data[14] = cuda_swab32(S29);
Data[15] = cuda_swab32(S30);
const size_t hashPosition = thread;
uint64_t* pHash = &g_hash[hashPosition << 3];
#pragma unroll 4
for(int i = 0; i < 4; i++)
AS_UINT4(&pHash[i * 2]) = AS_UINT4(&Data[i * 4]);
}
}
#define texDef(id, texname, texmem, texsource, texsize) { \
unsigned int *texmem; \
cudaMalloc(&texmem, texsize); \
d_textures[thr_id][id] = texmem; \
cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
texname.normalized = 0; \
texname.filterMode = cudaFilterModePoint; \
texname.addressMode[0] = cudaAddressModeClamp; \
{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \
} \
}
__host__
void x16_fugue512_cpu_init(int thr_id, uint32_t threads)
{
texDef(0, mixTab0Tex, mixTab0m, mixtab0, sizeof(uint32_t)*256);
}
__host__
void x16_fugue512_cpu_free(int thr_id)
{
cudaFree(d_textures[thr_id][0]);
}
__host__
void x16_fugue512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash)
{
const uint32_t threadsperblock = TPB;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
x16_fugue512_gpu_hash_80 <<<grid, block>>> (threads, startNonce, (uint64_t*)d_hash);
}

350
x16r/cuda_x16_shabal512.cu

@ -0,0 +1,350 @@ @@ -0,0 +1,350 @@
/*
* Shabal-512 for X16R
* tpruvot 2018, based on alexis x14 and xevan kernlx code
*/
#include <cuda_helper.h>
#include <cuda_vectors.h>
#include <cuda_vector_uint2x4.h>
typedef uint32_t sph_u32;
#define C32(x) (x)
#define T32(x) (x)
#define INPUT_BLOCK_ADD do { \
B0 = T32(B0 + M0); \
B1 = T32(B1 + M1); \
B2 = T32(B2 + M2); \
B3 = T32(B3 + M3); \
B4 = T32(B4 + M4); \
B5 = T32(B5 + M5); \
B6 = T32(B6 + M6); \
B7 = T32(B7 + M7); \
B8 = T32(B8 + M8); \
B9 = T32(B9 + M9); \
BA = T32(BA + MA); \
BB = T32(BB + MB); \
BC = T32(BC + MC); \
BD = T32(BD + MD); \
BE = T32(BE + ME); \
BF = T32(BF + MF); \
} while (0)
#define INPUT_BLOCK_SUB do { \
C0 = T32(C0 - M0); \
C1 = T32(C1 - M1); \
C2 = T32(C2 - M2); \
C3 = T32(C3 - M3); \
C4 = T32(C4 - M4); \
C5 = T32(C5 - M5); \
C6 = T32(C6 - M6); \
C7 = T32(C7 - M7); \
C8 = T32(C8 - M8); \
C9 = T32(C9 - M9); \
CA = T32(CA - MA); \
CB = T32(CB - MB); \
CC = T32(CC - MC); \
CD = T32(CD - MD); \
CE = T32(CE - ME); \
CF = T32(CF - MF); \
} while (0)
#define XOR_W do { \
A00 ^= Wlow; \
A01 ^= Whigh; \
} while (0)
#define SWAP(v1, v2) do { \
sph_u32 tmp = (v1); \
(v1) = (v2); \
(v2) = tmp; \
} while (0)
#define SWAP_BC do { \
SWAP(B0, C0); \
SWAP(B1, C1); \
SWAP(B2, C2); \
SWAP(B3, C3); \
SWAP(B4, C4); \
SWAP(B5, C5); \
SWAP(B6, C6); \
SWAP(B7, C7); \
SWAP(B8, C8); \
SWAP(B9, C9); \
SWAP(BA, CA); \
SWAP(BB, CB); \
SWAP(BC, CC); \
SWAP(BD, CD); \
SWAP(BE, CE); \
SWAP(BF, CF); \
} while (0)
#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) do { \
xa0 = T32((xa0 \
^ (((xa1 << 15) | (xa1 >> 17)) * 5U) \
^ xc) * 3U) \
^ xb1 ^ (xb2 & ~xb3) ^ xm; \
xb0 = T32(~(((xb0 << 1) | (xb0 >> 31)) ^ xa0)); \
} while (0)
#define PERM_STEP_0 do { \
PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \
} while (0)
#define PERM_STEP_1 do { \
PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \
} while (0)
#define PERM_STEP_2 do { \
PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \
PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \
PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \
PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \
PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \
PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \
PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \
PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \
PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \
PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \
PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \
PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \
PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \
PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \
PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \
PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \
} while (0)
#define APPLY_P do { \
B0 = T32(B0 << 17) | (B0 >> 15); \
B1 = T32(B1 << 17) | (B1 >> 15); \
B2 = T32(B2 << 17) | (B2 >> 15); \
B3 = T32(B3 << 17) | (B3 >> 15); \
B4 = T32(B4 << 17) | (B4 >> 15); \
B5 = T32(B5 << 17) | (B5 >> 15); \
B6 = T32(B6 << 17) | (B6 >> 15); \
B7 = T32(B7 << 17) | (B7 >> 15); \
B8 = T32(B8 << 17) | (B8 >> 15); \
B9 = T32(B9 << 17) | (B9 >> 15); \
BA = T32(BA << 17) | (BA >> 15); \
BB = T32(BB << 17) | (BB >> 15); \
BC = T32(BC << 17) | (BC >> 15); \
BD = T32(BD << 17) | (BD >> 15); \
BE = T32(BE << 17) | (BE >> 15); \
BF = T32(BF << 17) | (BF >> 15); \
PERM_STEP_0; \
PERM_STEP_1; \
PERM_STEP_2; \
A0B = T32(A0B + C6); \
A0A = T32(A0A + C5); \
A09 = T32(A09 + C4); \
A08 = T32(A08 + C3); \
A07 = T32(A07 + C2); \
A06 = T32(A06 + C1); \
A05 = T32(A05 + C0); \
A04 = T32(A04 + CF); \
A03 = T32(A03 + CE); \
A02 = T32(A02 + CD); \
A01 = T32(A01 + CC); \
A00 = T32(A00 + CB); \
A0B = T32(A0B + CA); \
A0A = T32(A0A + C9); \
A09 = T32(A09 + C8); \
A08 = T32(A08 + C7); \
A07 = T32(A07 + C6); \
A06 = T32(A06 + C5); \
A05 = T32(A05 + C4); \
A04 = T32(A04 + C3); \
A03 = T32(A03 + C2); \
A02 = T32(A02 + C1); \
A01 = T32(A01 + C0); \
A00 = T32(A00 + CF); \
A0B = T32(A0B + CE); \
A0A = T32(A0A + CD); \
A09 = T32(A09 + CC); \
A08 = T32(A08 + CB); \
A07 = T32(A07 + CA); \
A06 = T32(A06 + C9); \
A05 = T32(A05 + C8); \
A04 = T32(A04 + C7); \
A03 = T32(A03 + C6); \
A02 = T32(A02 + C5); \
A01 = T32(A01 + C4); \
A00 = T32(A00 + C3); \
} while (0)
#define INCR_W do { \
if ((Wlow = T32(Wlow + 1)) == 0) \
Whigh = T32(Whigh + 1); \
} while (0)
__constant__ static const sph_u32 A_init_512[] = {
C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632),
C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B),
C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F)
};
__constant__ static const sph_u32 B_init_512[] = {
C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640),
C32(0xEB6F56C7), C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08),
C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E),
C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B)
};
__constant__ static const sph_u32 C_init_512[] = {
C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359),
C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780),
C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A),
C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969)
};
__constant__ static uint32_t c_PaddedMessage80[20];
__host__
void x16_shabal512_setBlock_80(void *pdata)
{
cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
}
#define TPB_SHABAL 256
__global__ __launch_bounds__(TPB_SHABAL, 2)
void x16_shabal512_gpu_hash_80(uint32_t threads, const uint32_t startNonce, uint32_t *g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
uint32_t B[] = {
0xC1099CB7, 0x07B385F3, 0xE7442C26, 0xCC8AD640, 0xEB6F56C7, 0x1EA81AA9, 0x73B9D314, 0x1DE85D08,
0x48910A5A, 0x893B22DB, 0xC5A0DF44, 0xBBC4324E, 0x72D2F240, 0x75941D99, 0x6D8BDE82, 0xA1A7502B
};
uint32_t M[16];
if (thread < threads)
{
// todo: try __ldc
*(uint2x4*)&M[0] = *(uint2x4*)&c_PaddedMessage80[0];
*(uint2x4*)&M[8] = *(uint2x4*)&c_PaddedMessage80[8];
sph_u32 A00 = A_init_512[0], A01 = A_init_512[1], A02 = A_init_512[ 2], A03 = A_init_512[ 3];
sph_u32 A04 = A_init_512[4], A05 = A_init_512[5], A06 = A_init_512[ 6], A07 = A_init_512[ 7];
sph_u32 A08 = A_init_512[8], A09 = A_init_512[9], A0A = A_init_512[10], A0B = A_init_512[11];
sph_u32 B0 = B_init_512[ 0], B1 = B_init_512[ 1], B2 = B_init_512[ 2], B3 = B_init_512 [3];
sph_u32 B4 = B_init_512[ 4], B5 = B_init_512[ 5], B6 = B_init_512[ 6], B7 = B_init_512[ 7];
sph_u32 B8 = B_init_512[ 8], B9 = B_init_512[ 9], BA = B_init_512[10], BB = B_init_512[11];
sph_u32 BC = B_init_512[12], BD = B_init_512[13], BE = B_init_512[14], BF = B_init_512[15];
sph_u32 C0 = C_init_512[ 0], C1 = C_init_512[ 1], C2 = C_init_512[ 2], C3 = C_init_512[ 3];
sph_u32 C4 = C_init_512[ 4], C5 = C_init_512[ 5], C6 = C_init_512[ 6], C7 = C_init_512[ 7];
sph_u32 C8 = C_init_512[ 8], C9 = C_init_512[ 9], CA = C_init_512[10], CB = C_init_512[11];
sph_u32 CC = C_init_512[12], CD = C_init_512[13], CE = C_init_512[14], CF = C_init_512[15];
sph_u32 M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
sph_u32 Wlow = 1, Whigh = 0;
M0 = M[ 0];
M1 = M[ 1];
M2 = M[ 2];
M3 = M[ 3];
M4 = M[ 4];
M5 = M[ 5];
M6 = M[ 6];
M7 = M[ 7];
M8 = M[ 8];
M9 = M[ 9];
MA = M[10];
MB = M[11];
MC = M[12];
MD = M[13];
ME = M[14];
MF = M[15];
INPUT_BLOCK_ADD;
XOR_W;
APPLY_P;
INPUT_BLOCK_SUB;
SWAP_BC;
INCR_W;
M0 = c_PaddedMessage80[16];
M1 = c_PaddedMessage80[17];
M2 = c_PaddedMessage80[18];
M3 = cuda_swab32(startNonce + thread);
M4 = 0x80;
M5 = M6 = M7 = M8 = M9 = MA = MB = MC = MD = ME = MF = 0;
INPUT_BLOCK_ADD;
XOR_W;
APPLY_P;
for (unsigned i = 0; i < 3; i++) {
SWAP_BC;
XOR_W;
APPLY_P;
}
B[ 0] = B0;
B[ 1] = B1;
B[ 2] = B2;
B[ 3] = B3;
B[ 4] = B4;
B[ 5] = B5;
B[ 6] = B6;
B[ 7] = B7;
B[ 8] = B8;
B[ 9] = B9;
B[10] = BA;
B[11] = BB;
B[12] = BC;
B[13] = BD;
B[14] = BE;
B[15] = BF;
// output
uint64_t hashPosition = thread;
uint32_t *Hash = &g_hash[hashPosition << 4];
*(uint2x4*)&Hash[0] = *(uint2x4*)&B[0];
*(uint2x4*)&Hash[8] = *(uint2x4*)&B[8];
}
}
__host__
void x16_shabal512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash)
{
const uint32_t threadsperblock = TPB_SHABAL;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
x16_shabal512_gpu_hash_80 <<<grid, block >>>(threads, startNonce, d_hash);
}

1836
x16r/cuda_x16_simd512_80.cu

File diff suppressed because it is too large Load Diff

75
x16r/cuda_x16r.h

@ -0,0 +1,75 @@ @@ -0,0 +1,75 @@
#include "x11/cuda_x11.h"
extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads);
extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x13_fugue512_cpu_free(int thr_id);
extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads);
extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int flag);
extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x15_whirlpool_cpu_free(int thr_id);
extern void x17_sha512_cpu_init(int thr_id, uint32_t threads);
extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash);
extern void x17_haval256_cpu_init(int thr_id, uint32_t threads);
extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, const int outlen);
void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order);
// ---- 80 bytes kernels
void quark_bmw512_cpu_setBlock_80(void *pdata);
void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order);
void groestl512_setBlock_80(int thr_id, uint32_t *endiandata);
void groestl512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
void skein512_cpu_setBlock_80(void *pdata);
void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int swap);
void qubit_luffa512_cpu_init(int thr_id, uint32_t threads);
void qubit_luffa512_cpu_setBlock_80(void *pdata);
void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order);
void jh512_setBlock_80(int thr_id, uint32_t *endiandata);
void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
void keccak512_setBlock_80(int thr_id, uint32_t *endiandata);
void keccak512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata);
void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
void x11_shavite512_setBlock_80(void *pdata);
void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order);
void x16_shabal512_setBlock_80(void *pdata);
void x16_shabal512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
void x16_simd512_setBlock_80(void *pdata);
void x16_simd512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
void x16_echo512_cuda_init(int thr_id, const uint32_t threads);
void x16_echo512_setBlock_80(void *pdata);
void x16_echo512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
void x16_hamsi512_setBlock_80(void *pdata);
void x16_hamsi512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
void x16_fugue512_cpu_init(int thr_id, uint32_t threads);
void x16_fugue512_cpu_free(int thr_id);
void x16_fugue512_setBlock_80(void *pdata);
void x16_fugue512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
void x16_whirlpool512_init(int thr_id, uint32_t threads);
void x16_whirlpool512_setBlock_80(void* endiandata);
void x16_whirlpool512_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
void x16_sha512_setBlock_80(void *pdata);
void x16_sha512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);

625
x16r/x16r.cu

@ -0,0 +1,625 @@ @@ -0,0 +1,625 @@
/**
* X16R algorithm (X16 with Randomized chain order)
*
* tpruvot 2018 - GPL code
*/
#include <stdio.h>
#include <memory.h>
#include <unistd.h>
extern "C" {
#include "sph/sph_blake.h"
#include "sph/sph_bmw.h"
#include "sph/sph_groestl.h"
#include "sph/sph_skein.h"
#include "sph/sph_jh.h"
#include "sph/sph_keccak.h"
#include "sph/sph_luffa.h"
#include "sph/sph_cubehash.h"
#include "sph/sph_shavite.h"
#include "sph/sph_simd.h"
#include "sph/sph_echo.h"
#include "sph/sph_hamsi.h"
#include "sph/sph_fugue.h"
#include "sph/sph_shabal.h"
#include "sph/sph_whirlpool.h"
#include "sph/sph_sha2.h"
}
#include "miner.h"
#include "cuda_helper.h"
#include "cuda_x16r.h"
static uint32_t *d_hash[MAX_GPUS];
enum Algo {
BLAKE = 0,
BMW,
GROESTL,
JH,
KECCAK,
SKEIN,
LUFFA,
CUBEHASH,
SHAVITE,
SIMD,
ECHO,
HAMSI,
FUGUE,
SHABAL,
WHIRLPOOL,
SHA512,
HASH_FUNC_COUNT
};
static const char* algo_strings[] = {
"blake",
"bmw512",
"groestl",
"jh512",
"keccak",
"skein",
"luffa",
"cube",
"shavite",
"simd",
"echo",
"hamsi",
"fugue",
"shabal",
"whirlpool",
"sha512",
NULL
};
static __thread uint32_t s_ntime = UINT32_MAX;
static __thread bool s_implemented = false;
static __thread char hashOrder[HASH_FUNC_COUNT + 1] = { 0 };
static void getAlgoString(const uint32_t* prevblock, char *output)
{
char *sptr = output;
uint8_t* data = (uint8_t*)prevblock;
for (uint8_t j = 0; j < HASH_FUNC_COUNT; j++) {
uint8_t b = (15 - j) >> 1; // 16 ascii hex chars, reversed
uint8_t algoDigit = (j & 1) ? data[b] & 0xF : data[b] >> 4;
if (algoDigit >= 10)
sprintf(sptr, "%c", 'A' + (algoDigit - 10));
else
sprintf(sptr, "%u", (uint32_t) algoDigit);
sptr++;
}
*sptr = '\0';
}
// X16R CPU Hash (Validation)
extern "C" void x16r_hash(void *output, const void *input)
{
unsigned char _ALIGN(64) hash[128];
sph_blake512_context ctx_blake;
sph_bmw512_context ctx_bmw;
sph_groestl512_context ctx_groestl;
sph_jh512_context ctx_jh;
sph_keccak512_context ctx_keccak;
sph_skein512_context ctx_skein;
sph_luffa512_context ctx_luffa;
sph_cubehash512_context ctx_cubehash;
sph_shavite512_context ctx_shavite;
sph_simd512_context ctx_simd;
sph_echo512_context ctx_echo;
sph_hamsi512_context ctx_hamsi;
sph_fugue512_context ctx_fugue;
sph_shabal512_context ctx_shabal;
sph_whirlpool_context ctx_whirlpool;
sph_sha512_context ctx_sha512;
void *in = (void*) input;
int size = 80;
uint32_t *in32 = (uint32_t*) input;
getAlgoString(&in32[1], hashOrder);
for (int i = 0; i < 16; i++)
{
const char elem = hashOrder[i];
const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch (algo) {
case BLAKE:
sph_blake512_init(&ctx_blake);
sph_blake512(&ctx_blake, in, size);
sph_blake512_close(&ctx_blake, hash);
break;
case BMW:
sph_bmw512_init(&ctx_bmw);
sph_bmw512(&ctx_bmw, in, size);
sph_bmw512_close(&ctx_bmw, hash);
break;
case GROESTL:
sph_groestl512_init(&ctx_groestl);
sph_groestl512(&ctx_groestl, in, size);
sph_groestl512_close(&ctx_groestl, hash);
break;
case SKEIN:
sph_skein512_init(&ctx_skein);
sph_skein512(&ctx_skein, in, size);
sph_skein512_close(&ctx_skein, hash);
break;
case JH:
sph_jh512_init(&ctx_jh);
sph_jh512(&ctx_jh, in, size);
sph_jh512_close(&ctx_jh, hash);
break;
case KECCAK:
sph_keccak512_init(&ctx_keccak);
sph_keccak512(&ctx_keccak, in, size);
sph_keccak512_close(&ctx_keccak, hash);
break;
case LUFFA:
sph_luffa512_init(&ctx_luffa);
sph_luffa512(&ctx_luffa, in, size);
sph_luffa512_close(&ctx_luffa, hash);
break;
case CUBEHASH:
sph_cubehash512_init(&ctx_cubehash);
sph_cubehash512(&ctx_cubehash, in, size);
sph_cubehash512_close(&ctx_cubehash, hash);
break;
case SHAVITE:
sph_shavite512_init(&ctx_shavite);
sph_shavite512(&ctx_shavite, in, size);
sph_shavite512_close(&ctx_shavite, hash);
break;
case SIMD:
sph_simd512_init(&ctx_simd);
sph_simd512(&ctx_simd, in, size);
sph_simd512_close(&ctx_simd, hash);
break;
case ECHO:
sph_echo512_init(&ctx_echo);
sph_echo512(&ctx_echo, in, size);
sph_echo512_close(&ctx_echo, hash);
break;
case HAMSI:
sph_hamsi512_init(&ctx_hamsi);
sph_hamsi512(&ctx_hamsi, in, size);
sph_hamsi512_close(&ctx_hamsi, hash);
break;
case FUGUE:
sph_fugue512_init(&ctx_fugue);
sph_fugue512(&ctx_fugue, in, size);
sph_fugue512_close(&ctx_fugue, hash);
break;
case SHABAL:
sph_shabal512_init(&ctx_shabal);
sph_shabal512(&ctx_shabal, in, size);
sph_shabal512_close(&ctx_shabal, hash);
break;
case WHIRLPOOL:
sph_whirlpool_init(&ctx_whirlpool);
sph_whirlpool(&ctx_whirlpool, in, size);
sph_whirlpool_close(&ctx_whirlpool, hash);
break;
case SHA512:
sph_sha512_init(&ctx_sha512);
sph_sha512(&ctx_sha512,(const void*) in, size);
sph_sha512_close(&ctx_sha512,(void*) hash);
break;
}
in = (void*) hash;
size = 64;
}
memcpy(output, hash, 32);
}
void whirlpool_midstate(void *state, const void *input)
{
sph_whirlpool_context ctx;
sph_whirlpool_init(&ctx);
sph_whirlpool(&ctx, input, 64);
memcpy(state, ctx.state, 64);
}
static bool init[MAX_GPUS] = { 0 };
//#define _DEBUG
#define _DEBUG_PREFIX "x16r-"
#include "cuda_debug.cuh"
//static int algo80_tests[HASH_FUNC_COUNT] = { 0 };
//static int algo64_tests[HASH_FUNC_COUNT] = { 0 };
static int algo80_fails[HASH_FUNC_COUNT] = { 0 };
extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const int dev_id = device_map[thr_id];
int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 19;
if (strstr(device_name[dev_id], "GTX 1080")) intensity = 20;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
quark_blake512_cpu_init(thr_id, throughput);
quark_bmw512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);
quark_jh512_cpu_init(thr_id, throughput);
quark_keccak512_cpu_init(thr_id, throughput);
qubit_luffa512_cpu_init(thr_id, throughput);
x11_luffa512_cpu_init(thr_id, throughput); // 64
x11_shavite512_cpu_init(thr_id, throughput);
x11_simd512_cpu_init(thr_id, throughput); // 64
x11_echo512_cpu_init(thr_id, throughput);
x16_echo512_cuda_init(thr_id, throughput);
x13_hamsi512_cpu_init(thr_id, throughput);
x13_fugue512_cpu_init(thr_id, throughput);
x16_fugue512_cpu_init(thr_id, throughput);
x14_shabal512_cpu_init(thr_id, throughput);
x15_whirlpool_cpu_init(thr_id, throughput, 0);
x16_whirlpool512_init(thr_id, throughput);
x17_sha512_cpu_init(thr_id, throughput);
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true;
}
if (opt_benchmark) {
((uint32_t*)ptarget)[7] = 0x003f;
((uint8_t*)pdata)[8] = 0x90; // hashOrder[0] = '9'; for simd 80 + blake512 64
//((uint8_t*)pdata)[8] = 0xA0; // hashOrder[0] = 'A'; for echo 80 + blake512 64
//((uint8_t*)pdata)[8] = 0xB0; // hashOrder[0] = 'B'; for hamsi 80 + blake512 64
//((uint8_t*)pdata)[8] = 0xC0; // hashOrder[0] = 'C'; for fugue 80 + blake512 64
//((uint8_t*)pdata)[8] = 0xE0; // hashOrder[0] = 'E'; for whirlpool 80 + blake512 64
}
uint32_t _ALIGN(64) endiandata[20];
for (int k=0; k < 19; k++)
be32enc(&endiandata[k], pdata[k]);
uint32_t ntime = swab32(pdata[17]);
if (s_ntime != ntime) {
getAlgoString(&endiandata[1], hashOrder);
s_ntime = ntime;
s_implemented = true;
if (opt_debug && !thr_id) applog(LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime);
}
if (!s_implemented) {
sleep(1);
return -1;
}
cuda_check_cpu_setTarget(ptarget);
char elem = hashOrder[0];
const uint8_t algo80 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch (algo80) {
case BLAKE:
quark_blake512_cpu_setBlock_80(thr_id, endiandata);
break;
case BMW:
quark_bmw512_cpu_setBlock_80(endiandata);
break;
case GROESTL:
groestl512_setBlock_80(thr_id, endiandata);
break;
case JH:
jh512_setBlock_80(thr_id, endiandata);
break;
case KECCAK:
keccak512_setBlock_80(thr_id, endiandata);
break;
case SKEIN:
skein512_cpu_setBlock_80((void*)endiandata);
break;
case LUFFA:
qubit_luffa512_cpu_setBlock_80((void*)endiandata);
break;
case CUBEHASH:
cubehash512_setBlock_80(thr_id, endiandata);
break;
case SHAVITE:
x11_shavite512_setBlock_80((void*)endiandata);
break;
case SIMD:
x16_simd512_setBlock_80((void*)endiandata);
break;
case ECHO:
x16_echo512_setBlock_80((void*)endiandata);
break;
case HAMSI:
x16_hamsi512_setBlock_80((void*)endiandata);
break;
case FUGUE:
x16_fugue512_setBlock_80((void*)pdata);
break;
case SHABAL:
x16_shabal512_setBlock_80((void*)endiandata);
break;
case WHIRLPOOL:
x16_whirlpool512_setBlock_80((void*)endiandata);
break;
case SHA512:
x16_sha512_setBlock_80(endiandata);
break;
default: {
if (!thr_id)
applog(LOG_WARNING, "kernel %s %c unimplemented, order %s", algo_strings[algo80], elem, hashOrder);
s_implemented = false;
sleep(5);
return -1;
}
}
int warn = 0;
do {
int order = 0;
// Hash with CUDA
switch (algo80) {
case BLAKE:
quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("blake80:");
break;
case BMW:
quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
TRACE("bmw80 :");
break;
case GROESTL:
groestl512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("grstl80:");
break;
case JH:
jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("jh51280:");
break;
case KECCAK:
keccak512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("kecck80:");
break;
case SKEIN:
skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++;
TRACE("skein80:");
break;
case LUFFA:
qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
TRACE("luffa80:");
break;
case CUBEHASH:
cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("cube 80:");
break;
case SHAVITE:
x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
TRACE("shavite:");
break;
case SIMD:
x16_simd512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("simd512:");
break;
case ECHO:
x16_echo512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("echo :");
break;
case HAMSI:
x16_hamsi512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("hamsi :");
break;
case FUGUE:
x16_fugue512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("fugue :");
break;
case SHABAL:
x16_shabal512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("shabal :");
break;
case WHIRLPOOL:
x16_whirlpool512_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("whirl :");
break;
case SHA512:
x16_sha512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("sha512 :");
break;
}
for (int i = 1; i < 16; i++)
{
const char elem = hashOrder[i];
const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch (algo64) {
case BLAKE:
quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("blake :");
break;
case BMW:
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("bmw :");
break;
case GROESTL:
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("groestl:");
break;
case JH:
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("jh512 :");
break;
case KECCAK:
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("keccak :");
break;
case SKEIN:
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("skein :");
break;
case LUFFA:
x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("luffa :");
break;
case CUBEHASH:
x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("cube :");
break;
case SHAVITE:
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("shavite:");
break;
case SIMD:
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("simd :");
break;
case ECHO:
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("echo :");
break;
case HAMSI:
x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("hamsi :");
break;
case FUGUE:
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("fugue :");
break;
case SHABAL:
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("shabal :");
break;
case WHIRLPOOL:
x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("shabal :");
break;
case SHA512:
x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("sha512 :");
break;
}
}
*hashes_done = pdata[19] - first_nonce + throughput;
work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
#ifdef _DEBUG
uint32_t _ALIGN(64) dhash[8];
be32enc(&endiandata[19], pdata[19]);
x16r_hash(dhash, endiandata);
applog_hash(dhash);
return -1;
#endif
if (work->nonces[0] != UINT32_MAX)
{
const uint32_t Htarg = ptarget[7];
uint32_t _ALIGN(64) vhash[8];
be32enc(&endiandata[19], work->nonces[0]);
x16r_hash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
work->valid_nonces = 1;
work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
work_set_target_ratio(work, vhash);
if (work->nonces[1] != 0) {
be32enc(&endiandata[19], work->nonces[1]);
x16r_hash(vhash, endiandata);
bn_set_target_ratio(work, vhash, 1);
work->valid_nonces++;
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
} else {
pdata[19] = work->nonces[0] + 1; // cursor
}
#if 0
gpulog(LOG_INFO, thr_id, "hash found with %s 80!", algo_strings[algo80]);
algo80_tests[algo80] += work->valid_nonces;
char oks64[128] = { 0 };
char oks80[128] = { 0 };
char fails[128] = { 0 };
for (int a = 0; a < HASH_FUNC_COUNT; a++) {
const char elem = hashOrder[a];
const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
if (a > 0) algo64_tests[algo64] += work->valid_nonces;
sprintf(&oks64[strlen(oks64)], "|%X:%2d", a, algo64_tests[a] < 100 ? algo64_tests[a] : 99);
sprintf(&oks80[strlen(oks80)], "|%X:%2d", a, algo80_tests[a] < 100 ? algo80_tests[a] : 99);
sprintf(&fails[strlen(fails)], "|%X:%2d", a, algo80_fails[a] < 100 ? algo80_fails[a] : 99);
}
applog(LOG_INFO, "K64: %s", oks64);
applog(LOG_INFO, "K80: %s", oks80);
applog(LOG_ERR, "F80: %s", fails);
#endif
return work->valid_nonces;
}
else if (vhash[7] > Htarg) {
// x11+ coins could do some random error, but not on retry
gpu_increment_reject(thr_id);
algo80_fails[algo80]++;
if (!warn) {
warn++;
pdata[19] = work->nonces[0] + 1;
continue;
} else {
if (!opt_quiet) gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU! %s %s",
work->nonces[0], algo_strings[algo80], hashOrder);
warn = 0;
}
}
}
if ((uint64_t)throughput + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}
pdata[19] += throughput;
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce;
return 0;
}
// cleanup
extern "C" void free_x16r(int thr_id)
{
if (!init[thr_id])
return;
cudaThreadSynchronize();
cudaFree(d_hash[thr_id]);
quark_blake512_cpu_free(thr_id);
quark_groestl512_cpu_free(thr_id);
x11_simd512_cpu_free(thr_id);
x13_fugue512_cpu_free(thr_id);
x16_fugue512_cpu_free(thr_id); // to merge with x13_fugue512 ?
x15_whirlpool_cpu_free(thr_id);
cuda_check_cpu_free(thr_id);
cudaDeviceSynchronize();
init[thr_id] = false;
}

77
x17/cuda_x17_sha512.cu

@ -169,3 +169,80 @@ void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, @@ -169,3 +169,80 @@ void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce,
x17_sha512_gpu_hash_64 <<<grid, block>>> (threads, (uint64_t*)d_hash);
}
__constant__
static uint64_t c_PaddedMessage80[10];
__global__
/*__launch_bounds__(256, 4)*/
void x16_sha512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint64_t W[80];
#pragma unroll
for (int i = 0; i < 9; i ++) {
W[i] = SWAP64(c_PaddedMessage80[i]);
}
const uint32_t nonce = startNonce + thread;
//((uint32_t*)W)[19] = cuda_swab32(nonce);
W[9] = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(nonce));
W[9] = cuda_swab64(W[9]);
W[10] = 0x8000000000000000;
#pragma unroll
for (int i = 11; i<15; i++) {
W[i] = 0U;
}
W[15] = 0x0000000000000280;
#pragma unroll 64
for (int i = 16; i < 80; i ++) {
W[i] = SSG5_1(W[i-2]) + W[i-7];
W[i] += SSG5_0(W[i-15]) + W[i-16];
}
const uint64_t IV512[8] = {
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
uint64_t r[8];
#pragma unroll
for (int i = 0; i < 8; i++) {
r[i] = IV512[i];
}
#pragma unroll
for (int i = 0; i < 80; i++) {
SHA3_STEP(c_WB, r, W, i&7, i);
}
const uint64_t hashPosition = thread;
uint64_t *pHash = &g_hash[hashPosition << 3];
#pragma unroll
for (int u = 0; u < 8; u ++) {
pHash[u] = SWAP64(r[u] + IV512[u]);
}
}
}
__host__
void x16_sha512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash)
{
const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
x16_sha512_gpu_hash_80 <<<grid, block >>> (threads, startNounce, (uint64_t*)d_hash);
}
__host__
void x16_sha512_setBlock_80(void *pdata)
{
cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
}
Loading…
Cancel
Save