diff --git a/Makefile.am b/Makefile.am
index 01fc6b1..0d802ed 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -76,6 +76,8 @@ ccminer_SOURCES = elist.h miner.h compat.h \
x13/hsr.cu x13/cuda_hsr_sm3.cu x13/sm3.c \
x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \
x15/whirlpool.cu x15/cuda_x15_whirlpool_sm3.cu \
+ x16r/x16r.cu x16r/cuda_x16_echo512.cu x16r/cuda_x16_fugue512.cu \
+ x16r/cuda_x16_shabal512.cu x16r/cuda_x16_simd512_80.cu \
x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \
x11/phi.cu x11/cuda_streebog_maxwell.cu \
x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu
diff --git a/README.txt b/README.txt
index 327ce31..74d38d1 100644
--- a/README.txt
+++ b/README.txt
@@ -1,5 +1,5 @@
-ccminer 2.2.4 (Jan. 2018) "lyra2v2 and keccak improvements"
+ccminer 2.2.5 (Feb 2018) "x16r algo"
---------------------------------------------------------------
***************************************************************
@@ -122,6 +122,7 @@ its command line interface and options.
x11 use to mine DarkCoin
x14 use to mine X14Coin
x15 use to mine Halcyon
+ x16r use to mine Raven
x17 use to mine X17
vanilla use to mine Vanilla (Blake256)
veltor use to mine VeltorCoin
@@ -277,6 +278,9 @@ so we can more efficiently implement new algorithms using the latest hardware
features.
>>> RELEASE HISTORY <<<
+ Feb. 2018 v2.2.5
+ New x16r algo
+
Jan. 04th 2017 v2.2.4
Improve lyra2v2
Higher keccak default intensity
diff --git a/algos.h b/algos.h
index c77b5e8..014b4cb 100644
--- a/algos.h
+++ b/algos.h
@@ -59,6 +59,7 @@ enum sha_algos {
ALGO_X13,
ALGO_X14,
ALGO_X15,
+ ALGO_X16R,
ALGO_X17,
ALGO_VANILLA,
ALGO_VELTOR,
@@ -128,6 +129,7 @@ static const char *algo_names[] = {
"x13",
"x14",
"x15",
+ "x16r",
"x17",
"vanilla",
"veltor",
diff --git a/bench.cpp b/bench.cpp
index baa999d..0fe248f 100644
--- a/bench.cpp
+++ b/bench.cpp
@@ -102,6 +102,7 @@ void algo_free_all(int thr_id)
free_x13(thr_id);
free_x14(thr_id);
free_x15(thr_id);
+ free_x16r(thr_id);
free_x17(thr_id);
free_zr5(thr_id);
free_scrypt(thr_id);
diff --git a/ccminer.cpp b/ccminer.cpp
index 6c6d33f..46957e7 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -293,6 +293,7 @@ Options:\n\
x13 X13 (MaruCoin)\n\
x14 X14\n\
x15 X15\n\
+ x16r X16R (Raven)\n\
x17 X17\n\
wildkeccak Boolberry\n\
zr5 ZR5 (ZiftrCoin)\n\
@@ -1705,6 +1706,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
case ALGO_LYRA2Z:
case ALGO_TIMETRAVEL:
case ALGO_BITCORE:
+ case ALGO_X16R:
work_set_target(work, sctx->job.diff / (256.0 * opt_difficulty));
break;
case ALGO_KECCAK:
@@ -2499,6 +2501,9 @@ static void *miner_thread(void *userdata)
case ALGO_X15:
rc = scanhash_x15(thr_id, &work, max_nonce, &hashes_done);
break;
+ case ALGO_X16R:
+ rc = scanhash_x16r(thr_id, &work, max_nonce, &hashes_done);
+ break;
case ALGO_X17:
rc = scanhash_x17(thr_id, &work, max_nonce, &hashes_done);
break;
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index 2a37505..5fe11d7 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -269,6 +269,7 @@
+
76
@@ -434,6 +435,12 @@
+
+
+
+
+
+
@@ -587,8 +594,7 @@
-
-
+
@@ -615,4 +621,4 @@
-
+
\ No newline at end of file
diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
index a508e22..a8f149e 100644
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@@ -58,6 +58,9 @@
{a2403c22-6777-46ab-a55a-3fcc7386c974}
+
+ {55dfae6a-66ba-43e2-8ceb-98ee70cbdf16}
+
{85dfae6a-66ca-4332-8cec-98ee70cbdf2f}
@@ -596,6 +599,9 @@
Source Files\equi
+
+ Header Files\CUDA
+
@@ -967,6 +973,24 @@
Source Files\equi
+
+ Source Files\CUDA\x15
+
+
+ Source Files\CUDA\x16r
+
+
+ Source Files\CUDA\x16r
+
+
+ Source Files\CUDA\x16r
+
+
+ Source Files\CUDA\x16r
+
+
+ Source Files\CUDA\x16r
+
@@ -983,4 +1007,4 @@
Ressources
-
+
\ No newline at end of file
diff --git a/compat/ccminer-config.h b/compat/ccminer-config.h
index d3aeabc..17efd4c 100644
--- a/compat/ccminer-config.h
+++ b/compat/ccminer-config.h
@@ -164,7 +164,7 @@
#define PACKAGE_URL "http://github.com/tpruvot/ccminer"
/* Define to the version of this package. */
-#define PACKAGE_VERSION "2.2.4"
+#define PACKAGE_VERSION "2.2.5"
/* If using the C implementation of alloca, define if you know the
direction of stack growth for your system; otherwise it will be
diff --git a/miner.h b/miner.h
index 0ad8523..86b1c1d 100644
--- a/miner.h
+++ b/miner.h
@@ -325,6 +325,7 @@ extern int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, unsig
extern int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_x14(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_x15(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
+extern int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_zr5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
@@ -389,6 +390,7 @@ extern void free_x11(int thr_id);
extern void free_x13(int thr_id);
extern void free_x14(int thr_id);
extern void free_x15(int thr_id);
+extern void free_x16r(int thr_id);
extern void free_x17(int thr_id);
extern void free_zr5(int thr_id);
//extern void free_sha256d(int thr_id);
@@ -933,6 +935,7 @@ void x11hash(void *output, const void *input);
void x13hash(void *output, const void *input);
void x14hash(void *output, const void *input);
void x15hash(void *output, const void *input);
+void x16r_hash(void *output, const void *input);
void x17hash(void *output, const void *input);
void wildkeccak_hash(void *output, const void *input, uint64_t* scratchpad, uint64_t ssize);
void zr5hash(void *output, const void *input);
diff --git a/res/ccminer.rc b/res/ccminer.rc
index 832e7be..fa9a8a9 100644
--- a/res/ccminer.rc
+++ b/res/ccminer.rc
@@ -60,8 +60,8 @@ IDI_ICON1 ICON "ccminer.ico"
//
VS_VERSION_INFO VERSIONINFO
- FILEVERSION 2,2,4,0
- PRODUCTVERSION 2,2,4,0
+ FILEVERSION 2,2,5,0
+ PRODUCTVERSION 2,2,5,0
FILEFLAGSMASK 0x3fL
#ifdef _DEBUG
FILEFLAGS 0x21L
@@ -76,10 +76,10 @@ BEGIN
BEGIN
BLOCK "040904e4"
BEGIN
- VALUE "FileVersion", "2.2.4"
+ VALUE "FileVersion", "2.2.5"
VALUE "LegalCopyright", "Copyright (C) 2018"
VALUE "ProductName", "ccminer"
- VALUE "ProductVersion", "2.2.4"
+ VALUE "ProductVersion", "2.2.5"
END
END
BLOCK "VarFileInfo"
diff --git a/util.cpp b/util.cpp
index 3141305..f5a922d 100644
--- a/util.cpp
+++ b/util.cpp
@@ -2325,6 +2325,9 @@ void print_hash_tests(void)
x15hash(&hash[0], &buf[0]);
printpfx("X15", hash);
+ x16r_hash(&hash[0], &buf[0]);
+ printpfx("X16r", hash);
+
x17hash(&hash[0], &buf[0]);
printpfx("X17", hash);
diff --git a/x13/cuda_x13_hamsi512.cu b/x13/cuda_x13_hamsi512.cu
index 69070fb..3c21f7f 100644
--- a/x13/cuda_x13_hamsi512.cu
+++ b/x13/cuda_x13_hamsi512.cu
@@ -1,6 +1,6 @@
/*
- * Quick Hamsi-512 for X13
- * by tsiv - 2014
+ * Quick Hamsi-512 for X13 by tsiv - 2014
+ * + Hamsi-512 80 by tpruvot - 2018
*/
#include
@@ -16,31 +16,17 @@ static __constant__ uint32_t d_alpha_f[32];
static __constant__ uint32_t d_T512[64][16];
static const uint32_t alpha_n[] = {
- SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
- SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00),
- SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc),
- SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0),
- SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0),
- SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0),
- SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00),
- SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc),
- SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0),
- SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0),
- SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0)
+ 0xff00f0f0, 0xccccaaaa, 0xf0f0cccc, 0xff00aaaa, 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, 0xf0f0ff00,
+ 0xf0f0cccc, 0xaaaaff00, 0xccccff00, 0xaaaaf0f0, 0xaaaaf0f0, 0xff00cccc, 0xccccf0f0, 0xff00aaaa,
+ 0xccccaaaa, 0xff00f0f0, 0xff00aaaa, 0xf0f0cccc, 0xf0f0ff00, 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc,
+ 0xaaaaff00, 0xf0f0cccc, 0xaaaaf0f0, 0xccccff00, 0xff00cccc, 0xaaaaf0f0, 0xff00aaaa, 0xccccf0f0
};
static const uint32_t alpha_f[] = {
- SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0),
- SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9),
- SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0),
- SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c),
- SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c),
- SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c),
- SPH_C32(0xcaf9f9c0), SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9),
- SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0),
- SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c),
- SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c),
- SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c)
+ 0xcaf9639c, 0x0ff0f9c0, 0x639c0ff0, 0xcaf9f9c0, 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0, 0x639ccaf9,
+ 0x639c0ff0, 0xf9c0caf9, 0x0ff0caf9, 0xf9c0639c, 0xf9c0639c, 0xcaf90ff0, 0x0ff0639c, 0xcaf9f9c0,
+ 0x0ff0f9c0, 0xcaf9639c, 0xcaf9f9c0, 0x639c0ff0, 0x639ccaf9, 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0,
+ 0xf9c0caf9, 0x639c0ff0, 0xf9c0639c, 0x0ff0caf9, 0xcaf90ff0, 0xf9c0639c, 0xcaf9f9c0, 0x0ff0639c
};
#define hamsi_s00 m0
@@ -200,390 +186,134 @@ static const uint32_t alpha_f[] = {
static const uint32_t T512[64][16] = {
- { SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000),
- SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9),
- SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030),
- SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000),
- SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984),
- SPH_C32(0x9e69af68) },
- { SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000),
- SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137),
- SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240),
- SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000),
- SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5),
- SPH_C32(0x0c26f262) },
- { SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000),
- SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78),
- SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400),
- SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000),
- SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f),
- SPH_C32(0xdc24e61f) },
- { SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000),
- SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549),
- SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800),
- SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000),
- SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f),
- SPH_C32(0x3daac2da) },
- { SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000),
- SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1),
- SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800),
- SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000),
- SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da),
- SPH_C32(0x78cace29) },
- { SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000),
- SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4),
- SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400),
- SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000),
- SPH_C32(0xea982435), SPH_C32(0x75b11115), SPH_C32(0x28b67247),
- SPH_C32(0x2dd1f9ab) },
- { SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000),
- SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745),
- SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00),
- SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000),
- SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f),
- SPH_C32(0xbf2c0be2) },
- { SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000),
- SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93),
- SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000),
- SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000),
- SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36),
- SPH_C32(0x32219526) },
- { SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000),
- SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae),
- SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001),
- SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000),
- SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f),
- SPH_C32(0xac8e6c88) },
- { SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000),
- SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3),
- SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004),
- SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000),
- SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96),
- SPH_C32(0x7b1bd6b9) },
- { SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000),
- SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba),
- SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000),
- SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000),
- SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604),
- SPH_C32(0xf746c320) },
- { SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000),
- SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c),
- SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009),
- SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000),
- SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a),
- SPH_C32(0x69505b3a) },
- { SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000),
- SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25),
- SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050),
- SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000),
- SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2),
- SPH_C32(0x8a341574) },
- { SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000),
- SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19),
- SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0),
- SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000),
- SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc),
- SPH_C32(0x450360bf) },
- { SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000),
- SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543),
- SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060),
- SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000),
- SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d),
- SPH_C32(0xf3d45758) },
- { SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000),
- SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825),
- SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110),
- SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000),
- SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25),
- SPH_C32(0x925c44e9) },
- { SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000),
- SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514),
- SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000),
- SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000),
- SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315),
- SPH_C32(0xa123ff9f) },
- { SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000),
- SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860),
- SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000),
- SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000),
- SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e),
- SPH_C32(0x1568ff0f) },
- { SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000),
- SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6),
- SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000),
- SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000),
- SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616),
- SPH_C32(0xc5c1eb3e) },
- { SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000),
- SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145),
- SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000),
- SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000),
- SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6),
- SPH_C32(0x1af21fe1) },
- { SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000),
- SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae),
- SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000),
- SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000),
- SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17),
- SPH_C32(0x857f3c2b) },
- { SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000),
- SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2),
- SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000),
- SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000),
- SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94),
- SPH_C32(0x2ba05a55) },
- { SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000),
- SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757),
- SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001),
- SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000),
- SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba),
- SPH_C32(0xfeabf254) },
- { SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000),
- SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5),
- SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002),
- SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000),
- SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7),
- SPH_C32(0xfe1cdc7f) },
- { SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000),
- SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea),
- SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000),
- SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000),
- SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea),
- SPH_C32(0xb0a51834) },
- { SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000),
- SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460),
- SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000),
- SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000),
- SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae),
- SPH_C32(0xa6b8c28d) },
- { SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000),
- SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75),
- SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000),
- SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000),
- SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156),
- SPH_C32(0x3a4e99d7) },
- { SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000),
- SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15),
- SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000),
- SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000),
- SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6),
- SPH_C32(0xe1844257) },
- { SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000),
- SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512),
- SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000),
- SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000),
- SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37),
- SPH_C32(0x2c3b504e) },
- { SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000),
- SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856),
- SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000),
- SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000),
- SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4),
- SPH_C32(0x524a0d59) },
- { SPH_C32(0x69510000), SPH_C32(0xd4e1009c), SPH_C32(0xc3230000),
- SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc),
- SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000),
- SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000),
- SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88),
- SPH_C32(0x378dd173) },
- { SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000),
- SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f),
- SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000),
- SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000),
- SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4),
- SPH_C32(0x8b6c72bd) },
- { SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780),
- SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418),
- SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000),
- SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000),
- SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d),
- SPH_C32(0x8e67b7fa) },
- { SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280),
- SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc),
- SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000),
- SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000),
- SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec),
- SPH_C32(0x443d3004) },
- { SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80),
- SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924),
- SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000),
- SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000),
- SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a),
- SPH_C32(0xf4f6ea7b) },
- { SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300),
- SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667),
- SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000),
- SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000),
- SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8),
- SPH_C32(0x979961d0) },
- { SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380),
- SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6),
- SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000),
- SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000),
- SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812),
- SPH_C32(0x98aa496e) },
- { SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180),
- SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e),
- SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000),
- SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000),
- SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec),
- SPH_C32(0x094e3198) },
- { SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000),
- SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736),
- SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000),
- SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000),
- SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76),
- SPH_C32(0xe86cba2e) },
- { SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000),
- SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431),
- SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000),
- SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000),
- SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9),
- SPH_C32(0x4b7eec55) },
- { SPH_C32(0x58430000), SPH_C32(0x807e0000), SPH_C32(0x78330001),
- SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd),
- SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000),
- SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800),
- SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429),
- SPH_C32(0x1e7536a6) },
- { SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000),
- SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0),
- SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000),
- SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000),
- SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46),
- SPH_C32(0x24314f17) },
- { SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e),
- SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d),
- SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000),
- SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000),
- SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222),
- SPH_C32(0x9075b1ce) },
- { SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a),
- SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4),
- SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000),
- SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000),
- SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa),
- SPH_C32(0x9b6ef888) },
- { SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e),
- SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167),
- SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000),
- SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000),
- SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e),
- SPH_C32(0xd8b61463) },
- { SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c),
- SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43),
- SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000),
- SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000),
- SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2),
- SPH_C32(0x3ea660f7) },
- { SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e),
- SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce),
- SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000),
- SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000),
- SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018),
- SPH_C32(0x7f975691) },
- { SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006),
- SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9),
- SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000),
- SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000),
- SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd),
- SPH_C32(0x2c94459e) },
- { SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000),
- SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da),
- SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000),
- SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0),
- SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c),
- SPH_C32(0x56a7b19f) },
- { SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000),
- SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699),
- SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000),
- SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220),
- SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8),
- SPH_C32(0x81fdf908) },
- { SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000),
- SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d),
- SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000),
- SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060),
- SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06),
- SPH_C32(0x5bd61539) },
- { SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000),
- SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751),
- SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000),
- SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480),
- SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f),
- SPH_C32(0x15b961e7) },
- { SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000),
- SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6),
- SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000),
- SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800),
- SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14),
- SPH_C32(0x2a2c18f0) },
- { SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000),
- SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a),
- SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000),
- SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000),
- SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23),
- SPH_C32(0x551e3d6e) },
- { SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000),
- SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da),
- SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000),
- SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000),
- SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254),
- SPH_C32(0x33c5244f) },
- { SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000),
- SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199),
- SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000),
- SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800),
- SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c),
- SPH_C32(0x8a58e6a4) },
- { SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000),
- SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f),
- SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000),
- SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002),
- SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808),
- SPH_C32(0xda878000) },
- { SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000),
- SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a),
- SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000),
- SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005),
- SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb),
- SPH_C32(0x3c5dfffe) },
- { SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000),
- SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e),
- SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000),
- SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003),
- SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752),
- SPH_C32(0x7b1675d7) },
- { SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000),
- SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c),
- SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000),
- SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008),
- SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3),
- SPH_C32(0x2879ebac) },
- { SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000),
- SPH_C32(0x64f30013), SPH_C32(0x257e86bf), SPH_C32(0x1311944e),
- SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000),
- SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001),
- SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60),
- SPH_C32(0xbe0a679e) },
- { SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000),
- SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87),
- SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000),
- SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012),
- SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf),
- SPH_C32(0x30aebcf7) },
- { SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000),
- SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57),
- SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000),
- SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0),
- SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03),
- SPH_C32(0xc7ff60f0) },
- { SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000),
- SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb),
- SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000),
- SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140),
- SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877),
- SPH_C32(0xe7e00a94) }
+{ 0xef0b0270, 0x3afd0000, 0x5dae0000, 0x69490000, 0x9b0f3c06, 0x4405b5f9, 0x66140a51, 0x924f5d0a, // 0
+ 0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000, 0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68 },
+{ 0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000, 0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68,
+ 0x26600240, 0xddd80000, 0x722a0000, 0x4f060000, 0x936667ff, 0x29f944ce, 0x368b63d5, 0x0c26f262 },
+{ 0x145a3c00, 0xb9e90000, 0x61270000, 0xf1610000, 0xce613d6c, 0xb0493d78, 0x47a96720, 0xe18e24c5,
+ 0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000, 0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f },
+{ 0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000, 0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f,
+ 0x373d2800, 0x71500000, 0x95e00000, 0x0a140000, 0xbdac1909, 0x48ef9831, 0x456d6d1f, 0x3daac2da },
+{ 0x54285c00, 0xeaed0000, 0xc5d60000, 0xa1c50000, 0xb3a26770, 0x94a5c4e1, 0x6bb0419d, 0x551b3782,
+ 0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000, 0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29 },
+{ 0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000, 0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29,
+ 0xc8934400, 0x5a3e0000, 0x57870000, 0x4c560000, 0xea982435, 0x75b11115, 0x28b67247, 0x2dd1f9ab },
+{ 0x29449c00, 0x64e70000, 0xf24b0000, 0xc2f30000, 0x0ede4e8f, 0x56c23745, 0xf3e04259, 0x8d0d9ec4,
+ 0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000, 0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2 },
+{ 0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000, 0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2, // 7
+ 0x6f299000, 0x6c850000, 0x2f160000, 0x782e0000, 0x644c37cd, 0x12dd1cd6, 0xd26a8c36, 0x32219526 },
+{ 0xf6800005, 0x3443c000, 0x24070000, 0x8f3d0000, 0x21373bfb, 0x0ab8d5ae, 0xcdc58b19, 0xd795ba31,
+ 0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000, 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88 },
+{ 0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000, 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88,
+ 0x50ff0004, 0x45744000, 0x3dfb0000, 0x19e60000, 0x1bbc5606, 0xe1727b5d, 0xe1a8cc96, 0x7b1bd6b9 },
+{ 0xf7750009, 0xcf3cc000, 0xc3d60000, 0x04920000, 0x029519a9, 0xf8e836ba, 0x7a87f14e, 0x9e16981a,
+ 0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000, 0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320 },
+{ 0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000, 0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320,
+ 0x231f0009, 0x42f40000, 0x66790000, 0x4ebb0000, 0xfedb5bd3, 0x315cb0d6, 0xe2b1674a, 0x69505b3a },
+{ 0x774400f0, 0xf15a0000, 0xf5b20000, 0x34140000, 0x89377e8c, 0x5a8bec25, 0x0bc3cd1e, 0xcf3775cb,
+ 0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000, 0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574 },
+{ 0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000, 0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574,
+ 0x832800a0, 0x67420000, 0xe1170000, 0x370b0000, 0xcba30034, 0x3c34923c, 0x9767bdcc, 0x450360bf },
+{ 0xe8870170, 0x9d720000, 0x12db0000, 0xd4220000, 0xf2886b27, 0xa921e543, 0x4ef8b518, 0x618813b1, // 14
+ 0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000, 0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758 },
+{ 0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000, 0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758,
+ 0x5cb00110, 0x913e0000, 0x44190000, 0x888c0000, 0x66dc7418, 0x921f1d66, 0x55ceea25, 0x925c44e9 },
+{ 0x0c720000, 0x49e50f00, 0x42790000, 0x5cea0000, 0x33aa301a, 0x15822514, 0x95a34b7b, 0xb44b0090,
+ 0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000, 0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f },
+{ 0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000, 0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f,
+ 0xf2500000, 0xeebd0a00, 0x67a80000, 0xab8a0000, 0xba9b48c0, 0x0a56dd74, 0xdb73e86e, 0x1568ff0f },
+{ 0x45180000, 0xa5b51700, 0xf96a0000, 0x3b480000, 0x1ecc142c, 0x231395d6, 0x16bca6b0, 0xdf33f4df,
+ 0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000, 0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e },
+{ 0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000, 0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e,
+ 0xfd250000, 0xb3c41100, 0xcef00000, 0xcef90000, 0x3c4d7580, 0x8d5b6493, 0x7098b0a6, 0x1af21fe1 },
+{ 0x75a40000, 0xc28b2700, 0x94a40000, 0x90f50000, 0xfb7857e0, 0x49ce0bae, 0x1767c483, 0xaedf667e,
+ 0xd1660000, 0x1bbc0300, 0x9eec0000, 0xf6940000, 0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b },
+{ 0xd1660000, 0x1bbc0300, 0x9eec0000, 0xf6940000, 0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b, // 21
+ 0xa4c20000, 0xd9372400, 0x0a480000, 0x66610000, 0xf87a12c7, 0x86bef75c, 0xa324df94, 0x2ba05a55 },
+{ 0x75c90003, 0x0e10c000, 0xd1200000, 0xbaea0000, 0x8bc42f3e, 0x8758b757, 0xbb28761d, 0x00b72e2b,
+ 0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000, 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254 },
+{ 0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000, 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254,
+ 0x9b060002, 0x61468000, 0x221e0000, 0x1d740000, 0x36715d27, 0x30495c92, 0xf11336a7, 0xfe1cdc7f },
+{ 0x86790000, 0x3f390002, 0xe19ae000, 0x98560000, 0x9565670e, 0x4e88c8ea, 0xd3dd4944, 0x161ddab9,
+ 0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000, 0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834 },
+{ 0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000, 0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834,
+ 0xb6ce0000, 0xdae90002, 0x156e8000, 0xda920000, 0xf6dd5a64, 0x36325c8a, 0xf272e8ae, 0xa6b8c28d },
+{ 0x14190000, 0x23ca003c, 0x50df0000, 0x44b60000, 0x1b6c67b0, 0x3cf3ac75, 0x61e610b0, 0xdbcadb80,
+ 0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000, 0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7 },
+{ 0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000, 0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7,
+ 0xf75a0000, 0x19840028, 0xa2190000, 0xeef80000, 0xc0722516, 0x19981260, 0x73dba1e6, 0xe1844257 },
+{ 0x54500000, 0x0671005c, 0x25ae0000, 0x6a1e0000, 0x2ea54edf, 0x664e8512, 0xbfba18c3, 0x7e715d17, // 28
+ 0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000, 0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e },
+{ 0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000, 0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e,
+ 0xe8dd0000, 0xfa4a0044, 0x3c2d0000, 0xbb150000, 0x80bd361b, 0x24e81d44, 0xbfa8c2f4, 0x524a0d59 },
+{ 0x69510000, 0xd4e1009c, 0xc3230000, 0xac2f0000, 0xe4950bae, 0xcea415dc, 0x87ec287c, 0xbce1a3ce,
+ 0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000, 0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173 },
+{ 0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000, 0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173,
+ 0xaf220000, 0x7b6c0090, 0x67e20000, 0x8da20000, 0xc7841e29, 0xb7b744f3, 0x9ac484f4, 0x8b6c72bd },
+{ 0xcc140000, 0xa5630000, 0x5ab90780, 0x3b500000, 0x4bd013ff, 0x879b3418, 0x694348c1, 0xca5a87fe,
+ 0x819e0000, 0xec570000, 0x66320280, 0x95f30000, 0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa },
+{ 0x819e0000, 0xec570000, 0x66320280, 0x95f30000, 0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa,
+ 0x4d8a0000, 0x49340000, 0x3c8b0500, 0xaea30000, 0x16793bfd, 0xcf6f08a4, 0x8f19eaec, 0x443d3004 },
+{ 0x78230000, 0x12fc0000, 0xa93a0b80, 0x90a50000, 0x713e2879, 0x7ee98924, 0xf08ca062, 0x636f8bab,
+ 0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000, 0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b },
+{ 0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000, 0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b, // 35
+ 0x7a8c0000, 0xa5d40000, 0x13260880, 0xc63d0000, 0xcbb36daa, 0xfea14f43, 0x59d0b4f8, 0x979961d0 },
+{ 0xac480000, 0x1ba60000, 0x45fb1380, 0x03430000, 0x5a85316a, 0x1fb250b6, 0xfe72c7fe, 0x91e478f6,
+ 0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000, 0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e },
+{ 0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000, 0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e,
+ 0xb2060000, 0xc5690000, 0x28031200, 0x74670000, 0xb6c236f4, 0xeb1239f8, 0x33d1dfec, 0x094e3198 },
+{ 0xaec30000, 0x9c4f0001, 0x79d1e000, 0x2c150000, 0x45cc75b3, 0x6650b736, 0xab92f78f, 0xa312567b,
+ 0xdb250000, 0x09290000, 0x49aac000, 0x81e10000, 0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e },
+{ 0xdb250000, 0x09290000, 0x49aac000, 0x81e10000, 0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e,
+ 0x75e60000, 0x95660001, 0x307b2000, 0xadf40000, 0x8f321eea, 0x24298307, 0xe8c49cf9, 0x4b7eec55 },
+{ 0x58430000, 0x807e0000, 0x78330001, 0xc66b3800, 0xe7375cdc, 0x79ad3fdd, 0xac73fe6f, 0x3a4479b1,
+ 0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800, 0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6 },
+{ 0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800, 0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6,
+ 0x45190000, 0xab0c0000, 0x30be0001, 0x690a2000, 0xc2fc7219, 0xb1d4800d, 0x2dd1fa46, 0x24314f17 },
+{ 0xa53b0000, 0x14260000, 0x4e30001e, 0x7cae0000, 0x8f9e0dd5, 0x78dfaa3d, 0xf73168d8, 0x0b1b4946, // 42
+ 0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000, 0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce },
+{ 0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000, 0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce,
+ 0xa2d60000, 0xa6760000, 0xc9440014, 0xeba30000, 0xccec2e7b, 0x3018c499, 0x03490afa, 0x9b6ef888 },
+{ 0x88980000, 0x1f940000, 0x7fcf002e, 0xfb4e0000, 0xf158079a, 0x61ae9167, 0xa895706c, 0xe6107494,
+ 0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000, 0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463 },
+{ 0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000, 0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463,
+ 0x835a0000, 0xc4f70000, 0x01470022, 0xeec80000, 0x60a54f69, 0x142f2a24, 0x5cf534f2, 0x3ea660f7 },
+{ 0x52500000, 0x29540000, 0x6a61004e, 0xf0ff0000, 0x9a317eec, 0x452341ce, 0xcf568fe5, 0x5303130f,
+ 0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000, 0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691 },
+{ 0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000, 0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691,
+ 0x01dd0000, 0x80a80000, 0xf4960048, 0xa6000000, 0x90d57ea2, 0xd7e68c37, 0x6612cffd, 0x2c94459e },
+{ 0xe6280000, 0x4c4b0000, 0xa8550000, 0xd3d002e0, 0xd86130b8, 0x98a7b0da, 0x289506b4, 0xd75a4897,
+ 0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0, 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f },
+{ 0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0, 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f, // 49
+ 0x16ed0000, 0x15680000, 0xedd70000, 0x325d0220, 0xe30c3689, 0x5a4ae643, 0xe375f8a8, 0x81fdf908 },
+{ 0xb4310000, 0x77330000, 0xb15d0000, 0x7fd004e0, 0x78a26138, 0xd116c35d, 0xd256d489, 0x4e6f74de,
+ 0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060, 0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539 },
+{ 0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060, 0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539,
+ 0x57370000, 0xcaf20000, 0x364e0000, 0xc0220480, 0x56186b22, 0x5ca3f40c, 0xa1937f8f, 0x15b961e7 },
+{ 0x02f20000, 0xa2810000, 0x873f0000, 0xe36c7800, 0x1e1d74ef, 0x073d2bd6, 0xc4c23237, 0x7f32259e,
+ 0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800, 0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0 },
+{ 0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800, 0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0,
+ 0xb82f0000, 0xb12c0000, 0x30d80000, 0x14445000, 0xc15860a2, 0x3127e8ec, 0x2e98bf23, 0x551e3d6e },
+{ 0x1e6c0000, 0xc4420000, 0x8a2e0000, 0xbcb6b800, 0x2c4413b6, 0x8bfdd3da, 0x6a0c1bc8, 0xb99dc2eb,
+ 0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000, 0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f },
+{ 0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000, 0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f,
+ 0x8c3a0000, 0xda980000, 0x607f0000, 0x54078800, 0x85714513, 0x6006b243, 0xdb50399c, 0x8a58e6a4 },
+{ 0x033d0000, 0x08b30000, 0xf33a0000, 0x3ac20007, 0x51298a50, 0x6b6e661f, 0x0ea5cfe3, 0xe6da7ffe, // 56
+ 0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002, 0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000 },
+{ 0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002, 0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000,
+ 0xabe70000, 0x9e0d0000, 0xaf270000, 0x3d180005, 0x2c4f1fd3, 0x74f61695, 0xb5c347eb, 0x3c5dfffe },
+{ 0x01930000, 0xe7820000, 0xedfb0000, 0xcf0c000b, 0x8dd08d58, 0xbca3b42e, 0x063661e1, 0x536f9e7b,
+ 0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003, 0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7 },
+{ 0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003, 0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7,
+ 0x93bb0000, 0x3b070000, 0xba010000, 0x99d00008, 0x3739ae4e, 0xe64c1722, 0x96f896b3, 0x2879ebac },
+{ 0x5fa80000, 0x56030000, 0x43ae0000, 0x64f30013, 0x257e86bf, 0x1311944e, 0x541e95bf, 0x8ea4db69,
+ 0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001, 0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e },
+{ 0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001, 0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e,
+ 0x5fec0000, 0x294b0000, 0x99d20000, 0x4ed00012, 0x1ed34f73, 0xbaa708c9, 0x57140bdf, 0x30aebcf7 },
+{ 0xee930000, 0xd6070000, 0x92c10000, 0x2b9801e0, 0x9451287c, 0x3b6cfb57, 0x45312374, 0x201f6a64,
+ 0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0, 0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0 },
+{ 0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0, 0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0,
+ 0x95bb0000, 0x81450000, 0x3b240000, 0x48db0140, 0x0a8a6c53, 0x56f56eec, 0x62c91877, 0xe7e00a94 }
};
__global__
@@ -598,12 +328,12 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *
uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
unsigned char *h1 = (unsigned char *)Hash;
- uint32_t c0 = SPH_C32(0x73746565), c1 = SPH_C32(0x6c706172), c2 = SPH_C32(0x6b204172), c3 = SPH_C32(0x656e6265);
- uint32_t c4 = SPH_C32(0x72672031), c5 = SPH_C32(0x302c2062), c6 = SPH_C32(0x75732032), c7 = SPH_C32(0x3434362c);
- uint32_t c8 = SPH_C32(0x20422d33), c9 = SPH_C32(0x30303120), cA = SPH_C32(0x4c657576), cB = SPH_C32(0x656e2d48);
- uint32_t cC = SPH_C32(0x65766572), cD = SPH_C32(0x6c65652c), cE = SPH_C32(0x2042656c), cF = SPH_C32(0x6769756d);
- uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, mD, mE, mF;
+ uint32_t c0 = 0x73746565, c1 = 0x6c706172, c2 = 0x6b204172, c3 = 0x656e6265;
+ uint32_t c4 = 0x72672031, c5 = 0x302c2062, c6 = 0x75732032, c7 = 0x3434362c;
+ uint32_t c8 = 0x20422d33, c9 = 0x30303120, cA = 0x4c657576, cB = 0x656e2d48;
+ uint32_t cC = 0x65766572, cD = 0x6c65652c, cE = 0x2042656c, cF = 0x6769756d;
uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
+ uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, mD, mE, mF;
uint32_t *tp, db, dm;
for(int i = 0; i < 64; i += 8) {
@@ -637,16 +367,16 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *
T_BIG;
}
+ // precomputed for 64 bytes blocks ?
tp = &d_T512[0][0] + 112;
-
- m0 = *(tp+ 0); m1 = *(tp+ 1);
- m2 = *(tp+ 2); m3 = *(tp+ 3);
- m4 = *(tp+ 4); m5 = *(tp+ 5);
- m6 = *(tp+ 6); m7 = *(tp+ 7);
- m8 = *(tp+ 8); m9 = *(tp+ 9);
- mA = *(tp+10); mB = *(tp+11);
- mC = *(tp+12); mD = *(tp+13);
- mE = *(tp+14); mF = *(tp+15);
+ m0 = tp[ 0]; m1 = tp[ 1];
+ m2 = tp[ 2]; m3 = tp[ 3];
+ m4 = tp[ 4]; m5 = tp[ 5];
+ m6 = tp[ 6]; m7 = tp[ 7];
+ m8 = tp[ 8]; m9 = tp[ 9];
+ mA = tp[10]; mB = tp[11];
+ mC = tp[12]; mD = tp[13];
+ mE = tp[14]; mF = tp[15];
for( int r = 0; r < 6; r += 2 ) {
ROUND_BIG(r, d_alpha_n);
@@ -655,15 +385,14 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *
T_BIG;
tp = &d_T512[0][0] + 784;
-
- m0 = *(tp+ 0); m1 = *(tp+ 1);
- m2 = *(tp+ 2); m3 = *(tp+ 3);
- m4 = *(tp+ 4); m5 = *(tp+ 5);
- m6 = *(tp+ 6); m7 = *(tp+ 7);
- m8 = *(tp+ 8); m9 = *(tp+ 9);
- mA = *(tp+10); mB = *(tp+11);
- mC = *(tp+12); mD = *(tp+13);
- mE = *(tp+14); mF = *(tp+15);
+ m0 = tp[ 0]; m1 = tp[ 1];
+ m2 = tp[ 2]; m3 = tp[ 3];
+ m4 = tp[ 4]; m5 = tp[ 5];
+ m6 = tp[ 6]; m7 = tp[ 7];
+ m8 = tp[ 8]; m9 = tp[ 9];
+ mA = tp[10]; mB = tp[11];
+ mC = tp[12]; mD = tp[13];
+ mE = tp[14]; mF = tp[15];
for( int r = 0; r < 12; r += 2 ) {
ROUND_BIG(r, d_alpha_f);
@@ -696,3 +425,127 @@ void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce
x13_hamsi512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
//MyStreamSynchronize(NULL, order, thr_id);
}
+
+__constant__ static uint64_t c_PaddedMessage80[10];
+
+__host__
+void x16_hamsi512_setBlock_80(void *pdata)
+{
+ cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
+}
+
+__global__
+void x16_hamsi512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash)
+{
+ uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ if (thread < threads)
+ {
+ unsigned char h1[80];
+ #pragma unroll
+ for (int i = 0; i < 10; i++)
+ ((uint2*)h1)[i] = ((uint2*)c_PaddedMessage80)[i];
+ //((uint64_t*)h1)[9] = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(startNonce + thread));
+ ((uint32_t*)h1)[19] = cuda_swab32(startNonce + thread);
+
+ uint32_t c0 = 0x73746565, c1 = 0x6c706172, c2 = 0x6b204172, c3 = 0x656e6265;
+ uint32_t c4 = 0x72672031, c5 = 0x302c2062, c6 = 0x75732032, c7 = 0x3434362c;
+ uint32_t c8 = 0x20422d33, c9 = 0x30303120, cA = 0x4c657576, cB = 0x656e2d48;
+ uint32_t cC = 0x65766572, cD = 0x6c65652c, cE = 0x2042656c, cF = 0x6769756d;
+ uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
+ uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, mD, mE, mF;
+ uint32_t *tp, db, dm;
+
+ for(int i = 0; i < 80; i += 8)
+ {
+ m0 = 0; m1 = 0; m2 = 0; m3 = 0; m4 = 0; m5 = 0; m6 = 0; m7 = 0;
+ m8 = 0; m9 = 0; mA = 0; mB = 0; mC = 0; mD = 0; mE = 0; mF = 0;
+ tp = &d_T512[0][0];
+
+ #pragma unroll
+ for (int u = 0; u < 8; u++) {
+ db = h1[i + u];
+ #pragma unroll 2
+ for (int v = 0; v < 8; v++, db >>= 1) {
+ dm = -(uint32_t)(db & 1);
+ m0 ^= dm & tp[ 0]; m1 ^= dm & tp[ 1];
+ m2 ^= dm & tp[ 2]; m3 ^= dm & tp[ 3];
+ m4 ^= dm & tp[ 4]; m5 ^= dm & tp[ 5];
+ m6 ^= dm & tp[ 6]; m7 ^= dm & tp[ 7];
+ m8 ^= dm & tp[ 8]; m9 ^= dm & tp[ 9];
+ mA ^= dm & tp[10]; mB ^= dm & tp[11];
+ mC ^= dm & tp[12]; mD ^= dm & tp[13];
+ mE ^= dm & tp[14]; mF ^= dm & tp[15];
+ tp += 16;
+ }
+ }
+
+ #pragma unroll
+ for (int r = 0; r < 6; r++) {
+ ROUND_BIG(r, d_alpha_n);
+ }
+ T_BIG;
+ }
+
+ #define INPUT_BIG { \
+ m0 = 0; m1 = 0; m2 = 0; m3 = 0; m4 = 0; m5 = 0; m6 = 0; m7 = 0; \
+ m8 = 0; m9 = 0; mA = 0; mB = 0; mC = 0; mD = 0; mE = 0; mF = 0; \
+ tp = &d_T512[0][0]; \
+ for (int u = 0; u < 8; u++) { \
+ db = endtag[u]; \
+ for (int v = 0; v < 8; v++, db >>= 1) { \
+ dm = -(uint32_t)(db & 1); \
+ m0 ^= dm & tp[ 0]; m1 ^= dm & tp[ 1]; \
+ m2 ^= dm & tp[ 2]; m3 ^= dm & tp[ 3]; \
+ m4 ^= dm & tp[ 4]; m5 ^= dm & tp[ 5]; \
+ m6 ^= dm & tp[ 6]; m7 ^= dm & tp[ 7]; \
+ m8 ^= dm & tp[ 8]; m9 ^= dm & tp[ 9]; \
+ mA ^= dm & tp[10]; mB ^= dm & tp[11]; \
+ mC ^= dm & tp[12]; mD ^= dm & tp[13]; \
+ mE ^= dm & tp[14]; mF ^= dm & tp[15]; \
+ tp += 16; \
+ } \
+ } \
+ }
+
+ // close
+ uint8_t endtag[8] = { 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
+ INPUT_BIG;
+
+ #pragma unroll
+ for (int r = 0; r < 6; r++) {
+ ROUND_BIG(r, d_alpha_n);
+ }
+ T_BIG;
+
+ endtag[0] = endtag[1] = 0x00;
+ endtag[6] = 0x02;
+ endtag[7] = 0x80;
+ INPUT_BIG;
+
+ // PF_BIG
+ #pragma unroll
+ for(int r = 0; r < 12; r++) {
+ ROUND_BIG(r, d_alpha_f);
+ }
+ T_BIG;
+
+ uint64_t hashPosition = thread;
+ uint32_t *Hash = (uint32_t*)&g_hash[hashPosition << 3];
+ #pragma unroll 16
+ for(int i = 0; i < 16; i++)
+ Hash[i] = cuda_swab32(h[i]);
+
+ #undef INPUT_BIG
+ }
+}
+
+__host__
+void x16_hamsi512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash)
+{
+ const uint32_t threadsperblock = 128;
+
+ dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+ dim3 block(threadsperblock);
+
+ x16_hamsi512_gpu_hash_80 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash);
+}
diff --git a/x15/cuda_x15_whirlpool_sm3.cu b/x15/cuda_x15_whirlpool_sm3.cu
index e2df3dc..3110a69 100644
--- a/x15/cuda_x15_whirlpool_sm3.cu
+++ b/x15/cuda_x15_whirlpool_sm3.cu
@@ -1998,7 +1998,7 @@ const int i0, const int i1, const int i2, const int i3, const int i4, const int
__global__
-void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash, int swab)
+void oldwhirlpool_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, void *outputHash, int swab)
{
__shared__ uint64_t sharedMemory[2048];
@@ -2014,7 +2014,8 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x];
#endif
}
- __threadfence_block(); // ensure shared mem is ready
+ //__threadfence_block(); // ensure shared mem is ready
+ __syncthreads();
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
@@ -2028,7 +2029,8 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
uint64_t state[8];
#pragma unroll 8
for (int i=0; i < 8; i++) {
- state[i] = c_PaddedMessage80[i];
+ //state[i] = c_PaddedMessage80[i];
+ AS_UINT2(&state[i]) = AS_UINT2(&c_PaddedMessage80[i]);
}
#else
#pragma unroll 8
@@ -2050,6 +2052,7 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
state[i] = xor1(n[i],c_PaddedMessage80[i]);
}
#endif
+
/// round 2 ///////
//////////////////////////////////
n[0] = c_PaddedMessage80[8]; //read data
@@ -2331,7 +2334,7 @@ extern uint32_t whirlpool512_finalhash_64(int thr_id, uint32_t threads, uint32_t
}
__host__
-void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash)
+void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash)
{
dim3 grid((threads + threadsperblock-1) / threadsperblock);
dim3 block(threadsperblock);
@@ -2339,7 +2342,7 @@ void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNounce
if (threads < 256)
applog(LOG_WARNING, "whirlpool requires a minimum of 256 threads to fetch constant tables!");
- oldwhirlpool_gpu_hash_80<<<grid, block>>>(threads, startNounce, d_outputHash, 1);
+ oldwhirlpool_gpu_hash_80<<<grid, block>>>(threads, startNonce, d_outputHash, 1);
}
extern void whirl_midstate(void *state, const void *input);
@@ -2363,3 +2366,54 @@ void whirlpool512_setBlock_80_sm3(void *pdata, const void *ptarget)
cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 128, 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(pTarget, ptarget, 32, 0, cudaMemcpyHostToDevice);
}
+
+// ------------------------------------------------------------------------------------------------
+
+__host__
+void x16_whirlpool512_init(int thr_id, uint32_t threads)
+{
+ cudaMemcpyToSymbol(InitVector_RC, plain_RC, sizeof(plain_RC), 0, cudaMemcpyHostToDevice);
+ cudaMemcpyToSymbol(mixTob0Tox, plain_T0, sizeof(plain_T0), 0, cudaMemcpyHostToDevice);
+#if USE_ALL_TABLES
+ cudaMemcpyToSymbol(mixTob1Tox, plain_T1, (256 * 8), 0, cudaMemcpyHostToDevice);
+ cudaMemcpyToSymbol(mixTob2Tox, plain_T2, (256 * 8), 0, cudaMemcpyHostToDevice);
+ cudaMemcpyToSymbol(mixTob3Tox, plain_T3, (256 * 8), 0, cudaMemcpyHostToDevice);
+ cudaMemcpyToSymbol(mixTob4Tox, plain_T4, (256 * 8), 0, cudaMemcpyHostToDevice);
+ cudaMemcpyToSymbol(mixTob5Tox, plain_T5, (256 * 8), 0, cudaMemcpyHostToDevice);
+ cudaMemcpyToSymbol(mixTob6Tox, plain_T6, (256 * 8), 0, cudaMemcpyHostToDevice);
+ cudaMemcpyToSymbol(mixTob7Tox, plain_T7, (256 * 8), 0, cudaMemcpyHostToDevice);
+#endif
+}
+
+extern void whirlpool_midstate(void *state, const void *input);
+
+__host__
+void x16_whirlpool512_setBlock_80(void *pdata)
+{
+ unsigned char PaddedMessage[128];
+
+ memcpy(PaddedMessage, pdata, 80);
+ memset(PaddedMessage + 80, 0, 48);
+ PaddedMessage[80] = 0x80; /* ending */
+
+#if HOST_MIDSTATE
+ // compute constant first block
+ unsigned char midstate[64] = { 0 };
+ whirlpool_midstate(midstate, pdata);
+ memcpy(PaddedMessage, midstate, 64);
+#endif
+
+ cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 128, 0, cudaMemcpyHostToDevice);
+}
+
+__host__
+void x16_whirlpool512_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_outputHash)
+{
+ dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+ dim3 block(threadsperblock);
+
+ if (threads < 256)
+ applog(LOG_WARNING, "whirlpool requires a minimum of 256 threads to fetch constant tables!");
+
+ oldwhirlpool_gpu_hash_80 <<<grid, block>>> (threads, startNonce, d_outputHash, 1);
+}
diff --git a/x16r/cuda_x16_echo512.cu b/x16r/cuda_x16_echo512.cu
new file mode 100644
index 0000000..5e6013d
--- /dev/null
+++ b/x16r/cuda_x16_echo512.cu
@@ -0,0 +1,214 @@
+/**
+ * echo512-80 cuda kernel for X16R algorithm
+ *
+ * tpruvot 2018 - GPL code
+ */
+
+#include <stdio.h>
+#include <memory.h>
+
+#include "cuda_helper.h"
+
+extern __device__ __device_builtin__ void __threadfence_block(void);
+
+#include "../x11/cuda_x11_aes.cuh"
+
+__device__ __forceinline__ void AES_2ROUND(const uint32_t* __restrict__ sharedMemory,
+ uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3,
+ uint32_t &k0)
+{
+ uint32_t y0, y1, y2, y3;
+
+ aes_round(sharedMemory,
+ x0, x1, x2, x3,
+ k0,
+ y0, y1, y2, y3);
+
+ aes_round(sharedMemory,
+ y0, y1, y2, y3,
+ x0, x1, x2, x3);
+
+ k0++;
+}
+
+__device__
+static void echo_round(uint32_t* const sharedMemory, uint32_t *W, uint32_t &k0)
+{
+ // Big Sub Words
+ #pragma unroll 16
+ for (int idx = 0; idx < 16; idx++) {
+ AES_2ROUND(sharedMemory, W[(idx << 2) + 0], W[(idx << 2) + 1], W[(idx << 2) + 2], W[(idx << 2) + 3], k0);
+ }
+
+ // Shift Rows
+ #pragma unroll 4
+ for (int i = 0; i < 4; i++)
+ {
+ uint32_t t[4];
+ /// 1, 5, 9, 13
+ t[0] = W[i + 4];
+ t[1] = W[i + 8];
+ t[2] = W[i + 24];
+ t[3] = W[i + 60];
+
+ W[i + 4] = W[i + 20];
+ W[i + 8] = W[i + 40];
+ W[i + 24] = W[i + 56];
+ W[i + 60] = W[i + 44];
+
+ W[i + 20] = W[i + 36];
+ W[i + 40] = t[1];
+ W[i + 56] = t[2];
+ W[i + 44] = W[i + 28];
+
+ W[i + 28] = W[i + 12];
+ W[i + 12] = t[3];
+ W[i + 36] = W[i + 52];
+ W[i + 52] = t[0];
+ }
+
+ // Mix Columns
+ #pragma unroll 4
+ for (int i = 0; i < 4; i++)
+ {
+ #pragma unroll 4
+ for (int idx = 0; idx < 64; idx += 16)
+ {
+ uint32_t a[4];
+ a[0] = W[idx + i];
+ a[1] = W[idx + i + 4];
+ a[2] = W[idx + i + 8];
+ a[3] = W[idx + i + 12];
+
+ uint32_t ab = a[0] ^ a[1];
+ uint32_t bc = a[1] ^ a[2];
+ uint32_t cd = a[2] ^ a[3];
+
+ uint32_t t, t2, t3;
+ t = (ab & 0x80808080);
+ t2 = (bc & 0x80808080);
+ t3 = (cd & 0x80808080);
+
+ uint32_t abx = (t >> 7) * 27U ^ ((ab^t) << 1);
+ uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
+ uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
+
+ W[idx + i] = bc ^ a[3] ^ abx;
+ W[idx + i + 4] = a[0] ^ cd ^ bcx;
+ W[idx + i + 8] = ab ^ a[3] ^ cdx;
+ W[idx + i + 12] = ab ^ a[2] ^ (abx ^ bcx ^ cdx);
+ }
+ }
+}
+
+__device__ __forceinline__
+void cuda_echo_round_80(uint32_t *const __restrict__ sharedMemory, uint32_t *const __restrict__ data, const uint32_t nonce, uint32_t *hash)
+{
+ uint32_t h[29]; // <= 127 bytes input
+
+ #pragma unroll 8
+ for (int i = 0; i < 18; i += 2)
+ AS_UINT2(&h[i]) = AS_UINT2(&data[i]);
+ h[18] = data[18];
+ h[19] = cuda_swab32(nonce);
+ h[20] = 0x80;
+ h[21] = h[22] = h[23] = h[24] = h[25] = h[26] = 0;
+ //((uint8_t*)h)[80] = 0x80;
+ //((uint8_t*)h)[128-17] = 0x02;
+ //((uint8_t*)h)[128-16] = 0x80;
+ //((uint8_t*)h)[128-15] = 0x02;
+ h[27] = 0x2000000;
+ h[28] = 0x280;
+ //h[29] = h[30] = h[31] = 0;
+
+ uint32_t k0 = 640; // bitlen
+ uint32_t W[64];
+
+ #pragma unroll 8
+ for (int i = 0; i < 32; i+=4) {
+ W[i] = 512; // L
+ W[i+1] = 0; // H
+ W[i+2] = 0; // X
+ W[i+3] = 0;
+ }
+
+ uint32_t Z[16];
+ #pragma unroll
+ for (int i = 0; i<16; i++) Z[i] = W[i];
+ #pragma unroll
+ for (int i = 32; i<61; i++) W[i] = h[i - 32];
+ #pragma unroll
+ for (int i = 61; i<64; i++) W[i] = 0;
+
+ for (int i = 0; i < 10; i++)
+ echo_round(sharedMemory, W, k0);
+
+ #pragma unroll 16
+ for (int i = 0; i < 16; i++) {
+ Z[i] ^= h[i] ^ W[i] ^ W[i + 32];
+ }
+
+ #pragma unroll 8
+ for (int i = 0; i < 16; i += 2)
+ AS_UINT2(&hash[i]) = AS_UINT2(&Z[i]);
+}
+
+__device__ __forceinline__
+void echo_gpu_init(uint32_t *const __restrict__ sharedMemory)
+{
+ /* each thread startup will fill a uint32 */
+ if (threadIdx.x < 128) {
+ sharedMemory[threadIdx.x] = d_AES0[threadIdx.x];
+ sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x];
+ sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x];
+ sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x];
+
+ sharedMemory[threadIdx.x + 64 * 2] = d_AES0[threadIdx.x + 64 * 2];
+ sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2];
+ sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2];
+ sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2];
+ }
+}
+
+__host__
+void x16_echo512_cuda_init(int thr_id, const uint32_t threads)
+{
+ aes_cpu_init(thr_id);
+}
+
+__constant__ static uint32_t c_PaddedMessage80[20];
+
+__host__
+void x16_echo512_setBlock_80(void *endiandata)
+{
+ cudaMemcpyToSymbol(c_PaddedMessage80, endiandata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
+}
+
+__global__ __launch_bounds__(128, 7) /* will force 72 registers */
+void x16_echo512_gpu_hash_80(uint32_t threads, uint32_t startNonce, uint64_t *g_hash)
+{
+ __shared__ uint32_t sharedMemory[1024];
+
+ echo_gpu_init(sharedMemory);
+ __threadfence_block();
+
+ const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ if (thread < threads)
+ {
+ uint64_t hashPosition = thread;
+ uint32_t *pHash = (uint32_t*)&g_hash[hashPosition<<3];
+
+ cuda_echo_round_80(sharedMemory, c_PaddedMessage80, startNonce + thread, pHash);
+ }
+}
+
+__host__
+void x16_echo512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash)
+{
+ const uint32_t threadsperblock = 128;
+
+ dim3 grid((threads + threadsperblock-1)/threadsperblock);
+ dim3 block(threadsperblock);
+
+ x16_echo512_gpu_hash_80<<<grid, block>>>(threads, startNonce, (uint64_t*)d_hash);
+}
diff --git a/x16r/cuda_x16_fugue512.cu b/x16r/cuda_x16_fugue512.cu
new file mode 100644
index 0000000..7f3438c
--- /dev/null
+++ b/x16r/cuda_x16_fugue512.cu
@@ -0,0 +1,467 @@
+
+#include <cuda_helper.h>
+#include <cuda_vectors.h>
+
+#define TPB 256
+
+/*
+ * fugue512-80 x16r kernel implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2018 tpruvot
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ */
+
+#ifdef __INTELLISENSE__
+#define __byte_perm(x, y, m) (x|y)
+#define tex1Dfetch(t, n) (n)
+#define __CUDACC__
+#include <cuda_runtime.h>
+#endif
+
+// store allocated textures device addresses
+static unsigned int* d_textures[MAX_GPUS][1];
+
+// Accessors into the kernel-local `mixtabs` shared array: four 256-word
+// tables stored back to back (see x16_fugue512_gpu_hash_80).
+#define mixtab0(x) mixtabs[(x)]
+#define mixtab1(x) mixtabs[(x)+256]
+#define mixtab2(x) mixtabs[(x)+512]
+#define mixtab3(x) mixtabs[(x)+768]
+
+static texture<unsigned int, 1, cudaReadModeElementType> mixTab0Tex;
+
+static const uint32_t mixtab0[] = {
+ 0x63633297, 0x7c7c6feb, 0x77775ec7, 0x7b7b7af7, 0xf2f2e8e5, 0x6b6b0ab7, 0x6f6f16a7, 0xc5c56d39,
+ 0x303090c0, 0x01010704, 0x67672e87, 0x2b2bd1ac, 0xfefeccd5, 0xd7d71371, 0xabab7c9a, 0x767659c3,
+ 0xcaca4005, 0x8282a33e, 0xc9c94909, 0x7d7d68ef, 0xfafad0c5, 0x5959947f, 0x4747ce07, 0xf0f0e6ed,
+ 0xadad6e82, 0xd4d41a7d, 0xa2a243be, 0xafaf608a, 0x9c9cf946, 0xa4a451a6, 0x727245d3, 0xc0c0762d,
+ 0xb7b728ea, 0xfdfdc5d9, 0x9393d47a, 0x2626f298, 0x363682d8, 0x3f3fbdfc, 0xf7f7f3f1, 0xcccc521d,
+ 0x34348cd0, 0xa5a556a2, 0xe5e58db9, 0xf1f1e1e9, 0x71714cdf, 0xd8d83e4d, 0x313197c4, 0x15156b54,
+ 0x04041c10, 0xc7c76331, 0x2323e98c, 0xc3c37f21, 0x18184860, 0x9696cf6e, 0x05051b14, 0x9a9aeb5e,
+ 0x0707151c, 0x12127e48, 0x8080ad36, 0xe2e298a5, 0xebeba781, 0x2727f59c, 0xb2b233fe, 0x757550cf,
+ 0x09093f24, 0x8383a43a, 0x2c2cc4b0, 0x1a1a4668, 0x1b1b416c, 0x6e6e11a3, 0x5a5a9d73, 0xa0a04db6,
+ 0x5252a553, 0x3b3ba1ec, 0xd6d61475, 0xb3b334fa, 0x2929dfa4, 0xe3e39fa1, 0x2f2fcdbc, 0x8484b126,
+ 0x5353a257, 0xd1d10169, 0x00000000, 0xededb599, 0x2020e080, 0xfcfcc2dd, 0xb1b13af2, 0x5b5b9a77,
+ 0x6a6a0db3, 0xcbcb4701, 0xbebe17ce, 0x3939afe4, 0x4a4aed33, 0x4c4cff2b, 0x5858937b, 0xcfcf5b11,
+ 0xd0d0066d, 0xefefbb91, 0xaaaa7b9e, 0xfbfbd7c1, 0x4343d217, 0x4d4df82f, 0x333399cc, 0x8585b622,
+ 0x4545c00f, 0xf9f9d9c9, 0x02020e08, 0x7f7f66e7, 0x5050ab5b, 0x3c3cb4f0, 0x9f9ff04a, 0xa8a87596,
+ 0x5151ac5f, 0xa3a344ba, 0x4040db1b, 0x8f8f800a, 0x9292d37e, 0x9d9dfe42, 0x3838a8e0, 0xf5f5fdf9,
+ 0xbcbc19c6, 0xb6b62fee, 0xdada3045, 0x2121e784, 0x10107040, 0xffffcbd1, 0xf3f3efe1, 0xd2d20865,
+ 0xcdcd5519, 0x0c0c2430, 0x1313794c, 0xececb29d, 0x5f5f8667, 0x9797c86a, 0x4444c70b, 0x1717655c,
+ 0xc4c46a3d, 0xa7a758aa, 0x7e7e61e3, 0x3d3db3f4, 0x6464278b, 0x5d5d886f, 0x19194f64, 0x737342d7,
+ 0x60603b9b, 0x8181aa32, 0x4f4ff627, 0xdcdc225d, 0x2222ee88, 0x2a2ad6a8, 0x9090dd76, 0x88889516,
+ 0x4646c903, 0xeeeebc95, 0xb8b805d6, 0x14146c50, 0xdede2c55, 0x5e5e8163, 0x0b0b312c, 0xdbdb3741,
+ 0xe0e096ad, 0x32329ec8, 0x3a3aa6e8, 0x0a0a3628, 0x4949e43f, 0x06061218, 0x2424fc90, 0x5c5c8f6b,
+ 0xc2c27825, 0xd3d30f61, 0xacac6986, 0x62623593, 0x9191da72, 0x9595c662, 0xe4e48abd, 0x797974ff,
+ 0xe7e783b1, 0xc8c84e0d, 0x373785dc, 0x6d6d18af, 0x8d8d8e02, 0xd5d51d79, 0x4e4ef123, 0xa9a97292,
+ 0x6c6c1fab, 0x5656b943, 0xf4f4fafd, 0xeaeaa085, 0x6565208f, 0x7a7a7df3, 0xaeae678e, 0x08083820,
+ 0xbaba0bde, 0x787873fb, 0x2525fb94, 0x2e2ecab8, 0x1c1c5470, 0xa6a65fae, 0xb4b421e6, 0xc6c66435,
+ 0xe8e8ae8d, 0xdddd2559, 0x747457cb, 0x1f1f5d7c, 0x4b4bea37, 0xbdbd1ec2, 0x8b8b9c1a, 0x8a8a9b1e,
+ 0x70704bdb, 0x3e3ebaf8, 0xb5b526e2, 0x66662983, 0x4848e33b, 0x0303090c, 0xf6f6f4f5, 0x0e0e2a38,
+ 0x61613c9f, 0x35358bd4, 0x5757be47, 0xb9b902d2, 0x8686bf2e, 0xc1c17129, 0x1d1d5374, 0x9e9ef74e,
+ 0xe1e191a9, 0xf8f8decd, 0x9898e556, 0x11117744, 0x696904bf, 0xd9d93949, 0x8e8e870e, 0x9494c166,
+ 0x9b9bec5a, 0x1e1e5a78, 0x8787b82a, 0xe9e9a989, 0xcece5c15, 0x5555b04f, 0x2828d8a0, 0xdfdf2b51,
+ 0x8c8c8906, 0xa1a14ab2, 0x89899212, 0x0d0d2334, 0xbfbf10ca, 0xe6e684b5, 0x4242d513, 0x686803bb,
+ 0x4141dc1f, 0x9999e252, 0x2d2dc3b4, 0x0f0f2d3c, 0xb0b03df6, 0x5454b74b, 0xbbbb0cda, 0x16166258
+};
+
+// Fugue TIX step for the 36-word state: absorb one 32-bit message word (q)
+// into x00 and xor-mix the fixed set of state words passed by the caller.
+#define TIX4(q, x00, x01, x04, x07, x08, x22, x24, x27, x30) { \
+	x22 ^= x00; \
+	x00 = (q); \
+	x08 ^= x00; \
+	x01 ^= x24; \
+	x04 ^= x27; \
+	x07 ^= x30; \
+}
+
+// Fugue column-mix: xor three words (x04..x06) into the state head
+// (x00..x02) and into the ring words (x18..x20) chosen by the caller.
+#define CMIX36(x00, x01, x02, x04, x05, x06, x18, x19, x20) { \
+	x00 ^= x04; \
+	x01 ^= x05; \
+	x02 ^= x06; \
+	x18 ^= x04; \
+	x19 ^= x05; \
+	x20 ^= x06; \
+}
+
+#define SMIX(x0, x1, x2, x3) { \
+ uint32_t tmp; \
+ uint32_t r0 = 0; \
+ uint32_t r1 = 0; \
+ uint32_t r2 = 0; \
+ uint32_t r3 = 0; \
+ uint32_t c0 = mixtab0(x0 >> 24); \
+ tmp = mixtab1((x0 >> 16) & 0xFF); \
+ c0 ^= tmp; \
+ r1 ^= tmp; \
+ tmp = mixtab2((x0 >> 8) & 0xFF); \
+ c0 ^= tmp; \
+ r2 ^= tmp; \
+ tmp = mixtab3(x0 & 0xFF); \
+ c0 ^= tmp; \
+ r3 ^= tmp; \
+ tmp = mixtab0(x1 >> 24); \
+ uint32_t c1 = tmp; \
+ r0 ^= tmp; \
+ tmp = mixtab1((x1 >> 16) & 0xFF); \
+ c1 ^= tmp; \
+ tmp = mixtab2((x1 >> 8) & 0xFF); \
+ c1 ^= tmp; \
+ r2 ^= tmp; \
+ tmp = mixtab3(x1 & 0xFF); \
+ c1 ^= tmp; \
+ r3 ^= tmp; \
+ tmp = mixtab0(x2 >> 24); \
+ uint32_t c2 = tmp; \
+ r0 ^= tmp; \
+ tmp = mixtab1((x2 >> 16) & 0xFF); \
+ c2 ^= tmp; \
+ r1 ^= tmp; \
+ tmp = mixtab2((x2 >> 8) & 0xFF); \
+ c2 ^= tmp; \
+ tmp = mixtab3(x2 & 0xFF); \
+ c2 ^= tmp; \
+ r3 ^= tmp; \
+ tmp = mixtab0(x3 >> 24); \
+ uint32_t c3 = tmp; \
+ r0 ^= tmp; \
+ tmp = mixtab1((x3 >> 16) & 0xFF); \
+ c3 ^= tmp; \
+ r1 ^= tmp; \
+ tmp = mixtab2((x3 >> 8) & 0xFF); \
+ c3 ^= tmp; \
+ r2 ^= tmp; \
+ tmp = mixtab3(x3 & 0xFF); \
+ c3 ^= tmp; \
+ x0 = ((c0 ^ r0) & 0xFF000000) | ((c1 ^ r1) & 0x00FF0000) \
+ | ((c2 ^ r2) & 0x0000FF00) | ((c3 ^ r3) & 0x000000FF); \
+ x1 = ((c1 ^ (r0 << 8)) & 0xFF000000) | ((c2 ^ (r1 << 8)) & 0x00FF0000) \
+ | ((c3 ^ (r2 << 8)) & 0x0000FF00) | ((c0 ^ (r3 >> 24)) & 0x000000FF); \
+ x2 = ((c2 ^ (r0 << 16)) & 0xFF000000) | ((c3 ^ (r1 << 16)) & 0x00FF0000) \
+ | ((c0 ^ (r2 >> 16)) & 0x0000FF00) | ((c1 ^ (r3 >> 16)) & 0x000000FF); \
+ x3 = ((c3 ^ (r0 << 24)) & 0xFF000000) | ((c0 ^ (r1 >> 8)) & 0x00FF0000) \
+ | ((c1 ^ (r2 >> 8)) & 0x0000FF00) | ((c2 ^ (r3 >> 8)) & 0x000000FF); \
+}
+
+#define SUB_ROR3 { \
+ B33 = S33, B34 = S34, B35 = S35; \
+ S35 = S32; S34 = S31; S33 = S30; S32 = S29; S31 = S28; S30 = S27; S29 = S26; S28 = S25; S27 = S24; \
+ S26 = S23; S25 = S22; S24 = S21; S23 = S20; S22 = S19; S21 = S18; S20 = S17; S19 = S16; S18 = S15; \
+ S17 = S14; S16 = S13; S15 = S12; S14 = S11; S13 = S10; S12 = S09; S11 = S08; S10 = S07; S09 = S06; \
+ S08 = S05; S07 = S04; S06 = S03; S05 = S02; S04 = S01; S03 = S00; S02 = B35; S01 = B34; S00 = B33; \
+}
+
+#define SUB_ROR8 { \
+ B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \
+ S35 = S27; S34 = S26; S33 = S25; S32 = S24; S31 = S23; S30 = S22; S29 = S21; S28 = S20; S27 = S19; \
+ S26 = S18; S25 = S17; S24 = S16; S23 = S15; S22 = S14; S21 = S13; S20 = S12; S19 = S11; S18 = S10; \
+ S17 = S09; S16 = S08; S15 = S07; S14 = S06; S13 = S05; S12 = S04; S11 = S03; S10 = S02; S09 = S01; \
+ S08 = S00; S07 = B35; S06 = B34; S05 = B33; S04 = B32; S03 = B31; S02 = B30; S01 = B29; S00 = B28; \
+}
+
+#define SUB_ROR9 { \
+ B27 = S27, B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \
+ S35 = S26; S34 = S25; S33 = S24; S32 = S23; S31 = S22; S30 = S21; S29 = S20; S28 = S19; S27 = S18; \
+ S26 = S17; S25 = S16; S24 = S15; S23 = S14; S22 = S13; S21 = S12; S20 = S11; S19 = S10; S18 = S09; \
+ S17 = S08; S16 = S07; S15 = S06; S14 = S05; S13 = S04; S12 = S03; S11 = S02; S10 = S01; S09 = S00; \
+ S08 = B35; S07 = B34; S06 = B33; S05 = B32; S04 = B31; S03 = B30; S02 = B29; S01 = B28; S00 = B27; \
+}
+
+#define SUB_ROR9_3 { \
+ SUB_ROR3; SUB_ROR3; SUB_ROR3; \
+}
+
+#define SUB_ROR12 { /* to fix */ \
+ B24 = S00; B25 = S01; B26 = S02; B27 = S03; B28 = S04; B29 = S05; B30 = S06; B31 = S07; B32 = S08; B33 = S09; B34 = S10; B35 = S11; \
+ S00 = S12; S01 = S13; S02 = S14; S03 = S15; S04 = S16; S05 = S17; S06 = S18; S07 = S19; S08 = S20; S09 = S21; S10 = S22; S11 = S23; \
+ S12 = S24; S13 = S25; S14 = S26; S15 = S27; S16 = S28; S17 = S29; S18 = S30; S19 = S31; S20 = S32; S21 = S33; S22 = S34; S23 = S35; \
+ S24 = B24; S25 = B25; S26 = B26; S27 = B27; S28 = B28; S29 = B29; S30 = B30; S31 = B31; S32 = B32; S33 = B33; S34 = B34; S35 = B35; \
+}
+
+#define FUGUE512_3(x, y, z) { \
+ TIX4(x, S00, S01, S04, S07, S08, S22, S24, S27, S30); \
+ CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \
+ SMIX(S33, S34, S35, S00); \
+ CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \
+ SMIX(S30, S31, S32, S33); \
+ CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \
+ SMIX(S27, S28, S29, S30); \
+ CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \
+ SMIX(S24, S25, S26, S27); \
+ \
+ TIX4(y, S24, S25, S28, S31, S32, S10, S12, S15, S18); \
+ CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); \
+ SMIX(S21, S22, S23, S24); \
+ CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); \
+ SMIX(S18, S19, S20, S21); \
+ CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); \
+ SMIX(S15, S16, S17, S18); \
+ CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); \
+ SMIX(S12, S13, S14, S15); \
+ \
+ TIX4(z, S12, S13, S16, S19, S20, S34, S00, S03, S06); \
+ CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); \
+ SMIX(S09, S10, S11, S12); \
+ CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); \
+ SMIX(S06, S07, S08, S09); \
+ CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); \
+ SMIX(S03, S04, S05, S06); \
+ CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); \
+ SMIX(S00, S01, S02, S03); \
+}
+
+#define FUGUE512_F(w, x, y, z) { \
+ TIX4(w, S00, S01, S04, S07, S08, S22, S24, S27, S30); \
+ CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \
+ SMIX(S33, S34, S35, S00); \
+ CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \
+ SMIX(S30, S31, S32, S33); \
+ CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \
+ SMIX(S27, S28, S29, S30); \
+ CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \
+ SMIX(S24, S25, S26, S27); \
+ \
+ TIX4(x, S24, S25, S28, S31, S32, S10, S12, S15, S18); \
+ CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); \
+ SMIX(S21, S22, S23, S24); \
+ CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); \
+ SMIX(S18, S19, S20, S21); \
+ CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); \
+ SMIX(S15, S16, S17, S18); \
+ CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); \
+ SMIX(S12, S13, S14, S15); \
+ \
+ TIX4(y, S12, S13, S16, S19, S20, S34, S00, S03, S06); \
+ CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); \
+ SMIX(S09, S10, S11, S12); \
+ CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); \
+ SMIX(S06, S07, S08, S09); \
+ CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); \
+ SMIX(S03, S04, S05, S06); \
+ CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); \
+ SMIX(S00, S01, S02, S03); \
+ \
+ TIX4(z, S00, S01, S04, S07, S08, S22, S24, S27, S30); \
+ CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \
+ SMIX(S33, S34, S35, S00); \
+ CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \
+ SMIX(S30, S31, S32, S33); \
+ CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \
+ SMIX(S27, S28, S29, S30); \
+ CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \
+ SMIX(S24, S25, S26, S27); \
+}
+
+// Byte rotations used to derive mixtab1..3 from mixtab0.
+// On device, __byte_perm picks bytes directly (single instruction);
+// on host, fall back to the generic 32-bit rotate macros.
+#undef ROL8
+#ifdef __CUDA_ARCH__
+__device__ __forceinline__
+uint32_t ROL8(const uint32_t a) {
+	return __byte_perm(a, 0, 0x2103);	// rotate left 8
+}
+__device__ __forceinline__
+uint32_t ROR8(const uint32_t a) {
+	return __byte_perm(a, 0, 0x0321);	// rotate right 8
+}
+__device__ __forceinline__
+uint32_t ROL16(const uint32_t a) {
+	return __byte_perm(a, 0, 0x1032);	// rotate 16
+}
+#else
+#define ROL8(u) ROTL32(u, 8)
+#define ROR8(u) ROTR32(u, 8)
+#define ROL16(u) ROTL32(u,16)
+#endif
+
+//#define AS_UINT4(addr) *((uint4*)(addr))
+
+// 80-byte block header stored as 10 uint64 in constant memory; the kernel
+// replaces the last 32-bit word (word 19) with the per-thread nonce.
+__constant__ static uint64_t c_PaddedMessage80[10];
+
+// Upload the 80-byte header to constant memory.
+__host__
+void x16_fugue512_setBlock_80(void *pdata)
+{
+	cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
+}
+
+/***************************************************/
+
+// fugue512 over the 80-byte header: one 64-byte digest per thread written
+// into g_hash. Requires a blockDim.x of 128 or 256 so the shared mix tables
+// are fully populated (see below).
+__global__
+__launch_bounds__(TPB)
+void x16_fugue512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash)
+{
+	// Block-local copy of the four mix tables (4 x 256 words). Only table 0
+	// lives in the texture; tables 1..3 are its byte rotations.
+	__shared__ uint32_t mixtabs[1024];
+
+	// load shared mem (with 256 threads)
+	const uint32_t thr = threadIdx.x & 0xFF;
+	const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr);
+	mixtabs[thr] = tmp;
+	mixtabs[thr+256] = ROR8(tmp);
+	mixtabs[thr+512] = ROL16(tmp);
+	mixtabs[thr+768] = ROL8(tmp);
+#if TPB <= 256
+	// With a 128-thread block, each thread also loads the entry 0x80 slots
+	// away so all 256 table entries are still written.
+	// NOTE(review): block sizes other than 128/256 would leave gaps.
+	if (blockDim.x < 256) {
+		const uint32_t thr = (threadIdx.x + 0x80) & 0xFF;
+		const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr);
+		mixtabs[thr] = tmp;
+		mixtabs[thr + 256] = ROR8(tmp);
+		mixtabs[thr + 512] = ROL16(tmp);
+		mixtabs[thr + 768] = ROL8(tmp);
+	}
+#endif
+
+	__syncthreads();
+
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint32_t Data[20];	// 80-byte input: 19 header words + nonce
+
+		#pragma unroll
+		for(int i = 0; i < 10; i++)
+			AS_UINT2(&Data[i * 2]) = AS_UINT2(&c_PaddedMessage80[i]);
+		Data[19] = (startNonce + thread);	// per-thread nonce in the last word
+
+		// 36-word fugue state plus temporaries for the SUB_RORx macros
+		uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09, S10, S11;
+		uint32_t S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22, S23;
+		uint32_t S24, S25, S26, S27, S28, S29, S30, S31, S32, S33, S34, S35;
+		//uint32_t B24, B25, B26,
+		uint32_t B27, B28, B29, B30, B31, B32, B33, B34, B35;
+		//const uint64_t bc = 640 bits to hash
+		//const uint32_t bclo = (uint32_t)(bc);
+		//const uint32_t bchi = (uint32_t)(bc >> 32);
+
+		// initial state: first 20 words zero, last 16 words set to the
+		// fugue-512 initial values
+		S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = 0;
+		S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0;
+		S20 = 0x8807a57e; S21 = 0xe616af75; S22 = 0xc5d3e4db; S23 = 0xac9ab027;
+		S24 = 0xd915f117; S25 = 0xb6eecc54; S26 = 0x06e8020b; S27 = 0x4a92efd1;
+		S28 = 0xaac6e2c9; S29 = 0xddb21398; S30 = 0xcae65838; S31 = 0x437f203f;
+		S32 = 0x25ea78e7; S33 = 0x951fddd6; S34 = 0xda6ed11d; S35 = 0xe13e3567;
+
+		// absorb 3 message words per call; the last call also absorbs the
+		// length block (bit count 80*8, high word 0)
+		FUGUE512_3((Data[ 0]), (Data[ 1]), (Data[ 2]));
+		FUGUE512_3((Data[ 3]), (Data[ 4]), (Data[ 5]));
+		FUGUE512_3((Data[ 6]), (Data[ 7]), (Data[ 8]));
+		FUGUE512_3((Data[ 9]), (Data[10]), (Data[11]));
+		FUGUE512_3((Data[12]), (Data[13]), (Data[14]));
+		FUGUE512_3((Data[15]), (Data[16]), (Data[17]));
+		FUGUE512_F((Data[18]), (Data[19]), 0/*bchi*/, (80*8)/*bclo*/);
+
+		// rotate right state by 3 dwords (S00 = S33, S03 = S00)
+		SUB_ROR3;
+		SUB_ROR9;
+
+		// closing rounds
+		#pragma unroll 32
+		for (int i = 0; i < 32; i++) {
+			SUB_ROR3;
+			CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20);
+			SMIX(S00, S01, S02, S03);
+		}
+		#pragma unroll 13
+		for (int i = 0; i < 13; i++) {
+			S04 ^= S00;
+			S09 ^= S00;
+			S18 ^= S00;
+			S27 ^= S00;
+			SUB_ROR9;
+			SMIX(S00, S01, S02, S03);
+			S04 ^= S00;
+			S10 ^= S00;
+			S18 ^= S00;
+			S27 ^= S00;
+			SUB_ROR9;
+			SMIX(S00, S01, S02, S03);
+			S04 ^= S00;
+			S10 ^= S00;
+			S19 ^= S00;
+			S27 ^= S00;
+			SUB_ROR9;
+			SMIX(S00, S01, S02, S03);
+			S04 ^= S00;
+			S10 ^= S00;
+			S19 ^= S00;
+			S28 ^= S00;
+			SUB_ROR8;
+			SMIX(S00, S01, S02, S03);
+		}
+		S04 ^= S00;
+		S09 ^= S00;
+		S18 ^= S00;
+		S27 ^= S00;
+
+		// byte-swapped output of the 16 surviving state words
+		Data[ 0] = cuda_swab32(S01);
+		Data[ 1] = cuda_swab32(S02);
+		Data[ 2] = cuda_swab32(S03);
+		Data[ 3] = cuda_swab32(S04);
+		Data[ 4] = cuda_swab32(S09);
+		Data[ 5] = cuda_swab32(S10);
+		Data[ 6] = cuda_swab32(S11);
+		Data[ 7] = cuda_swab32(S12);
+		Data[ 8] = cuda_swab32(S18);
+		Data[ 9] = cuda_swab32(S19);
+		Data[10] = cuda_swab32(S20);
+		Data[11] = cuda_swab32(S21);
+		Data[12] = cuda_swab32(S27);
+		Data[13] = cuda_swab32(S28);
+		Data[14] = cuda_swab32(S29);
+		Data[15] = cuda_swab32(S30);
+
+		const size_t hashPosition = thread;
+		uint64_t* pHash = &g_hash[hashPosition << 3];	// 64 bytes per hash
+		#pragma unroll 4
+		for(int i = 0; i < 4; i++)
+			AS_UINT4(&pHash[i * 2]) = AS_UINT4(&Data[i * 4]);
+	}
+}
+
+// Allocate a device buffer, upload `texsource` (texsize bytes), remember the
+// allocation in d_textures[thr_id][id] for later cudaFree, and bind it to
+// the 1D texture `texname` (point sampling, clamped addressing).
+// Fix: cudaCreateChannelDesc requires its element-type template argument,
+// which was lost in this patch (stripped angle brackets) -- restored here.
+#define texDef(id, texname, texmem, texsource, texsize) { \
+	unsigned int *texmem; \
+	cudaMalloc(&texmem, texsize); \
+	d_textures[thr_id][id] = texmem; \
+	cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
+	texname.normalized = 0; \
+	texname.filterMode = cudaFilterModePoint; \
+	texname.addressMode[0] = cudaAddressModeClamp; \
+	{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<uint32_t>(); \
+	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \
+	} \
+}
+
+// One-time per-GPU init: upload the 256-entry mixtab0 table and bind it to
+// mixTab0Tex (the other three tables are derived in-kernel by rotation).
+// `threads` is currently unused.
+__host__
+void x16_fugue512_cpu_init(int thr_id, uint32_t threads)
+{
+	texDef(0, mixTab0Tex, mixTab0m, mixtab0, sizeof(uint32_t)*256);
+}
+
+// Release the texture buffer allocated by x16_fugue512_cpu_init.
+__host__
+void x16_fugue512_cpu_free(int thr_id)
+{
+	cudaFree(d_textures[thr_id][0]);
+}
+
+// Host launcher for fugue512 on the 80-byte header; one 64-byte digest per
+// nonce is written into d_hash.
+// Fix: the '<<<grid, block>>>' execution configuration was garbled to '<<>>'
+// in this patch (angle brackets stripped) -- restored here.
+__host__
+void x16_fugue512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash)
+{
+	const uint32_t threadsperblock = TPB;
+
+	// ceil-div so a partial final block covers the tail nonces
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+
+	x16_fugue512_gpu_hash_80 <<<grid, block>>> (threads, startNonce, (uint64_t*)d_hash);
+}
diff --git a/x16r/cuda_x16_shabal512.cu b/x16r/cuda_x16_shabal512.cu
new file mode 100644
index 0000000..ca00b50
--- /dev/null
+++ b/x16r/cuda_x16_shabal512.cu
@@ -0,0 +1,350 @@
+/*
+* Shabal-512 for X16R
+* tpruvot 2018, based on alexis x14 and xevan kernelx code
+*/
+
+#include <cuda_helper.h>
+#include <cuda_vectors.h>
+#include <cuda_vector_uint2x4.h>
+
+typedef uint32_t sph_u32;
+
+#define C32(x) (x)
+#define T32(x) (x)
+
+#define INPUT_BLOCK_ADD do { \
+ B0 = T32(B0 + M0); \
+ B1 = T32(B1 + M1); \
+ B2 = T32(B2 + M2); \
+ B3 = T32(B3 + M3); \
+ B4 = T32(B4 + M4); \
+ B5 = T32(B5 + M5); \
+ B6 = T32(B6 + M6); \
+ B7 = T32(B7 + M7); \
+ B8 = T32(B8 + M8); \
+ B9 = T32(B9 + M9); \
+ BA = T32(BA + MA); \
+ BB = T32(BB + MB); \
+ BC = T32(BC + MC); \
+ BD = T32(BD + MD); \
+ BE = T32(BE + ME); \
+ BF = T32(BF + MF); \
+ } while (0)
+
+#define INPUT_BLOCK_SUB do { \
+ C0 = T32(C0 - M0); \
+ C1 = T32(C1 - M1); \
+ C2 = T32(C2 - M2); \
+ C3 = T32(C3 - M3); \
+ C4 = T32(C4 - M4); \
+ C5 = T32(C5 - M5); \
+ C6 = T32(C6 - M6); \
+ C7 = T32(C7 - M7); \
+ C8 = T32(C8 - M8); \
+ C9 = T32(C9 - M9); \
+ CA = T32(CA - MA); \
+ CB = T32(CB - MB); \
+ CC = T32(CC - MC); \
+ CD = T32(CD - MD); \
+ CE = T32(CE - ME); \
+ CF = T32(CF - MF); \
+ } while (0)
+
+#define XOR_W do { \
+ A00 ^= Wlow; \
+ A01 ^= Whigh; \
+ } while (0)
+
+#define SWAP(v1, v2) do { \
+ sph_u32 tmp = (v1); \
+ (v1) = (v2); \
+ (v2) = tmp; \
+ } while (0)
+
+#define SWAP_BC do { \
+ SWAP(B0, C0); \
+ SWAP(B1, C1); \
+ SWAP(B2, C2); \
+ SWAP(B3, C3); \
+ SWAP(B4, C4); \
+ SWAP(B5, C5); \
+ SWAP(B6, C6); \
+ SWAP(B7, C7); \
+ SWAP(B8, C8); \
+ SWAP(B9, C9); \
+ SWAP(BA, CA); \
+ SWAP(BB, CB); \
+ SWAP(BC, CC); \
+ SWAP(BD, CD); \
+ SWAP(BE, CE); \
+ SWAP(BF, CF); \
+ } while (0)
+
+// Shabal permutation element: update one A word from its neighbour, a C
+// word, three B words and a message word; then rotate-and-complement the
+// corresponding B word against the new A value.
+#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) do { \
+	xa0 = T32((xa0 \
+		^ (((xa1 << 15) | (xa1 >> 17)) * 5U) \
+		^ xc) * 3U) \
+		^ xb1 ^ (xb2 & ~xb3) ^ xm; \
+	xb0 = T32(~(((xb0 << 1) | (xb0 >> 31)) ^ xa0)); \
+	} while (0)
+
+#define PERM_STEP_0 do { \
+ PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \
+ PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \
+ PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \
+ PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \
+ PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \
+ PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \
+ PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \
+ PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \
+ PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \
+ PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \
+ PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \
+ PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \
+ PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \
+ PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \
+ PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \
+ PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \
+ } while (0)
+
+#define PERM_STEP_1 do { \
+ PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \
+ PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \
+ PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \
+ PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \
+ PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \
+ PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \
+ PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \
+ PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \
+ PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \
+ PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \
+ PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \
+ PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \
+ PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \
+ PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \
+ PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \
+ PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \
+ } while (0)
+
+#define PERM_STEP_2 do { \
+ PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \
+ PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \
+ PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \
+ PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \
+ PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \
+ PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \
+ PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \
+ PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \
+ PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \
+ PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \
+ PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \
+ PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \
+ PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \
+ PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \
+ PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \
+ PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \
+ } while (0)
+
+#define APPLY_P do { \
+ B0 = T32(B0 << 17) | (B0 >> 15); \
+ B1 = T32(B1 << 17) | (B1 >> 15); \
+ B2 = T32(B2 << 17) | (B2 >> 15); \
+ B3 = T32(B3 << 17) | (B3 >> 15); \
+ B4 = T32(B4 << 17) | (B4 >> 15); \
+ B5 = T32(B5 << 17) | (B5 >> 15); \
+ B6 = T32(B6 << 17) | (B6 >> 15); \
+ B7 = T32(B7 << 17) | (B7 >> 15); \
+ B8 = T32(B8 << 17) | (B8 >> 15); \
+ B9 = T32(B9 << 17) | (B9 >> 15); \
+ BA = T32(BA << 17) | (BA >> 15); \
+ BB = T32(BB << 17) | (BB >> 15); \
+ BC = T32(BC << 17) | (BC >> 15); \
+ BD = T32(BD << 17) | (BD >> 15); \
+ BE = T32(BE << 17) | (BE >> 15); \
+ BF = T32(BF << 17) | (BF >> 15); \
+ PERM_STEP_0; \
+ PERM_STEP_1; \
+ PERM_STEP_2; \
+ A0B = T32(A0B + C6); \
+ A0A = T32(A0A + C5); \
+ A09 = T32(A09 + C4); \
+ A08 = T32(A08 + C3); \
+ A07 = T32(A07 + C2); \
+ A06 = T32(A06 + C1); \
+ A05 = T32(A05 + C0); \
+ A04 = T32(A04 + CF); \
+ A03 = T32(A03 + CE); \
+ A02 = T32(A02 + CD); \
+ A01 = T32(A01 + CC); \
+ A00 = T32(A00 + CB); \
+ A0B = T32(A0B + CA); \
+ A0A = T32(A0A + C9); \
+ A09 = T32(A09 + C8); \
+ A08 = T32(A08 + C7); \
+ A07 = T32(A07 + C6); \
+ A06 = T32(A06 + C5); \
+ A05 = T32(A05 + C4); \
+ A04 = T32(A04 + C3); \
+ A03 = T32(A03 + C2); \
+ A02 = T32(A02 + C1); \
+ A01 = T32(A01 + C0); \
+ A00 = T32(A00 + CF); \
+ A0B = T32(A0B + CE); \
+ A0A = T32(A0A + CD); \
+ A09 = T32(A09 + CC); \
+ A08 = T32(A08 + CB); \
+ A07 = T32(A07 + CA); \
+ A06 = T32(A06 + C9); \
+ A05 = T32(A05 + C8); \
+ A04 = T32(A04 + C7); \
+ A03 = T32(A03 + C6); \
+ A02 = T32(A02 + C5); \
+ A01 = T32(A01 + C4); \
+ A00 = T32(A00 + C3); \
+ } while (0)
+
+#define INCR_W do { \
+ if ((Wlow = T32(Wlow + 1)) == 0) \
+ Whigh = T32(Whigh + 1); \
+ } while (0)
+
+__constant__ static const sph_u32 A_init_512[] = {
+ C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632),
+ C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B),
+ C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F)
+};
+
+__constant__ static const sph_u32 B_init_512[] = {
+ C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640),
+ C32(0xEB6F56C7), C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08),
+ C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E),
+ C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B)
+};
+
+__constant__ static const sph_u32 C_init_512[] = {
+ C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359),
+ C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780),
+ C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A),
+ C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969)
+};
+
+// 80-byte block header in constant memory. Words 0..15 form the first
+// 64-byte message block; words 16..18 start the final block and word 19 is
+// replaced by the per-thread nonce inside the kernel.
+__constant__ static uint32_t c_PaddedMessage80[20];
+
+// Upload the 80-byte header to constant memory.
+__host__
+void x16_shabal512_setBlock_80(void *pdata)
+{
+	cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
+}
+
+#define TPB_SHABAL 256
+
+// shabal512 over the 80-byte header: one 64-byte digest per thread written
+// into g_hash (16 uint32 per hash).
+__global__ __launch_bounds__(TPB_SHABAL, 2)
+void x16_shabal512_gpu_hash_80(uint32_t threads, const uint32_t startNonce, uint32_t *g_hash)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+
+	// NOTE(review): this initializer duplicates B_init_512 but B[] is fully
+	// overwritten before its first read -- the initial values are dead.
+	uint32_t B[] = {
+		0xC1099CB7, 0x07B385F3, 0xE7442C26, 0xCC8AD640, 0xEB6F56C7, 0x1EA81AA9, 0x73B9D314, 0x1DE85D08,
+		0x48910A5A, 0x893B22DB, 0xC5A0DF44, 0xBBC4324E, 0x72D2F240, 0x75941D99, 0x6D8BDE82, 0xA1A7502B
+	};
+	uint32_t M[16];
+
+	if (thread < threads)
+	{
+		// todo: try __ldc
+		// first 64 bytes of the header
+		*(uint2x4*)&M[0] = *(uint2x4*)&c_PaddedMessage80[0];
+		*(uint2x4*)&M[8] = *(uint2x4*)&c_PaddedMessage80[8];
+
+		// shabal-512 initial state (A, B, C)
+		sph_u32 A00 = A_init_512[0], A01 = A_init_512[1], A02 = A_init_512[ 2], A03 = A_init_512[ 3];
+		sph_u32 A04 = A_init_512[4], A05 = A_init_512[5], A06 = A_init_512[ 6], A07 = A_init_512[ 7];
+		sph_u32 A08 = A_init_512[8], A09 = A_init_512[9], A0A = A_init_512[10], A0B = A_init_512[11];
+
+		sph_u32 B0 = B_init_512[ 0], B1 = B_init_512[ 1], B2 = B_init_512[ 2], B3 = B_init_512 [3];
+		sph_u32 B4 = B_init_512[ 4], B5 = B_init_512[ 5], B6 = B_init_512[ 6], B7 = B_init_512[ 7];
+		sph_u32 B8 = B_init_512[ 8], B9 = B_init_512[ 9], BA = B_init_512[10], BB = B_init_512[11];
+		sph_u32 BC = B_init_512[12], BD = B_init_512[13], BE = B_init_512[14], BF = B_init_512[15];
+
+		sph_u32 C0 = C_init_512[ 0], C1 = C_init_512[ 1], C2 = C_init_512[ 2], C3 = C_init_512[ 3];
+		sph_u32 C4 = C_init_512[ 4], C5 = C_init_512[ 5], C6 = C_init_512[ 6], C7 = C_init_512[ 7];
+		sph_u32 C8 = C_init_512[ 8], C9 = C_init_512[ 9], CA = C_init_512[10], CB = C_init_512[11];
+		sph_u32 CC = C_init_512[12], CD = C_init_512[13], CE = C_init_512[14], CF = C_init_512[15];
+
+		sph_u32 M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
+		sph_u32 Wlow = 1, Whigh = 0;	// 64-bit block counter W, starts at 1
+
+		M0 = M[ 0];
+		M1 = M[ 1];
+		M2 = M[ 2];
+		M3 = M[ 3];
+		M4 = M[ 4];
+		M5 = M[ 5];
+		M6 = M[ 6];
+		M7 = M[ 7];
+		M8 = M[ 8];
+		M9 = M[ 9];
+		MA = M[10];
+		MB = M[11];
+		MC = M[12];
+		MD = M[13];
+		ME = M[14];
+		MF = M[15];
+
+		// process the first full 64-byte block
+		INPUT_BLOCK_ADD;
+		XOR_W;
+		APPLY_P;
+		INPUT_BLOCK_SUB;
+		SWAP_BC;
+		INCR_W;
+
+		// final block: 3 header words + byte-swapped nonce, then 0x80 pad
+		M0 = c_PaddedMessage80[16];
+		M1 = c_PaddedMessage80[17];
+		M2 = c_PaddedMessage80[18];
+		M3 = cuda_swab32(startNonce + thread);
+		M4 = 0x80;
+		M5 = M6 = M7 = M8 = M9 = MA = MB = MC = MD = ME = MF = 0;
+
+		INPUT_BLOCK_ADD;
+		XOR_W;
+		APPLY_P;
+
+		// three extra permutation rounds on the last block (finalization)
+		for (unsigned i = 0; i < 3; i++) {
+			SWAP_BC;
+			XOR_W;
+			APPLY_P;
+		}
+
+		// digest is the final B state
+		B[ 0] = B0;
+		B[ 1] = B1;
+		B[ 2] = B2;
+		B[ 3] = B3;
+		B[ 4] = B4;
+		B[ 5] = B5;
+		B[ 6] = B6;
+		B[ 7] = B7;
+		B[ 8] = B8;
+		B[ 9] = B9;
+		B[10] = BA;
+		B[11] = BB;
+		B[12] = BC;
+		B[13] = BD;
+		B[14] = BE;
+		B[15] = BF;
+
+		// output
+		uint64_t hashPosition = thread;
+		uint32_t *Hash = &g_hash[hashPosition << 4];	// 16 words per hash
+		*(uint2x4*)&Hash[0] = *(uint2x4*)&B[0];
+		*(uint2x4*)&Hash[8] = *(uint2x4*)&B[8];
+	}
+}
+
+// Host launcher for shabal512 on the 80-byte header; one 64-byte digest per
+// nonce is written into d_hash.
+// Fix: the '<<<grid, block>>>' execution configuration was garbled to '<<>>'
+// in this patch (angle brackets stripped) -- restored here.
+__host__
+void x16_shabal512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash)
+{
+	const uint32_t threadsperblock = TPB_SHABAL;
+
+	// ceil-div so a partial final block covers the tail nonces
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+
+	x16_shabal512_gpu_hash_80 <<<grid, block>>> (threads, startNonce, d_hash);
+}
diff --git a/x16r/cuda_x16_simd512_80.cu b/x16r/cuda_x16_simd512_80.cu
new file mode 100644
index 0000000..142180a
--- /dev/null
+++ b/x16r/cuda_x16_simd512_80.cu
@@ -0,0 +1,1836 @@
+/**
+ * SIMD512 CUDA IMPLEMENTATION based on sph simd code
+ * tpruvot 2018 (with the help of kernelx xevan code)
+ */
+
+#include <miner.h>
+#include <cuda_helper.h>
+#include <cuda_vectors.h>
+
+#ifdef __INTELLISENSE__
+/* just for vstudio code colors */
+#define __CUDA_ARCH__ 500
+#define __byte_perm(x, y, m) (x|y)
+#endif
+
+#define TPB50_1 128
+#define TPB50_2 128
+#define TPB52_1 128
+#define TPB52_2 128
+
+#define sph_u32 uint32_t
+#define sph_s32 int32_t
+typedef uint32_t u32;
+typedef int32_t s32;
+
+#define C32 SPH_C32
+#define T32 SPH_T32
+#define ROL32 ROTL32
+#define XCAT(x, y) XCAT_(x, y)
+#define XCAT_(x, y) x ## y
+
+/*
+ * The powers of 41 modulo 257. We use exponents from 0 to 255, inclusive.
+ */
+__constant__ static const s32 alpha_tab[] = {
+ 1, 41, 139, 45, 46, 87, 226, 14, 60, 147, 116, 130, 190, 80, 196, 69,
+ 2, 82, 21, 90, 92, 174, 195, 28, 120, 37, 232, 3, 123, 160, 135, 138,
+ 4, 164, 42, 180, 184, 91, 133, 56, 240, 74, 207, 6, 246, 63, 13, 19,
+ 8, 71, 84, 103, 111, 182, 9, 112, 223, 148, 157, 12, 235, 126, 26, 38,
+ 16, 142, 168, 206, 222, 107, 18, 224, 189, 39, 57, 24, 213, 252, 52, 76,
+ 32, 27, 79, 155, 187, 214, 36, 191, 121, 78, 114, 48, 169, 247, 104, 152,
+ 64, 54, 158, 53, 117, 171, 72, 125, 242, 156, 228, 96, 81, 237, 208, 47,
+ 128, 108, 59, 106, 234, 85, 144, 250, 227, 55, 199, 192, 162, 217, 159, 94,
+ 256, 216, 118, 212, 211, 170, 31, 243, 197, 110, 141, 127, 67, 177, 61, 188,
+ 255, 175, 236, 167, 165, 83, 62, 229, 137, 220, 25, 254, 134, 97, 122, 119,
+ 253, 93, 215, 77, 73, 166, 124, 201, 17, 183, 50, 251, 11, 194, 244, 238,
+ 249, 186, 173, 154, 146, 75, 248, 145, 34, 109, 100, 245, 22, 131, 231, 219,
+ 241, 115, 89, 51, 35, 150, 239, 33, 68, 218, 200, 233, 44, 5, 205, 181,
+ 225, 230, 178, 102, 70, 43, 221, 66, 136, 179, 143, 209, 88, 10, 153, 105,
+ 193, 203, 99, 204, 140, 86, 185, 132, 15, 101, 29, 161, 176, 20, 49, 210,
+ 129, 149, 198, 151, 23, 172, 113, 7, 30, 202, 58, 65, 95, 40, 98, 163
+};
+
+/*
+ * Ranges:
+ * REDS1: from -32768..98302 to -383..383
+ * REDS2: from -2^31..2^31-1 to -32768..98302
+ */
+#define REDS1(x) (((x) & 0x00FF) - ((x) >> 8))
+#define REDS2(x) (((x) & 0xFFFF) + ((x) >> 16))
+
+/*
+ * If, upon entry, the values of q[] are all in the -N..N range (where
+ * N >= 98302) then the new values of q[] are in the -2N..2N range.
+ *
+ * Since alpha_tab[v] <= 256, maximum allowed range is for N = 8388608.
+ */
+#define FFT_LOOP_16_8(rb) do { \
+ s32 m = q[(rb)]; \
+ s32 n = q[(rb) + 16]; \
+ q[(rb)] = m + n; \
+ q[(rb) + 16] = m - n; \
+ s32 t; \
+ m = q[(rb) + 0 + 1]; \
+ n = q[(rb) + 0 + 1 + 16]; \
+ t = REDS2(n * alpha_tab[0 + 1 * 8]); \
+ q[(rb) + 0 + 1] = m + t; \
+ q[(rb) + 0 + 1 + 16] = m - t; \
+ m = q[(rb) + 0 + 2]; \
+ n = q[(rb) + 0 + 2 + 16]; \
+ t = REDS2(n * alpha_tab[0 + 2 * 8]); \
+ q[(rb) + 0 + 2] = m + t; \
+ q[(rb) + 0 + 2 + 16] = m - t; \
+ m = q[(rb) + 0 + 3]; \
+ n = q[(rb) + 0 + 3 + 16]; \
+ t = REDS2(n * alpha_tab[0 + 3 * 8]); \
+ q[(rb) + 0 + 3] = m + t; \
+ q[(rb) + 0 + 3 + 16] = m - t; \
+ \
+ m = q[(rb) + 4 + 0]; \
+ n = q[(rb) + 4 + 0 + 16]; \
+ t = REDS2(n * alpha_tab[32 + 0 * 8]); \
+ q[(rb) + 4 + 0] = m + t; \
+ q[(rb) + 4 + 0 + 16] = m - t; \
+ m = q[(rb) + 4 + 1]; \
+ n = q[(rb) + 4 + 1 + 16]; \
+ t = REDS2(n * alpha_tab[32 + 1 * 8]); \
+ q[(rb) + 4 + 1] = m + t; \
+ q[(rb) + 4 + 1 + 16] = m - t; \
+ m = q[(rb) + 4 + 2]; \
+ n = q[(rb) + 4 + 2 + 16]; \
+ t = REDS2(n * alpha_tab[32 + 2 * 8]); \
+ q[(rb) + 4 + 2] = m + t; \
+ q[(rb) + 4 + 2 + 16] = m - t; \
+ m = q[(rb) + 4 + 3]; \
+ n = q[(rb) + 4 + 3 + 16]; \
+ t = REDS2(n * alpha_tab[32 + 3 * 8]); \
+ q[(rb) + 4 + 3] = m + t; \
+ q[(rb) + 4 + 3 + 16] = m - t; \
+ \
+ m = q[(rb) + 8 + 0]; \
+ n = q[(rb) + 8 + 0 + 16]; \
+ t = REDS2(n * alpha_tab[64 + 0 * 8]); \
+ q[(rb) + 8 + 0] = m + t; \
+ q[(rb) + 8 + 0 + 16] = m - t; \
+ m = q[(rb) + 8 + 1]; \
+ n = q[(rb) + 8 + 1 + 16]; \
+ t = REDS2(n * alpha_tab[64 + 1 * 8]); \
+ q[(rb) + 8 + 1] = m + t; \
+ q[(rb) + 8 + 1 + 16] = m - t; \
+ m = q[(rb) + 8 + 2]; \
+ n = q[(rb) + 8 + 2 + 16]; \
+ t = REDS2(n * alpha_tab[64 + 2 * 8]); \
+ q[(rb) + 8 + 2] = m + t; \
+ q[(rb) + 8 + 2 + 16] = m - t; \
+ m = q[(rb) + 8 + 3]; \
+ n = q[(rb) + 8 + 3 + 16]; \
+ t = REDS2(n * alpha_tab[64 + 3 * 8]); \
+ q[(rb) + 8 + 3] = m + t; \
+ q[(rb) + 8 + 3 + 16] = m - t; \
+ \
+ m = q[(rb) + 12 + 0]; \
+ n = q[(rb) + 12 + 0 + 16]; \
+ t = REDS2(n * alpha_tab[96 + 0 * 8]); \
+ q[(rb) + 12 + 0] = m + t; \
+ q[(rb) + 12 + 0 + 16] = m - t; \
+ m = q[(rb) + 12 + 1]; \
+ n = q[(rb) + 12 + 1 + 16]; \
+ t = REDS2(n * alpha_tab[96 + 1 * 8]); \
+ q[(rb) + 12 + 1] = m + t; \
+ q[(rb) + 12 + 1 + 16] = m - t; \
+ m = q[(rb) + 12 + 2]; \
+ n = q[(rb) + 12 + 2 + 16]; \
+ t = REDS2(n * alpha_tab[96 + 2 * 8]); \
+ q[(rb) + 12 + 2] = m + t; \
+ q[(rb) + 12 + 2 + 16] = m - t; \
+ m = q[(rb) + 12 + 3]; \
+ n = q[(rb) + 12 + 3 + 16]; \
+ t = REDS2(n * alpha_tab[96 + 3 * 8]); \
+ q[(rb) + 12 + 3] = m + t; \
+ q[(rb) + 12 + 3 + 16] = m - t; \
+ } while (0)
+
+#define FFT_LOOP_32_4(rb) do { \
+ s32 m = q[(rb)]; \
+ s32 n = q[(rb) + 32]; \
+ q[(rb)] = m + n; \
+ q[(rb) + 32] = m - n; \
+ s32 t; \
+ m = q[(rb) + 0 + 1]; \
+ n = q[(rb) + 0 + 1 + 32]; \
+ t = REDS2(n * alpha_tab[0 + 1 * 4]); \
+ q[(rb) + 0 + 1] = m + t; \
+ q[(rb) + 0 + 1 + 32] = m - t; \
+ m = q[(rb) + 0 + 2]; \
+ n = q[(rb) + 0 + 2 + 32]; \
+ t = REDS2(n * alpha_tab[0 + 2 * 4]); \
+ q[(rb) + 0 + 2] = m + t; \
+ q[(rb) + 0 + 2 + 32] = m - t; \
+ m = q[(rb) + 0 + 3]; \
+ n = q[(rb) + 0 + 3 + 32]; \
+ t = REDS2(n * alpha_tab[0 + 3 * 4]); \
+ q[(rb) + 0 + 3] = m + t; \
+ q[(rb) + 0 + 3 + 32] = m - t; \
+ \
+ m = q[(rb) + 4 + 0]; \
+ n = q[(rb) + 4 + 0 + 32]; \
+ t = REDS2(n * alpha_tab[16 + 0 * 4]); \
+ q[(rb) + 4 + 0] = m + t; \
+ q[(rb) + 4 + 0 + 32] = m - t; \
+ m = q[(rb) + 4 + 1]; \
+ n = q[(rb) + 4 + 1 + 32]; \
+ t = REDS2(n * alpha_tab[16 + 1 * 4]); \
+ q[(rb) + 4 + 1] = m + t; \
+ q[(rb) + 4 + 1 + 32] = m - t; \
+ m = q[(rb) + 4 + 2]; \
+ n = q[(rb) + 4 + 2 + 32]; \
+ t = REDS2(n * alpha_tab[16 + 2 * 4]); \
+ q[(rb) + 4 + 2] = m + t; \
+ q[(rb) + 4 + 2 + 32] = m - t; \
+ m = q[(rb) + 4 + 3]; \
+ n = q[(rb) + 4 + 3 + 32]; \
+ t = REDS2(n * alpha_tab[16 + 3 * 4]); \
+ q[(rb) + 4 + 3] = m + t; \
+ q[(rb) + 4 + 3 + 32] = m - t; \
+ \
+ m = q[(rb) + 8 + 0]; \
+ n = q[(rb) + 8 + 0 + 32]; \
+ t = REDS2(n * alpha_tab[32 + 0 * 4]); \
+ q[(rb) + 8 + 0] = m + t; \
+ q[(rb) + 8 + 0 + 32] = m - t; \
+ m = q[(rb) + 8 + 1]; \
+ n = q[(rb) + 8 + 1 + 32]; \
+ t = REDS2(n * alpha_tab[32 + 1 * 4]); \
+ q[(rb) + 8 + 1] = m + t; \
+ q[(rb) + 8 + 1 + 32] = m - t; \
+ m = q[(rb) + 8 + 2]; \
+ n = q[(rb) + 8 + 2 + 32]; \
+ t = REDS2(n * alpha_tab[32 + 2 * 4]); \
+ q[(rb) + 8 + 2] = m + t; \
+ q[(rb) + 8 + 2 + 32] = m - t; \
+ m = q[(rb) + 8 + 3]; \
+ n = q[(rb) + 8 + 3 + 32]; \
+ t = REDS2(n * alpha_tab[32 + 3 * 4]); \
+ q[(rb) + 8 + 3] = m + t; \
+ q[(rb) + 8 + 3 + 32] = m - t; \
+ \
+ m = q[(rb) + 12 + 0]; \
+ n = q[(rb) + 12 + 0 + 32]; \
+ t = REDS2(n * alpha_tab[48 + 0 * 4]); \
+ q[(rb) + 12 + 0] = m + t; \
+ q[(rb) + 12 + 0 + 32] = m - t; \
+ m = q[(rb) + 12 + 1]; \
+ n = q[(rb) + 12 + 1 + 32]; \
+ t = REDS2(n * alpha_tab[48 + 1 * 4]); \
+ q[(rb) + 12 + 1] = m + t; \
+ q[(rb) + 12 + 1 + 32] = m - t; \
+ m = q[(rb) + 12 + 2]; \
+ n = q[(rb) + 12 + 2 + 32]; \
+ t = REDS2(n * alpha_tab[48 + 2 * 4]); \
+ q[(rb) + 12 + 2] = m + t; \
+ q[(rb) + 12 + 2 + 32] = m - t; \
+ m = q[(rb) + 12 + 3]; \
+ n = q[(rb) + 12 + 3 + 32]; \
+ t = REDS2(n * alpha_tab[48 + 3 * 4]); \
+ q[(rb) + 12 + 3] = m + t; \
+ q[(rb) + 12 + 3 + 32] = m - t; \
+ \
+ m = q[(rb) + 16 + 0]; \
+ n = q[(rb) + 16 + 0 + 32]; \
+ t = REDS2(n * alpha_tab[64 + 0 * 4]); \
+ q[(rb) + 16 + 0] = m + t; \
+ q[(rb) + 16 + 0 + 32] = m - t; \
+ m = q[(rb) + 16 + 1]; \
+ n = q[(rb) + 16 + 1 + 32]; \
+ t = REDS2(n * alpha_tab[64 + 1 * 4]); \
+ q[(rb) + 16 + 1] = m + t; \
+ q[(rb) + 16 + 1 + 32] = m - t; \
+ m = q[(rb) + 16 + 2]; \
+ n = q[(rb) + 16 + 2 + 32]; \
+ t = REDS2(n * alpha_tab[64 + 2 * 4]); \
+ q[(rb) + 16 + 2] = m + t; \
+ q[(rb) + 16 + 2 + 32] = m - t; \
+ m = q[(rb) + 16 + 3]; \
+ n = q[(rb) + 16 + 3 + 32]; \
+ t = REDS2(n * alpha_tab[64 + 3 * 4]); \
+ q[(rb) + 16 + 3] = m + t; \
+ q[(rb) + 16 + 3 + 32] = m - t; \
+ \
+ m = q[(rb) + 20 + 0]; \
+ n = q[(rb) + 20 + 0 + 32]; \
+ t = REDS2(n * alpha_tab[80 + 0 * 4]); \
+ q[(rb) + 20 + 0] = m + t; \
+ q[(rb) + 20 + 0 + 32] = m - t; \
+ m = q[(rb) + 20 + 1]; \
+ n = q[(rb) + 20 + 1 + 32]; \
+ t = REDS2(n * alpha_tab[80 + 1 * 4]); \
+ q[(rb) + 20 + 1] = m + t; \
+ q[(rb) + 20 + 1 + 32] = m - t; \
+ m = q[(rb) + 20 + 2]; \
+ n = q[(rb) + 20 + 2 + 32]; \
+ t = REDS2(n * alpha_tab[80 + 2 * 4]); \
+ q[(rb) + 20 + 2] = m + t; \
+ q[(rb) + 20 + 2 + 32] = m - t; \
+ m = q[(rb) + 20 + 3]; \
+ n = q[(rb) + 20 + 3 + 32]; \
+ t = REDS2(n * alpha_tab[80 + 3 * 4]); \
+ q[(rb) + 20 + 3] = m + t; \
+ q[(rb) + 20 + 3 + 32] = m - t; \
+ \
+ m = q[(rb) + 24 + 0]; \
+ n = q[(rb) + 24 + 0 + 32]; \
+ t = REDS2(n * alpha_tab[96 + 0 * 4]); \
+ q[(rb) + 24 + 0] = m + t; \
+ q[(rb) + 24 + 0 + 32] = m - t; \
+ m = q[(rb) + 24 + 1]; \
+ n = q[(rb) + 24 + 1 + 32]; \
+ t = REDS2(n * alpha_tab[96 + 1 * 4]); \
+ q[(rb) + 24 + 1] = m + t; \
+ q[(rb) + 24 + 1 + 32] = m - t; \
+ m = q[(rb) + 24 + 2]; \
+ n = q[(rb) + 24 + 2 + 32]; \
+ t = REDS2(n * alpha_tab[96 + 2 * 4]); \
+ q[(rb) + 24 + 2] = m + t; \
+ q[(rb) + 24 + 2 + 32] = m - t; \
+ m = q[(rb) + 24 + 3]; \
+ n = q[(rb) + 24 + 3 + 32]; \
+ t = REDS2(n * alpha_tab[96 + 3 * 4]); \
+ q[(rb) + 24 + 3] = m + t; \
+ q[(rb) + 24 + 3 + 32] = m - t; \
+ \
+ m = q[(rb) + 28 + 0]; \
+ n = q[(rb) + 28 + 0 + 32]; \
+ t = REDS2(n * alpha_tab[112 + 0 * 4]); \
+ q[(rb) + 28 + 0] = m + t; \
+ q[(rb) + 28 + 0 + 32] = m - t; \
+ m = q[(rb) + 28 + 1]; \
+ n = q[(rb) + 28 + 1 + 32]; \
+ t = REDS2(n * alpha_tab[112 + 1 * 4]); \
+ q[(rb) + 28 + 1] = m + t; \
+ q[(rb) + 28 + 1 + 32] = m - t; \
+ m = q[(rb) + 28 + 2]; \
+ n = q[(rb) + 28 + 2 + 32]; \
+ t = REDS2(n * alpha_tab[112 + 2 * 4]); \
+ q[(rb) + 28 + 2] = m + t; \
+ q[(rb) + 28 + 2 + 32] = m - t; \
+ m = q[(rb) + 28 + 3]; \
+ n = q[(rb) + 28 + 3 + 32]; \
+ t = REDS2(n * alpha_tab[112 + 3 * 4]); \
+ q[(rb) + 28 + 3] = m + t; \
+ q[(rb) + 28 + 3 + 32] = m - t; \
+ } while (0)
+
+#define FFT_LOOP_64_2(rb) do { \
+ s32 m = q[(rb)]; \
+ s32 n = q[(rb) + 64]; \
+ q[(rb)] = m + n; \
+ q[(rb) + 64] = m - n; \
+ s32 t; \
+ m = q[(rb) + 0 + 1]; \
+ n = q[(rb) + 0 + 1 + 64]; \
+ t = REDS2(n * alpha_tab[0 + 1 * 2]); \
+ q[(rb) + 0 + 1] = m + t; \
+ q[(rb) + 0 + 1 + 64] = m - t; \
+ m = q[(rb) + 0 + 2]; \
+ n = q[(rb) + 0 + 2 + 64]; \
+ t = REDS2(n * alpha_tab[0 + 2 * 2]); \
+ q[(rb) + 0 + 2] = m + t; \
+ q[(rb) + 0 + 2 + 64] = m - t; \
+ m = q[(rb) + 0 + 3]; \
+ n = q[(rb) + 0 + 3 + 64]; \
+ t = REDS2(n * alpha_tab[0 + 3 * 2]); \
+ q[(rb) + 0 + 3] = m + t; \
+ q[(rb) + 0 + 3 + 64] = m - t; \
+ \
+ m = q[(rb) + 4 + 0]; \
+ n = q[(rb) + 4 + 0 + 64]; \
+ t = REDS2(n * alpha_tab[8 + 0 * 2]); \
+ q[(rb) + 4 + 0] = m + t; \
+ q[(rb) + 4 + 0 + 64] = m - t; \
+ m = q[(rb) + 4 + 1]; \
+ n = q[(rb) + 4 + 1 + 64]; \
+ t = REDS2(n * alpha_tab[8 + 1 * 2]); \
+ q[(rb) + 4 + 1] = m + t; \
+ q[(rb) + 4 + 1 + 64] = m - t; \
+ m = q[(rb) + 4 + 2]; \
+ n = q[(rb) + 4 + 2 + 64]; \
+ t = REDS2(n * alpha_tab[8 + 2 * 2]); \
+ q[(rb) + 4 + 2] = m + t; \
+ q[(rb) + 4 + 2 + 64] = m - t; \
+ m = q[(rb) + 4 + 3]; \
+ n = q[(rb) + 4 + 3 + 64]; \
+ t = REDS2(n * alpha_tab[8 + 3 * 2]); \
+ q[(rb) + 4 + 3] = m + t; \
+ q[(rb) + 4 + 3 + 64] = m - t; \
+ \
+ m = q[(rb) + 8 + 0]; \
+ n = q[(rb) + 8 + 0 + 64]; \
+ t = REDS2(n * alpha_tab[16 + 0 * 2]); \
+ q[(rb) + 8 + 0] = m + t; \
+ q[(rb) + 8 + 0 + 64] = m - t; \
+ m = q[(rb) + 8 + 1]; \
+ n = q[(rb) + 8 + 1 + 64]; \
+ t = REDS2(n * alpha_tab[16 + 1 * 2]); \
+ q[(rb) + 8 + 1] = m + t; \
+ q[(rb) + 8 + 1 + 64] = m - t; \
+ m = q[(rb) + 8 + 2]; \
+ n = q[(rb) + 8 + 2 + 64]; \
+ t = REDS2(n * alpha_tab[16 + 2 * 2]); \
+ q[(rb) + 8 + 2] = m + t; \
+ q[(rb) + 8 + 2 + 64] = m - t; \
+ m = q[(rb) + 8 + 3]; \
+ n = q[(rb) + 8 + 3 + 64]; \
+ t = REDS2(n * alpha_tab[16 + 3 * 2]); \
+ q[(rb) + 8 + 3] = m + t; \
+ q[(rb) + 8 + 3 + 64] = m - t; \
+ \
+ m = q[(rb) + 12 + 0]; \
+ n = q[(rb) + 12 + 0 + 64]; \
+ t = REDS2(n * alpha_tab[24 + 0 * 2]); \
+ q[(rb) + 12 + 0] = m + t; \
+ q[(rb) + 12 + 0 + 64] = m - t; \
+ m = q[(rb) + 12 + 1]; \
+ n = q[(rb) + 12 + 1 + 64]; \
+ t = REDS2(n * alpha_tab[24 + 1 * 2]); \
+ q[(rb) + 12 + 1] = m + t; \
+ q[(rb) + 12 + 1 + 64] = m - t; \
+ m = q[(rb) + 12 + 2]; \
+ n = q[(rb) + 12 + 2 + 64]; \
+ t = REDS2(n * alpha_tab[24 + 2 * 2]); \
+ q[(rb) + 12 + 2] = m + t; \
+ q[(rb) + 12 + 2 + 64] = m - t; \
+ m = q[(rb) + 12 + 3]; \
+ n = q[(rb) + 12 + 3 + 64]; \
+ t = REDS2(n * alpha_tab[24 + 3 * 2]); \
+ q[(rb) + 12 + 3] = m + t; \
+ q[(rb) + 12 + 3 + 64] = m - t; \
+ \
+ m = q[(rb) + 16 + 0]; \
+ n = q[(rb) + 16 + 0 + 64]; \
+ t = REDS2(n * alpha_tab[32 + 0 * 2]); \
+ q[(rb) + 16 + 0] = m + t; \
+ q[(rb) + 16 + 0 + 64] = m - t; \
+ m = q[(rb) + 16 + 1]; \
+ n = q[(rb) + 16 + 1 + 64]; \
+ t = REDS2(n * alpha_tab[32 + 1 * 2]); \
+ q[(rb) + 16 + 1] = m + t; \
+ q[(rb) + 16 + 1 + 64] = m - t; \
+ m = q[(rb) + 16 + 2]; \
+ n = q[(rb) + 16 + 2 + 64]; \
+ t = REDS2(n * alpha_tab[32 + 2 * 2]); \
+ q[(rb) + 16 + 2] = m + t; \
+ q[(rb) + 16 + 2 + 64] = m - t; \
+ m = q[(rb) + 16 + 3]; \
+ n = q[(rb) + 16 + 3 + 64]; \
+ t = REDS2(n * alpha_tab[32 + 3 * 2]); \
+ q[(rb) + 16 + 3] = m + t; \
+ q[(rb) + 16 + 3 + 64] = m - t; \
+ \
+ m = q[(rb) + 20 + 0]; \
+ n = q[(rb) + 20 + 0 + 64]; \
+ t = REDS2(n * alpha_tab[40 + 0 * 2]); \
+ q[(rb) + 20 + 0] = m + t; \
+ q[(rb) + 20 + 0 + 64] = m - t; \
+ m = q[(rb) + 20 + 1]; \
+ n = q[(rb) + 20 + 1 + 64]; \
+ t = REDS2(n * alpha_tab[40 + 1 * 2]); \
+ q[(rb) + 20 + 1] = m + t; \
+ q[(rb) + 20 + 1 + 64] = m - t; \
+ m = q[(rb) + 20 + 2]; \
+ n = q[(rb) + 20 + 2 + 64]; \
+ t = REDS2(n * alpha_tab[40 + 2 * 2]); \
+ q[(rb) + 20 + 2] = m + t; \
+ q[(rb) + 20 + 2 + 64] = m - t; \
+ m = q[(rb) + 20 + 3]; \
+ n = q[(rb) + 20 + 3 + 64]; \
+ t = REDS2(n * alpha_tab[40 + 3 * 2]); \
+ q[(rb) + 20 + 3] = m + t; \
+ q[(rb) + 20 + 3 + 64] = m - t; \
+ \
+ m = q[(rb) + 24 + 0]; \
+ n = q[(rb) + 24 + 0 + 64]; \
+ t = REDS2(n * alpha_tab[48 + 0 * 2]); \
+ q[(rb) + 24 + 0] = m + t; \
+ q[(rb) + 24 + 0 + 64] = m - t; \
+ m = q[(rb) + 24 + 1]; \
+ n = q[(rb) + 24 + 1 + 64]; \
+ t = REDS2(n * alpha_tab[48 + 1 * 2]); \
+ q[(rb) + 24 + 1] = m + t; \
+ q[(rb) + 24 + 1 + 64] = m - t; \
+ m = q[(rb) + 24 + 2]; \
+ n = q[(rb) + 24 + 2 + 64]; \
+ t = REDS2(n * alpha_tab[48 + 2 * 2]); \
+ q[(rb) + 24 + 2] = m + t; \
+ q[(rb) + 24 + 2 + 64] = m - t; \
+ m = q[(rb) + 24 + 3]; \
+ n = q[(rb) + 24 + 3 + 64]; \
+ t = REDS2(n * alpha_tab[48 + 3 * 2]); \
+ q[(rb) + 24 + 3] = m + t; \
+ q[(rb) + 24 + 3 + 64] = m - t; \
+ \
+ m = q[(rb) + 28 + 0]; \
+ n = q[(rb) + 28 + 0 + 64]; \
+ t = REDS2(n * alpha_tab[56 + 0 * 2]); \
+ q[(rb) + 28 + 0] = m + t; \
+ q[(rb) + 28 + 0 + 64] = m - t; \
+ m = q[(rb) + 28 + 1]; \
+ n = q[(rb) + 28 + 1 + 64]; \
+ t = REDS2(n * alpha_tab[56 + 1 * 2]); \
+ q[(rb) + 28 + 1] = m + t; \
+ q[(rb) + 28 + 1 + 64] = m - t; \
+ m = q[(rb) + 28 + 2]; \
+ n = q[(rb) + 28 + 2 + 64]; \
+ t = REDS2(n * alpha_tab[56 + 2 * 2]); \
+ q[(rb) + 28 + 2] = m + t; \
+ q[(rb) + 28 + 2 + 64] = m - t; \
+ m = q[(rb) + 28 + 3]; \
+ n = q[(rb) + 28 + 3 + 64]; \
+ t = REDS2(n * alpha_tab[56 + 3 * 2]); \
+ q[(rb) + 28 + 3] = m + t; \
+ q[(rb) + 28 + 3 + 64] = m - t; \
+ \
+ m = q[(rb) + 32 + 0]; \
+ n = q[(rb) + 32 + 0 + 64]; \
+ t = REDS2(n * alpha_tab[64 + 0 * 2]); \
+ q[(rb) + 32 + 0] = m + t; \
+ q[(rb) + 32 + 0 + 64] = m - t; \
+ m = q[(rb) + 32 + 1]; \
+ n = q[(rb) + 32 + 1 + 64]; \
+ t = REDS2(n * alpha_tab[64 + 1 * 2]); \
+ q[(rb) + 32 + 1] = m + t; \
+ q[(rb) + 32 + 1 + 64] = m - t; \
+ m = q[(rb) + 32 + 2]; \
+ n = q[(rb) + 32 + 2 + 64]; \
+ t = REDS2(n * alpha_tab[64 + 2 * 2]); \
+ q[(rb) + 32 + 2] = m + t; \
+ q[(rb) + 32 + 2 + 64] = m - t; \
+ m = q[(rb) + 32 + 3]; \
+ n = q[(rb) + 32 + 3 + 64]; \
+ t = REDS2(n * alpha_tab[64 + 3 * 2]); \
+ q[(rb) + 32 + 3] = m + t; \
+ q[(rb) + 32 + 3 + 64] = m - t; \
+ \
+ m = q[(rb) + 36 + 0]; \
+ n = q[(rb) + 36 + 0 + 64]; \
+ t = REDS2(n * alpha_tab[72 + 0 * 2]); \
+ q[(rb) + 36 + 0] = m + t; \
+ q[(rb) + 36 + 0 + 64] = m - t; \
+ m = q[(rb) + 36 + 1]; \
+ n = q[(rb) + 36 + 1 + 64]; \
+ t = REDS2(n * alpha_tab[72 + 1 * 2]); \
+ q[(rb) + 36 + 1] = m + t; \
+ q[(rb) + 36 + 1 + 64] = m - t; \
+ m = q[(rb) + 36 + 2]; \
+ n = q[(rb) + 36 + 2 + 64]; \
+ t = REDS2(n * alpha_tab[72 + 2 * 2]); \
+ q[(rb) + 36 + 2] = m + t; \
+ q[(rb) + 36 + 2 + 64] = m - t; \
+ m = q[(rb) + 36 + 3]; \
+ n = q[(rb) + 36 + 3 + 64]; \
+ t = REDS2(n * alpha_tab[72 + 3 * 2]); \
+ q[(rb) + 36 + 3] = m + t; \
+ q[(rb) + 36 + 3 + 64] = m - t; \
+ \
+ m = q[(rb) + 40 + 0]; \
+ n = q[(rb) + 40 + 0 + 64]; \
+ t = REDS2(n * alpha_tab[80 + 0 * 2]); \
+ q[(rb) + 40 + 0] = m + t; \
+ q[(rb) + 40 + 0 + 64] = m - t; \
+ m = q[(rb) + 40 + 1]; \
+ n = q[(rb) + 40 + 1 + 64]; \
+ t = REDS2(n * alpha_tab[80 + 1 * 2]); \
+ q[(rb) + 40 + 1] = m + t; \
+ q[(rb) + 40 + 1 + 64] = m - t; \
+ m = q[(rb) + 40 + 2]; \
+ n = q[(rb) + 40 + 2 + 64]; \
+ t = REDS2(n * alpha_tab[80 + 2 * 2]); \
+ q[(rb) + 40 + 2] = m + t; \
+ q[(rb) + 40 + 2 + 64] = m - t; \
+ m = q[(rb) + 40 + 3]; \
+ n = q[(rb) + 40 + 3 + 64]; \
+ t = REDS2(n * alpha_tab[80 + 3 * 2]); \
+ q[(rb) + 40 + 3] = m + t; \
+ q[(rb) + 40 + 3 + 64] = m - t; \
+ \
+ m = q[(rb) + 44 + 0]; \
+ n = q[(rb) + 44 + 0 + 64]; \
+ t = REDS2(n * alpha_tab[88 + 0 * 2]); \
+ q[(rb) + 44 + 0] = m + t; \
+ q[(rb) + 44 + 0 + 64] = m - t; \
+ m = q[(rb) + 44 + 1]; \
+ n = q[(rb) + 44 + 1 + 64]; \
+ t = REDS2(n * alpha_tab[88 + 1 * 2]); \
+ q[(rb) + 44 + 1] = m + t; \
+ q[(rb) + 44 + 1 + 64] = m - t; \
+ m = q[(rb) + 44 + 2]; \
+ n = q[(rb) + 44 + 2 + 64]; \
+ t = REDS2(n * alpha_tab[88 + 2 * 2]); \
+ q[(rb) + 44 + 2] = m + t; \
+ q[(rb) + 44 + 2 + 64] = m - t; \
+ m = q[(rb) + 44 + 3]; \
+ n = q[(rb) + 44 + 3 + 64]; \
+ t = REDS2(n * alpha_tab[88 + 3 * 2]); \
+ q[(rb) + 44 + 3] = m + t; \
+ q[(rb) + 44 + 3 + 64] = m - t; \
+ \
+ m = q[(rb) + 48 + 0]; \
+ n = q[(rb) + 48 + 0 + 64]; \
+ t = REDS2(n * alpha_tab[96 + 0 * 2]); \
+ q[(rb) + 48 + 0] = m + t; \
+ q[(rb) + 48 + 0 + 64] = m - t; \
+ m = q[(rb) + 48 + 1]; \
+ n = q[(rb) + 48 + 1 + 64]; \
+ t = REDS2(n * alpha_tab[96 + 1 * 2]); \
+ q[(rb) + 48 + 1] = m + t; \
+ q[(rb) + 48 + 1 + 64] = m - t; \
+ m = q[(rb) + 48 + 2]; \
+ n = q[(rb) + 48 + 2 + 64]; \
+ t = REDS2(n * alpha_tab[96 + 2 * 2]); \
+ q[(rb) + 48 + 2] = m + t; \
+ q[(rb) + 48 + 2 + 64] = m - t; \
+ m = q[(rb) + 48 + 3]; \
+ n = q[(rb) + 48 + 3 + 64]; \
+ t = REDS2(n * alpha_tab[96 + 3 * 2]); \
+ q[(rb) + 48 + 3] = m + t; \
+ q[(rb) + 48 + 3 + 64] = m - t; \
+ \
+ m = q[(rb) + 52 + 0]; \
+ n = q[(rb) + 52 + 0 + 64]; \
+ t = REDS2(n * alpha_tab[104 + 0 * 2]); \
+ q[(rb) + 52 + 0] = m + t; \
+ q[(rb) + 52 + 0 + 64] = m - t; \
+ m = q[(rb) + 52 + 1]; \
+ n = q[(rb) + 52 + 1 + 64]; \
+ t = REDS2(n * alpha_tab[104 + 1 * 2]); \
+ q[(rb) + 52 + 1] = m + t; \
+ q[(rb) + 52 + 1 + 64] = m - t; \
+ m = q[(rb) + 52 + 2]; \
+ n = q[(rb) + 52 + 2 + 64]; \
+ t = REDS2(n * alpha_tab[104 + 2 * 2]); \
+ q[(rb) + 52 + 2] = m + t; \
+ q[(rb) + 52 + 2 + 64] = m - t; \
+ m = q[(rb) + 52 + 3]; \
+ n = q[(rb) + 52 + 3 + 64]; \
+ t = REDS2(n * alpha_tab[104 + 3 * 2]); \
+ q[(rb) + 52 + 3] = m + t; \
+ q[(rb) + 52 + 3 + 64] = m - t; \
+ \
+ m = q[(rb) + 56 + 0]; \
+ n = q[(rb) + 56 + 0 + 64]; \
+ t = REDS2(n * alpha_tab[112 + 0 * 2]); \
+ q[(rb) + 56 + 0] = m + t; \
+ q[(rb) + 56 + 0 + 64] = m - t; \
+ m = q[(rb) + 56 + 1]; \
+ n = q[(rb) + 56 + 1 + 64]; \
+ t = REDS2(n * alpha_tab[112 + 1 * 2]); \
+ q[(rb) + 56 + 1] = m + t; \
+ q[(rb) + 56 + 1 + 64] = m - t; \
+ m = q[(rb) + 56 + 2]; \
+ n = q[(rb) + 56 + 2 + 64]; \
+ t = REDS2(n * alpha_tab[112 + 2 * 2]); \
+ q[(rb) + 56 + 2] = m + t; \
+ q[(rb) + 56 + 2 + 64] = m - t; \
+ m = q[(rb) + 56 + 3]; \
+ n = q[(rb) + 56 + 3 + 64]; \
+ t = REDS2(n * alpha_tab[112 + 3 * 2]); \
+ q[(rb) + 56 + 3] = m + t; \
+ q[(rb) + 56 + 3 + 64] = m - t; \
+ \
+ m = q[(rb) + 60 + 0]; \
+ n = q[(rb) + 60 + 0 + 64]; \
+ t = REDS2(n * alpha_tab[120 + 0 * 2]); \
+ q[(rb) + 60 + 0] = m + t; \
+ q[(rb) + 60 + 0 + 64] = m - t; \
+ m = q[(rb) + 60 + 1]; \
+ n = q[(rb) + 60 + 1 + 64]; \
+ t = REDS2(n * alpha_tab[120 + 1 * 2]); \
+ q[(rb) + 60 + 1] = m + t; \
+ q[(rb) + 60 + 1 + 64] = m - t; \
+ m = q[(rb) + 60 + 2]; \
+ n = q[(rb) + 60 + 2 + 64]; \
+ t = REDS2(n * alpha_tab[120 + 2 * 2]); \
+ q[(rb) + 60 + 2] = m + t; \
+ q[(rb) + 60 + 2 + 64] = m - t; \
+ m = q[(rb) + 60 + 3]; \
+ n = q[(rb) + 60 + 3 + 64]; \
+ t = REDS2(n * alpha_tab[120 + 3 * 2]); \
+ q[(rb) + 60 + 3] = m + t; \
+ q[(rb) + 60 + 3 + 64] = m - t; \
+ } while (0)
+
+#define FFT_LOOP_128_1(rb) do { \
+ s32 m = q[(rb)]; \
+ s32 n = q[(rb) + 128]; \
+ q[(rb)] = m + n; \
+ q[(rb) + 128] = m - n; \
+ s32 t; \
+ m = q[(rb) + 0 + 1]; \
+ n = q[(rb) + 0 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[0 + 1 * 1]); \
+ q[(rb) + 0 + 1] = m + t; \
+ q[(rb) + 0 + 1 + 128] = m - t; \
+ m = q[(rb) + 0 + 2]; \
+ n = q[(rb) + 0 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[0 + 2 * 1]); \
+ q[(rb) + 0 + 2] = m + t; \
+ q[(rb) + 0 + 2 + 128] = m - t; \
+ m = q[(rb) + 0 + 3]; \
+ n = q[(rb) + 0 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[0 + 3 * 1]); \
+ q[(rb) + 0 + 3] = m + t; \
+ q[(rb) + 0 + 3 + 128] = m - t; \
+ m = q[(rb) + 4 + 0]; \
+ n = q[(rb) + 4 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[4 + 0 * 1]); \
+ q[(rb) + 4 + 0] = m + t; \
+ q[(rb) + 4 + 0 + 128] = m - t; \
+ m = q[(rb) + 4 + 1]; \
+ n = q[(rb) + 4 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[4 + 1 * 1]); \
+ q[(rb) + 4 + 1] = m + t; \
+ q[(rb) + 4 + 1 + 128] = m - t; \
+ m = q[(rb) + 4 + 2]; \
+ n = q[(rb) + 4 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[4 + 2 * 1]); \
+ q[(rb) + 4 + 2] = m + t; \
+ q[(rb) + 4 + 2 + 128] = m - t; \
+ m = q[(rb) + 4 + 3]; \
+ n = q[(rb) + 4 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[4 + 3 * 1]); \
+ q[(rb) + 4 + 3] = m + t; \
+ q[(rb) + 4 + 3 + 128] = m - t; \
+ m = q[(rb) + 8 + 0]; \
+ n = q[(rb) + 8 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[8 + 0 * 1]); \
+ q[(rb) + 8 + 0] = m + t; \
+ q[(rb) + 8 + 0 + 128] = m - t; \
+ m = q[(rb) + 8 + 1]; \
+ n = q[(rb) + 8 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[8 + 1 * 1]); \
+ q[(rb) + 8 + 1] = m + t; \
+ q[(rb) + 8 + 1 + 128] = m - t; \
+ m = q[(rb) + 8 + 2]; \
+ n = q[(rb) + 8 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[8 + 2 * 1]); \
+ q[(rb) + 8 + 2] = m + t; \
+ q[(rb) + 8 + 2 + 128] = m - t; \
+ m = q[(rb) + 8 + 3]; \
+ n = q[(rb) + 8 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[8 + 3 * 1]); \
+ q[(rb) + 8 + 3] = m + t; \
+ q[(rb) + 8 + 3 + 128] = m - t; \
+ m = q[(rb) + 12 + 0]; \
+ n = q[(rb) + 12 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[12 + 0 * 1]); \
+ q[(rb) + 12 + 0] = m + t; \
+ q[(rb) + 12 + 0 + 128] = m - t; \
+ m = q[(rb) + 12 + 1]; \
+ n = q[(rb) + 12 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[12 + 1 * 1]); \
+ q[(rb) + 12 + 1] = m + t; \
+ q[(rb) + 12 + 1 + 128] = m - t; \
+ m = q[(rb) + 12 + 2]; \
+ n = q[(rb) + 12 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[12 + 2 * 1]); \
+ q[(rb) + 12 + 2] = m + t; \
+ q[(rb) + 12 + 2 + 128] = m - t; \
+ m = q[(rb) + 12 + 3]; \
+ n = q[(rb) + 12 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[12 + 3 * 1]); \
+ q[(rb) + 12 + 3] = m + t; \
+ q[(rb) + 12 + 3 + 128] = m - t; \
+ m = q[(rb) + 16 + 0]; \
+ n = q[(rb) + 16 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[16 + 0 * 1]); \
+ q[(rb) + 16 + 0] = m + t; \
+ q[(rb) + 16 + 0 + 128] = m - t; \
+ m = q[(rb) + 16 + 1]; \
+ n = q[(rb) + 16 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[16 + 1 * 1]); \
+ q[(rb) + 16 + 1] = m + t; \
+ q[(rb) + 16 + 1 + 128] = m - t; \
+ m = q[(rb) + 16 + 2]; \
+ n = q[(rb) + 16 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[16 + 2 * 1]); \
+ q[(rb) + 16 + 2] = m + t; \
+ q[(rb) + 16 + 2 + 128] = m - t; \
+ m = q[(rb) + 16 + 3]; \
+ n = q[(rb) + 16 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[16 + 3 * 1]); \
+ q[(rb) + 16 + 3] = m + t; \
+ q[(rb) + 16 + 3 + 128] = m - t; \
+ m = q[(rb) + 20 + 0]; \
+ n = q[(rb) + 20 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[20 + 0 * 1]); \
+ q[(rb) + 20 + 0] = m + t; \
+ q[(rb) + 20 + 0 + 128] = m - t; \
+ m = q[(rb) + 20 + 1]; \
+ n = q[(rb) + 20 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[20 + 1 * 1]); \
+ q[(rb) + 20 + 1] = m + t; \
+ q[(rb) + 20 + 1 + 128] = m - t; \
+ m = q[(rb) + 20 + 2]; \
+ n = q[(rb) + 20 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[20 + 2 * 1]); \
+ q[(rb) + 20 + 2] = m + t; \
+ q[(rb) + 20 + 2 + 128] = m - t; \
+ m = q[(rb) + 20 + 3]; \
+ n = q[(rb) + 20 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[20 + 3 * 1]); \
+ q[(rb) + 20 + 3] = m + t; \
+ q[(rb) + 20 + 3 + 128] = m - t; \
+ m = q[(rb) + 24 + 0]; \
+ n = q[(rb) + 24 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[24 + 0 * 1]); \
+ q[(rb) + 24 + 0] = m + t; \
+ q[(rb) + 24 + 0 + 128] = m - t; \
+ m = q[(rb) + 24 + 1]; \
+ n = q[(rb) + 24 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[24 + 1 * 1]); \
+ q[(rb) + 24 + 1] = m + t; \
+ q[(rb) + 24 + 1 + 128] = m - t; \
+ m = q[(rb) + 24 + 2]; \
+ n = q[(rb) + 24 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[24 + 2 * 1]); \
+ q[(rb) + 24 + 2] = m + t; \
+ q[(rb) + 24 + 2 + 128] = m - t; \
+ m = q[(rb) + 24 + 3]; \
+ n = q[(rb) + 24 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[24 + 3 * 1]); \
+ q[(rb) + 24 + 3] = m + t; \
+ q[(rb) + 24 + 3 + 128] = m - t; \
+ m = q[(rb) + 28 + 0]; \
+ n = q[(rb) + 28 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[28 + 0 * 1]); \
+ q[(rb) + 28 + 0] = m + t; \
+ q[(rb) + 28 + 0 + 128] = m - t; \
+ m = q[(rb) + 28 + 1]; \
+ n = q[(rb) + 28 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[28 + 1 * 1]); \
+ q[(rb) + 28 + 1] = m + t; \
+ q[(rb) + 28 + 1 + 128] = m - t; \
+ m = q[(rb) + 28 + 2]; \
+ n = q[(rb) + 28 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[28 + 2 * 1]); \
+ q[(rb) + 28 + 2] = m + t; \
+ q[(rb) + 28 + 2 + 128] = m - t; \
+ m = q[(rb) + 28 + 3]; \
+ n = q[(rb) + 28 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[28 + 3 * 1]); \
+ q[(rb) + 28 + 3] = m + t; \
+ q[(rb) + 28 + 3 + 128] = m - t; \
+ m = q[(rb) + 32 + 0]; \
+ n = q[(rb) + 32 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[32 + 0 * 1]); \
+ q[(rb) + 32 + 0] = m + t; \
+ q[(rb) + 32 + 0 + 128] = m - t; \
+ m = q[(rb) + 32 + 1]; \
+ n = q[(rb) + 32 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[32 + 1 * 1]); \
+ q[(rb) + 32 + 1] = m + t; \
+ q[(rb) + 32 + 1 + 128] = m - t; \
+ m = q[(rb) + 32 + 2]; \
+ n = q[(rb) + 32 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[32 + 2 * 1]); \
+ q[(rb) + 32 + 2] = m + t; \
+ q[(rb) + 32 + 2 + 128] = m - t; \
+ m = q[(rb) + 32 + 3]; \
+ n = q[(rb) + 32 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[32 + 3 * 1]); \
+ q[(rb) + 32 + 3] = m + t; \
+ q[(rb) + 32 + 3 + 128] = m - t; \
+ m = q[(rb) + 36 + 0]; \
+ n = q[(rb) + 36 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[36 + 0 * 1]); \
+ q[(rb) + 36 + 0] = m + t; \
+ q[(rb) + 36 + 0 + 128] = m - t; \
+ m = q[(rb) + 36 + 1]; \
+ n = q[(rb) + 36 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[36 + 1 * 1]); \
+ q[(rb) + 36 + 1] = m + t; \
+ q[(rb) + 36 + 1 + 128] = m - t; \
+ m = q[(rb) + 36 + 2]; \
+ n = q[(rb) + 36 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[36 + 2 * 1]); \
+ q[(rb) + 36 + 2] = m + t; \
+ q[(rb) + 36 + 2 + 128] = m - t; \
+ m = q[(rb) + 36 + 3]; \
+ n = q[(rb) + 36 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[36 + 3 * 1]); \
+ q[(rb) + 36 + 3] = m + t; \
+ q[(rb) + 36 + 3 + 128] = m - t; \
+ m = q[(rb) + 40 + 0]; \
+ n = q[(rb) + 40 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[40 + 0 * 1]); \
+ q[(rb) + 40 + 0] = m + t; \
+ q[(rb) + 40 + 0 + 128] = m - t; \
+ m = q[(rb) + 40 + 1]; \
+ n = q[(rb) + 40 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[40 + 1 * 1]); \
+ q[(rb) + 40 + 1] = m + t; \
+ q[(rb) + 40 + 1 + 128] = m - t; \
+ m = q[(rb) + 40 + 2]; \
+ n = q[(rb) + 40 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[40 + 2 * 1]); \
+ q[(rb) + 40 + 2] = m + t; \
+ q[(rb) + 40 + 2 + 128] = m - t; \
+ m = q[(rb) + 40 + 3]; \
+ n = q[(rb) + 40 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[40 + 3 * 1]); \
+ q[(rb) + 40 + 3] = m + t; \
+ q[(rb) + 40 + 3 + 128] = m - t; \
+ m = q[(rb) + 44 + 0]; \
+ n = q[(rb) + 44 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[44 + 0 * 1]); \
+ q[(rb) + 44 + 0] = m + t; \
+ q[(rb) + 44 + 0 + 128] = m - t; \
+ m = q[(rb) + 44 + 1]; \
+ n = q[(rb) + 44 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[44 + 1 * 1]); \
+ q[(rb) + 44 + 1] = m + t; \
+ q[(rb) + 44 + 1 + 128] = m - t; \
+ m = q[(rb) + 44 + 2]; \
+ n = q[(rb) + 44 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[44 + 2 * 1]); \
+ q[(rb) + 44 + 2] = m + t; \
+ q[(rb) + 44 + 2 + 128] = m - t; \
+ m = q[(rb) + 44 + 3]; \
+ n = q[(rb) + 44 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[44 + 3 * 1]); \
+ q[(rb) + 44 + 3] = m + t; \
+ q[(rb) + 44 + 3 + 128] = m - t; \
+ m = q[(rb) + 48 + 0]; \
+ n = q[(rb) + 48 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[48 + 0 * 1]); \
+ q[(rb) + 48 + 0] = m + t; \
+ q[(rb) + 48 + 0 + 128] = m - t; \
+ m = q[(rb) + 48 + 1]; \
+ n = q[(rb) + 48 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[48 + 1 * 1]); \
+ q[(rb) + 48 + 1] = m + t; \
+ q[(rb) + 48 + 1 + 128] = m - t; \
+ m = q[(rb) + 48 + 2]; \
+ n = q[(rb) + 48 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[48 + 2 * 1]); \
+ q[(rb) + 48 + 2] = m + t; \
+ q[(rb) + 48 + 2 + 128] = m - t; \
+ m = q[(rb) + 48 + 3]; \
+ n = q[(rb) + 48 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[48 + 3 * 1]); \
+ q[(rb) + 48 + 3] = m + t; \
+ q[(rb) + 48 + 3 + 128] = m - t; \
+ m = q[(rb) + 52 + 0]; \
+ n = q[(rb) + 52 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[52 + 0 * 1]); \
+ q[(rb) + 52 + 0] = m + t; \
+ q[(rb) + 52 + 0 + 128] = m - t; \
+ m = q[(rb) + 52 + 1]; \
+ n = q[(rb) + 52 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[52 + 1 * 1]); \
+ q[(rb) + 52 + 1] = m + t; \
+ q[(rb) + 52 + 1 + 128] = m - t; \
+ m = q[(rb) + 52 + 2]; \
+ n = q[(rb) + 52 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[52 + 2 * 1]); \
+ q[(rb) + 52 + 2] = m + t; \
+ q[(rb) + 52 + 2 + 128] = m - t; \
+ m = q[(rb) + 52 + 3]; \
+ n = q[(rb) + 52 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[52 + 3 * 1]); \
+ q[(rb) + 52 + 3] = m + t; \
+ q[(rb) + 52 + 3 + 128] = m - t; \
+ m = q[(rb) + 56 + 0]; \
+ n = q[(rb) + 56 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[56 + 0 * 1]); \
+ q[(rb) + 56 + 0] = m + t; \
+ q[(rb) + 56 + 0 + 128] = m - t; \
+ m = q[(rb) + 56 + 1]; \
+ n = q[(rb) + 56 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[56 + 1 * 1]); \
+ q[(rb) + 56 + 1] = m + t; \
+ q[(rb) + 56 + 1 + 128] = m - t; \
+ m = q[(rb) + 56 + 2]; \
+ n = q[(rb) + 56 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[56 + 2 * 1]); \
+ q[(rb) + 56 + 2] = m + t; \
+ q[(rb) + 56 + 2 + 128] = m - t; \
+ m = q[(rb) + 56 + 3]; \
+ n = q[(rb) + 56 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[56 + 3 * 1]); \
+ q[(rb) + 56 + 3] = m + t; \
+ q[(rb) + 56 + 3 + 128] = m - t; \
+ m = q[(rb) + 60 + 0]; \
+ n = q[(rb) + 60 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[60 + 0 * 1]); \
+ q[(rb) + 60 + 0] = m + t; \
+ q[(rb) + 60 + 0 + 128] = m - t; \
+ m = q[(rb) + 60 + 1]; \
+ n = q[(rb) + 60 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[60 + 1 * 1]); \
+ q[(rb) + 60 + 1] = m + t; \
+ q[(rb) + 60 + 1 + 128] = m - t; \
+ m = q[(rb) + 60 + 2]; \
+ n = q[(rb) + 60 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[60 + 2 * 1]); \
+ q[(rb) + 60 + 2] = m + t; \
+ q[(rb) + 60 + 2 + 128] = m - t; \
+ m = q[(rb) + 60 + 3]; \
+ n = q[(rb) + 60 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[60 + 3 * 1]); \
+ q[(rb) + 60 + 3] = m + t; \
+ q[(rb) + 60 + 3 + 128] = m - t; \
+ m = q[(rb) + 64 + 0]; \
+ n = q[(rb) + 64 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[64 + 0 * 1]); \
+ q[(rb) + 64 + 0] = m + t; \
+ q[(rb) + 64 + 0 + 128] = m - t; \
+ m = q[(rb) + 64 + 1]; \
+ n = q[(rb) + 64 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[64 + 1 * 1]); \
+ q[(rb) + 64 + 1] = m + t; \
+ q[(rb) + 64 + 1 + 128] = m - t; \
+ m = q[(rb) + 64 + 2]; \
+ n = q[(rb) + 64 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[64 + 2 * 1]); \
+ q[(rb) + 64 + 2] = m + t; \
+ q[(rb) + 64 + 2 + 128] = m - t; \
+ m = q[(rb) + 64 + 3]; \
+ n = q[(rb) + 64 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[64 + 3 * 1]); \
+ q[(rb) + 64 + 3] = m + t; \
+ q[(rb) + 64 + 3 + 128] = m - t; \
+ m = q[(rb) + 68 + 0]; \
+ n = q[(rb) + 68 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[68 + 0 * 1]); \
+ q[(rb) + 68 + 0] = m + t; \
+ q[(rb) + 68 + 0 + 128] = m - t; \
+ m = q[(rb) + 68 + 1]; \
+ n = q[(rb) + 68 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[68 + 1 * 1]); \
+ q[(rb) + 68 + 1] = m + t; \
+ q[(rb) + 68 + 1 + 128] = m - t; \
+ m = q[(rb) + 68 + 2]; \
+ n = q[(rb) + 68 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[68 + 2 * 1]); \
+ q[(rb) + 68 + 2] = m + t; \
+ q[(rb) + 68 + 2 + 128] = m - t; \
+ m = q[(rb) + 68 + 3]; \
+ n = q[(rb) + 68 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[68 + 3 * 1]); \
+ q[(rb) + 68 + 3] = m + t; \
+ q[(rb) + 68 + 3 + 128] = m - t; \
+ m = q[(rb) + 72 + 0]; \
+ n = q[(rb) + 72 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[72 + 0 * 1]); \
+ q[(rb) + 72 + 0] = m + t; \
+ q[(rb) + 72 + 0 + 128] = m - t; \
+ m = q[(rb) + 72 + 1]; \
+ n = q[(rb) + 72 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[72 + 1 * 1]); \
+ q[(rb) + 72 + 1] = m + t; \
+ q[(rb) + 72 + 1 + 128] = m - t; \
+ m = q[(rb) + 72 + 2]; \
+ n = q[(rb) + 72 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[72 + 2 * 1]); \
+ q[(rb) + 72 + 2] = m + t; \
+ q[(rb) + 72 + 2 + 128] = m - t; \
+ m = q[(rb) + 72 + 3]; \
+ n = q[(rb) + 72 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[72 + 3 * 1]); \
+ q[(rb) + 72 + 3] = m + t; \
+ q[(rb) + 72 + 3 + 128] = m - t; \
+ m = q[(rb) + 76 + 0]; \
+ n = q[(rb) + 76 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[76 + 0 * 1]); \
+ q[(rb) + 76 + 0] = m + t; \
+ q[(rb) + 76 + 0 + 128] = m - t; \
+ m = q[(rb) + 76 + 1]; \
+ n = q[(rb) + 76 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[76 + 1 * 1]); \
+ q[(rb) + 76 + 1] = m + t; \
+ q[(rb) + 76 + 1 + 128] = m - t; \
+ m = q[(rb) + 76 + 2]; \
+ n = q[(rb) + 76 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[76 + 2 * 1]); \
+ q[(rb) + 76 + 2] = m + t; \
+ q[(rb) + 76 + 2 + 128] = m - t; \
+ m = q[(rb) + 76 + 3]; \
+ n = q[(rb) + 76 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[76 + 3 * 1]); \
+ q[(rb) + 76 + 3] = m + t; \
+ q[(rb) + 76 + 3 + 128] = m - t; \
+ m = q[(rb) + 80 + 0]; \
+ n = q[(rb) + 80 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[80 + 0 * 1]); \
+ q[(rb) + 80 + 0] = m + t; \
+ q[(rb) + 80 + 0 + 128] = m - t; \
+ m = q[(rb) + 80 + 1]; \
+ n = q[(rb) + 80 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[80 + 1 * 1]); \
+ q[(rb) + 80 + 1] = m + t; \
+ q[(rb) + 80 + 1 + 128] = m - t; \
+ m = q[(rb) + 80 + 2]; \
+ n = q[(rb) + 80 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[80 + 2 * 1]); \
+ q[(rb) + 80 + 2] = m + t; \
+ q[(rb) + 80 + 2 + 128] = m - t; \
+ m = q[(rb) + 80 + 3]; \
+ n = q[(rb) + 80 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[80 + 3 * 1]); \
+ q[(rb) + 80 + 3] = m + t; \
+ q[(rb) + 80 + 3 + 128] = m - t; \
+ m = q[(rb) + 84 + 0]; \
+ n = q[(rb) + 84 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[84 + 0 * 1]); \
+ q[(rb) + 84 + 0] = m + t; \
+ q[(rb) + 84 + 0 + 128] = m - t; \
+ m = q[(rb) + 84 + 1]; \
+ n = q[(rb) + 84 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[84 + 1 * 1]); \
+ q[(rb) + 84 + 1] = m + t; \
+ q[(rb) + 84 + 1 + 128] = m - t; \
+ m = q[(rb) + 84 + 2]; \
+ n = q[(rb) + 84 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[84 + 2 * 1]); \
+ q[(rb) + 84 + 2] = m + t; \
+ q[(rb) + 84 + 2 + 128] = m - t; \
+ m = q[(rb) + 84 + 3]; \
+ n = q[(rb) + 84 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[84 + 3 * 1]); \
+ q[(rb) + 84 + 3] = m + t; \
+ q[(rb) + 84 + 3 + 128] = m - t; \
+ m = q[(rb) + 88 + 0]; \
+ n = q[(rb) + 88 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[88 + 0 * 1]); \
+ q[(rb) + 88 + 0] = m + t; \
+ q[(rb) + 88 + 0 + 128] = m - t; \
+ m = q[(rb) + 88 + 1]; \
+ n = q[(rb) + 88 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[88 + 1 * 1]); \
+ q[(rb) + 88 + 1] = m + t; \
+ q[(rb) + 88 + 1 + 128] = m - t; \
+ m = q[(rb) + 88 + 2]; \
+ n = q[(rb) + 88 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[88 + 2 * 1]); \
+ q[(rb) + 88 + 2] = m + t; \
+ q[(rb) + 88 + 2 + 128] = m - t; \
+ m = q[(rb) + 88 + 3]; \
+ n = q[(rb) + 88 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[88 + 3 * 1]); \
+ q[(rb) + 88 + 3] = m + t; \
+ q[(rb) + 88 + 3 + 128] = m - t; \
+ m = q[(rb) + 92 + 0]; \
+ n = q[(rb) + 92 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[92 + 0 * 1]); \
+ q[(rb) + 92 + 0] = m + t; \
+ q[(rb) + 92 + 0 + 128] = m - t; \
+ m = q[(rb) + 92 + 1]; \
+ n = q[(rb) + 92 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[92 + 1 * 1]); \
+ q[(rb) + 92 + 1] = m + t; \
+ q[(rb) + 92 + 1 + 128] = m - t; \
+ m = q[(rb) + 92 + 2]; \
+ n = q[(rb) + 92 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[92 + 2 * 1]); \
+ q[(rb) + 92 + 2] = m + t; \
+ q[(rb) + 92 + 2 + 128] = m - t; \
+ m = q[(rb) + 92 + 3]; \
+ n = q[(rb) + 92 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[92 + 3 * 1]); \
+ q[(rb) + 92 + 3] = m + t; \
+ q[(rb) + 92 + 3 + 128] = m - t; \
+ m = q[(rb) + 96 + 0]; \
+ n = q[(rb) + 96 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[96 + 0 * 1]); \
+ q[(rb) + 96 + 0] = m + t; \
+ q[(rb) + 96 + 0 + 128] = m - t; \
+ m = q[(rb) + 96 + 1]; \
+ n = q[(rb) + 96 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[96 + 1 * 1]); \
+ q[(rb) + 96 + 1] = m + t; \
+ q[(rb) + 96 + 1 + 128] = m - t; \
+ m = q[(rb) + 96 + 2]; \
+ n = q[(rb) + 96 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[96 + 2 * 1]); \
+ q[(rb) + 96 + 2] = m + t; \
+ q[(rb) + 96 + 2 + 128] = m - t; \
+ m = q[(rb) + 96 + 3]; \
+ n = q[(rb) + 96 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[96 + 3 * 1]); \
+ q[(rb) + 96 + 3] = m + t; \
+ q[(rb) + 96 + 3 + 128] = m - t; \
+ m = q[(rb) + 100 + 0]; \
+ n = q[(rb) + 100 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[100 + 0 * 1]); \
+ q[(rb) + 100 + 0] = m + t; \
+ q[(rb) + 100 + 0 + 128] = m - t; \
+ m = q[(rb) + 100 + 1]; \
+ n = q[(rb) + 100 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[100 + 1 * 1]); \
+ q[(rb) + 100 + 1] = m + t; \
+ q[(rb) + 100 + 1 + 128] = m - t; \
+ m = q[(rb) + 100 + 2]; \
+ n = q[(rb) + 100 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[100 + 2 * 1]); \
+ q[(rb) + 100 + 2] = m + t; \
+ q[(rb) + 100 + 2 + 128] = m - t; \
+ m = q[(rb) + 100 + 3]; \
+ n = q[(rb) + 100 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[100 + 3 * 1]); \
+ q[(rb) + 100 + 3] = m + t; \
+ q[(rb) + 100 + 3 + 128] = m - t; \
+ m = q[(rb) + 104 + 0]; \
+ n = q[(rb) + 104 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[104 + 0 * 1]); \
+ q[(rb) + 104 + 0] = m + t; \
+ q[(rb) + 104 + 0 + 128] = m - t; \
+ m = q[(rb) + 104 + 1]; \
+ n = q[(rb) + 104 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[104 + 1 * 1]); \
+ q[(rb) + 104 + 1] = m + t; \
+ q[(rb) + 104 + 1 + 128] = m - t; \
+ m = q[(rb) + 104 + 2]; \
+ n = q[(rb) + 104 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[104 + 2 * 1]); \
+ q[(rb) + 104 + 2] = m + t; \
+ q[(rb) + 104 + 2 + 128] = m - t; \
+ m = q[(rb) + 104 + 3]; \
+ n = q[(rb) + 104 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[104 + 3 * 1]); \
+ q[(rb) + 104 + 3] = m + t; \
+ q[(rb) + 104 + 3 + 128] = m - t; \
+ m = q[(rb) + 108 + 0]; \
+ n = q[(rb) + 108 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[108 + 0 * 1]); \
+ q[(rb) + 108 + 0] = m + t; \
+ q[(rb) + 108 + 0 + 128] = m - t; \
+ m = q[(rb) + 108 + 1]; \
+ n = q[(rb) + 108 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[108 + 1 * 1]); \
+ q[(rb) + 108 + 1] = m + t; \
+ q[(rb) + 108 + 1 + 128] = m - t; \
+ m = q[(rb) + 108 + 2]; \
+ n = q[(rb) + 108 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[108 + 2 * 1]); \
+ q[(rb) + 108 + 2] = m + t; \
+ q[(rb) + 108 + 2 + 128] = m - t; \
+ m = q[(rb) + 108 + 3]; \
+ n = q[(rb) + 108 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[108 + 3 * 1]); \
+ q[(rb) + 108 + 3] = m + t; \
+ q[(rb) + 108 + 3 + 128] = m - t; \
+ m = q[(rb) + 112 + 0]; \
+ n = q[(rb) + 112 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[112 + 0 * 1]); \
+ q[(rb) + 112 + 0] = m + t; \
+ q[(rb) + 112 + 0 + 128] = m - t; \
+ m = q[(rb) + 112 + 1]; \
+ n = q[(rb) + 112 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[112 + 1 * 1]); \
+ q[(rb) + 112 + 1] = m + t; \
+ q[(rb) + 112 + 1 + 128] = m - t; \
+ m = q[(rb) + 112 + 2]; \
+ n = q[(rb) + 112 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[112 + 2 * 1]); \
+ q[(rb) + 112 + 2] = m + t; \
+ q[(rb) + 112 + 2 + 128] = m - t; \
+ m = q[(rb) + 112 + 3]; \
+ n = q[(rb) + 112 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[112 + 3 * 1]); \
+ q[(rb) + 112 + 3] = m + t; \
+ q[(rb) + 112 + 3 + 128] = m - t; \
+ m = q[(rb) + 116 + 0]; \
+ n = q[(rb) + 116 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[116 + 0 * 1]); \
+ q[(rb) + 116 + 0] = m + t; \
+ q[(rb) + 116 + 0 + 128] = m - t; \
+ m = q[(rb) + 116 + 1]; \
+ n = q[(rb) + 116 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[116 + 1 * 1]); \
+ q[(rb) + 116 + 1] = m + t; \
+ q[(rb) + 116 + 1 + 128] = m - t; \
+ m = q[(rb) + 116 + 2]; \
+ n = q[(rb) + 116 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[116 + 2 * 1]); \
+ q[(rb) + 116 + 2] = m + t; \
+ q[(rb) + 116 + 2 + 128] = m - t; \
+ m = q[(rb) + 116 + 3]; \
+ n = q[(rb) + 116 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[116 + 3 * 1]); \
+ q[(rb) + 116 + 3] = m + t; \
+ q[(rb) + 116 + 3 + 128] = m - t; \
+ m = q[(rb) + 120 + 0]; \
+ n = q[(rb) + 120 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[120 + 0 * 1]); \
+ q[(rb) + 120 + 0] = m + t; \
+ q[(rb) + 120 + 0 + 128] = m - t; \
+ m = q[(rb) + 120 + 1]; \
+ n = q[(rb) + 120 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[120 + 1 * 1]); \
+ q[(rb) + 120 + 1] = m + t; \
+ q[(rb) + 120 + 1 + 128] = m - t; \
+ m = q[(rb) + 120 + 2]; \
+ n = q[(rb) + 120 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[120 + 2 * 1]); \
+ q[(rb) + 120 + 2] = m + t; \
+ q[(rb) + 120 + 2 + 128] = m - t; \
+ m = q[(rb) + 120 + 3]; \
+ n = q[(rb) + 120 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[120 + 3 * 1]); \
+ q[(rb) + 120 + 3] = m + t; \
+ q[(rb) + 120 + 3 + 128] = m - t; \
+ m = q[(rb) + 124 + 0]; \
+ n = q[(rb) + 124 + 0 + 128]; \
+ t = REDS2(n * alpha_tab[124 + 0 * 1]); \
+ q[(rb) + 124 + 0] = m + t; \
+ q[(rb) + 124 + 0 + 128] = m - t; \
+ m = q[(rb) + 124 + 1]; \
+ n = q[(rb) + 124 + 1 + 128]; \
+ t = REDS2(n * alpha_tab[124 + 1 * 1]); \
+ q[(rb) + 124 + 1] = m + t; \
+ q[(rb) + 124 + 1 + 128] = m - t; \
+ m = q[(rb) + 124 + 2]; \
+ n = q[(rb) + 124 + 2 + 128]; \
+ t = REDS2(n * alpha_tab[124 + 2 * 1]); \
+ q[(rb) + 124 + 2] = m + t; \
+ q[(rb) + 124 + 2 + 128] = m - t; \
+ m = q[(rb) + 124 + 3]; \
+ n = q[(rb) + 124 + 3 + 128]; \
+ t = REDS2(n * alpha_tab[124 + 3 * 1]); \
+ q[(rb) + 124 + 3] = m + t; \
+ q[(rb) + 124 + 3 + 128] = m - t; \
+ } while (0)
+
+/*
+ * Output ranges:
+ * d0: min= 0 max= 1020
+ * d1: min= -67 max= 4587
+ * d2: min=-4335 max= 4335
+ * d3: min=-4147 max= 507
+ * d4: min= -510 max= 510
+ * d5: min= -252 max= 4402
+ * d6: min=-4335 max= 4335
+ * d7: min=-4332 max= 322
+ */
+#define FFT8(xb, xs, d) do { \
+ s32 x0 = x[(xb)]; \
+ s32 x1 = x[(xb) + (xs)]; \
+ s32 x2 = x[(xb) + 2 * (xs)]; \
+ s32 x3 = x[(xb) + 3 * (xs)]; \
+ s32 a0 = x0 + x2; \
+ s32 a1 = x0 + (x2 << 4); \
+ s32 a2 = x0 - x2; \
+ s32 a3 = x0 - (x2 << 4); \
+ s32 b0 = x1 + x3; \
+ s32 b1 = REDS1((x1 << 2) + (x3 << 6)); \
+ s32 b2 = (x1 << 4) - (x3 << 4); \
+ s32 b3 = REDS1((x1 << 6) + (x3 << 2)); \
+ d ## 0 = a0 + b0; \
+ d ## 1 = a1 + b1; \
+ d ## 2 = a2 + b2; \
+ d ## 3 = a3 + b3; \
+ d ## 4 = a0 - b0; \
+ d ## 5 = a1 - b1; \
+ d ## 6 = a2 - b2; \
+ d ## 7 = a3 - b3; \
+ } while (0)
+
+/*
+ * When k=16, we have alpha=2. Multiplication by alpha^i is then reduced
+ * to some shifting.
+ *
+ * Output: within -591471..591723
+ */
+#define FFT16(xb, xs, rb) do { \
+ s32 d1_0, d1_1, d1_2, d1_3, d1_4, d1_5, d1_6, d1_7; \
+ s32 d2_0, d2_1, d2_2, d2_3, d2_4, d2_5, d2_6, d2_7; \
+ FFT8(xb, (xs) << 1, d1_); \
+ FFT8((xb) + (xs), (xs) << 1, d2_); \
+ q[(rb) + 0] = d1_0 + d2_0; \
+ q[(rb) + 1] = d1_1 + (d2_1 << 1); \
+ q[(rb) + 2] = d1_2 + (d2_2 << 2); \
+ q[(rb) + 3] = d1_3 + (d2_3 << 3); \
+ q[(rb) + 4] = d1_4 + (d2_4 << 4); \
+ q[(rb) + 5] = d1_5 + (d2_5 << 5); \
+ q[(rb) + 6] = d1_6 + (d2_6 << 6); \
+ q[(rb) + 7] = d1_7 + (d2_7 << 7); \
+ q[(rb) + 8] = d1_0 - d2_0; \
+ q[(rb) + 9] = d1_1 - (d2_1 << 1); \
+ q[(rb) + 10] = d1_2 - (d2_2 << 2); \
+ q[(rb) + 11] = d1_3 - (d2_3 << 3); \
+ q[(rb) + 12] = d1_4 - (d2_4 << 4); \
+ q[(rb) + 13] = d1_5 - (d2_5 << 5); \
+ q[(rb) + 14] = d1_6 - (d2_6 << 6); \
+ q[(rb) + 15] = d1_7 - (d2_7 << 7); \
+ } while (0)
+
+/*
+ * Output range: |q| <= 1183446
+ */
+#define FFT32(xb, xs, rb, id) do { \
+ FFT16(xb, (xs) << 1, rb); \
+ FFT16((xb) + (xs), (xs) << 1, (rb) + 16); \
+ FFT_LOOP_16_8(rb); \
+ } while (0)
+
+/*
+ * Output range: |q| <= 2366892
+ */
+#define FFT64(xb, xs, rb) do { \
+ FFT32(xb, (xs) << 1, (rb), label_a); \
+ FFT32((xb) + (xs), (xs) << 1, (rb) + 32, label_b); \
+ FFT_LOOP_32_4(rb); \
+ } while (0)
+
+/*
+ * Output range: |q| <= 9467568
+ */
+#define FFT256(xb, xs, rb, id) do { \
+ FFT64((xb) + ((xs) * 0), (xs) << 2, (rb + 0)); \
+ FFT64((xb) + ((xs) * 2), (xs) << 2, (rb + 64)); \
+ FFT_LOOP_64_2(rb); \
+ FFT64((xb) + ((xs) * 1), (xs) << 2, (rb + 128)); \
+ FFT64((xb) + ((xs) * 3), (xs) << 2, (rb + 192)); \
+ FFT_LOOP_64_2((rb) + 128); \
+ FFT_LOOP_128_1(rb); \
+ } while (0)
+
+/*
+ * beta^(255*i) mod 257
+ */
+__constant__ static const unsigned short yoff_b_n[] = {
+ 1, 163, 98, 40, 95, 65, 58, 202, 30, 7, 113, 172,
+ 23, 151, 198, 149, 129, 210, 49, 20, 176, 161, 29, 101,
+ 15, 132, 185, 86, 140, 204, 99, 203, 193, 105, 153, 10,
+ 88, 209, 143, 179, 136, 66, 221, 43, 70, 102, 178, 230,
+ 225, 181, 205, 5, 44, 233, 200, 218, 68, 33, 239, 150,
+ 35, 51, 89, 115, 241, 219, 231, 131, 22, 245, 100, 109,
+ 34, 145, 248, 75, 146, 154, 173, 186, 249, 238, 244, 194,
+ 11, 251, 50, 183, 17, 201, 124, 166, 73, 77, 215, 93,
+ 253, 119, 122, 97, 134, 254, 25, 220, 137, 229, 62, 83,
+ 165, 167, 236, 175, 255, 188, 61, 177, 67, 127, 141, 110,
+ 197, 243, 31, 170, 211, 212, 118, 216, 256, 94, 159, 217,
+ 162, 192, 199, 55, 227, 250, 144, 85, 234, 106, 59, 108,
+ 128, 47, 208, 237, 81, 96, 228, 156, 242, 125, 72, 171,
+ 117, 53, 158, 54, 64, 152, 104, 247, 169, 48, 114, 78,
+ 121, 191, 36, 214, 187, 155, 79, 27, 32, 76, 52, 252,
+ 213, 24, 57, 39, 189, 224, 18, 107, 222, 206, 168, 142,
+ 16, 38, 26, 126, 235, 12, 157, 148, 223, 112, 9, 182,
+ 111, 103, 84, 71, 8, 19, 13, 63, 246, 6, 207, 74,
+ 240, 56, 133, 91, 184, 180, 42, 164, 4, 138, 135, 160,
+ 123, 3, 232, 37, 120, 28, 195, 174, 92, 90, 21, 82,
+ 2, 69, 196, 80, 190, 130, 116, 147, 60, 14, 226, 87,
+ 46, 45, 139, 41
+};
+
+#define INNER(l, h, mm) (((u32)((l) * (mm)) & 0xFFFFU) \
+ + ((u32)((h) * (mm)) << 16))
+
+#define W_BIG(sb, o1, o2, mm) \
+ (INNER(q[16 * (sb) + 2 * 0 + o1], q[16 * (sb) + 2 * 0 + o2], mm), \
+ INNER(q[16 * (sb) + 2 * 1 + o1], q[16 * (sb) + 2 * 1 + o2], mm), \
+ INNER(q[16 * (sb) + 2 * 2 + o1], q[16 * (sb) + 2 * 2 + o2], mm), \
+ INNER(q[16 * (sb) + 2 * 3 + o1], q[16 * (sb) + 2 * 3 + o2], mm), \
+ INNER(q[16 * (sb) + 2 * 4 + o1], q[16 * (sb) + 2 * 4 + o2], mm), \
+ INNER(q[16 * (sb) + 2 * 5 + o1], q[16 * (sb) + 2 * 5 + o2], mm), \
+ INNER(q[16 * (sb) + 2 * 6 + o1], q[16 * (sb) + 2 * 6 + o2], mm), \
+ INNER(q[16 * (sb) + 2 * 7 + o1], q[16 * (sb) + 2 * 7 + o2], mm)
+
+#define WB_0_0 W_BIG( 4, 0, 1, 185)
+#define WB_0_1 W_BIG( 6, 0, 1, 185)
+#define WB_0_2 W_BIG( 0, 0, 1, 185)
+#define WB_0_3 W_BIG( 2, 0, 1, 185)
+#define WB_0_4 W_BIG( 7, 0, 1, 185)
+#define WB_0_5 W_BIG( 5, 0, 1, 185)
+#define WB_0_6 W_BIG( 3, 0, 1, 185)
+#define WB_0_7 W_BIG( 1, 0, 1, 185)
+#define WB_1_0 W_BIG(15, 0, 1, 185)
+#define WB_1_1 W_BIG(11, 0, 1, 185)
+#define WB_1_2 W_BIG(12, 0, 1, 185)
+#define WB_1_3 W_BIG( 8, 0, 1, 185)
+#define WB_1_4 W_BIG( 9, 0, 1, 185)
+#define WB_1_5 W_BIG(13, 0, 1, 185)
+#define WB_1_6 W_BIG(10, 0, 1, 185)
+#define WB_1_7 W_BIG(14, 0, 1, 185)
+#define WB_2_0 W_BIG(17, -256, -128, 233)
+#define WB_2_1 W_BIG(18, -256, -128, 233)
+#define WB_2_2 W_BIG(23, -256, -128, 233)
+#define WB_2_3 W_BIG(20, -256, -128, 233)
+#define WB_2_4 W_BIG(22, -256, -128, 233)
+#define WB_2_5 W_BIG(21, -256, -128, 233)
+#define WB_2_6 W_BIG(16, -256, -128, 233)
+#define WB_2_7 W_BIG(19, -256, -128, 233)
+#define WB_3_0 W_BIG(30, -383, -255, 233)
+#define WB_3_1 W_BIG(24, -383, -255, 233)
+#define WB_3_2 W_BIG(25, -383, -255, 233)
+#define WB_3_3 W_BIG(31, -383, -255, 233)
+#define WB_3_4 W_BIG(27, -383, -255, 233)
+#define WB_3_5 W_BIG(29, -383, -255, 233)
+#define WB_3_6 W_BIG(28, -383, -255, 233)
+#define WB_3_7 W_BIG(26, -383, -255, 233)
+
+#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z))
+#define MAJ(x, y, z) (((x) & (y)) | (((x) | (y)) & (z)))
+
+#define PP4_0_0 1
+#define PP4_0_1 0
+#define PP4_0_2 3
+#define PP4_0_3 2
+#define PP4_1_0 2
+#define PP4_1_1 3
+#define PP4_1_2 0
+#define PP4_1_3 1
+#define PP4_2_0 3
+#define PP4_2_1 2
+#define PP4_2_2 1
+#define PP4_2_3 0
+
+#define PP8_0_0 1
+#define PP8_0_1 0
+#define PP8_0_2 3
+#define PP8_0_3 2
+#define PP8_0_4 5
+#define PP8_0_5 4
+#define PP8_0_6 7
+#define PP8_0_7 6
+
+#define PP8_1_0 6
+#define PP8_1_1 7
+#define PP8_1_2 4
+#define PP8_1_3 5
+#define PP8_1_4 2
+#define PP8_1_5 3
+#define PP8_1_6 0
+#define PP8_1_7 1
+
+#define PP8_2_0 2
+#define PP8_2_1 3
+#define PP8_2_2 0
+#define PP8_2_3 1
+#define PP8_2_4 6
+#define PP8_2_5 7
+#define PP8_2_6 4
+#define PP8_2_7 5
+
+#define PP8_3_0 3
+#define PP8_3_1 2
+#define PP8_3_2 1
+#define PP8_3_3 0
+#define PP8_3_4 7
+#define PP8_3_5 6
+#define PP8_3_6 5
+#define PP8_3_7 4
+
+#define PP8_4_0 5
+#define PP8_4_1 4
+#define PP8_4_2 7
+#define PP8_4_3 6
+#define PP8_4_4 1
+#define PP8_4_5 0
+#define PP8_4_6 3
+#define PP8_4_7 2
+
+#define PP8_5_0 7
+#define PP8_5_1 6
+#define PP8_5_2 5
+#define PP8_5_3 4
+#define PP8_5_4 3
+#define PP8_5_5 2
+#define PP8_5_6 1
+#define PP8_5_7 0
+
+#define PP8_6_0 4
+#define PP8_6_1 5
+#define PP8_6_2 6
+#define PP8_6_3 7
+#define PP8_6_4 0
+#define PP8_6_5 1
+#define PP8_6_6 2
+#define PP8_6_7 3
+
+#define STEP_ELT(n, w, fun, s, ppb) do { \
+ u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \
+ A ## n = T32(ROL32(tt, s) + XCAT(tA, XCAT(ppb, n))); \
+ D ## n = C ## n; \
+ C ## n = B ## n; \
+ B ## n = tA ## n; \
+ } while (0)
+
+#define STEP_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b) do { \
+ u32 tA0 = ROL32(A0, r); \
+ u32 tA1 = ROL32(A1, r); \
+ u32 tA2 = ROL32(A2, r); \
+ u32 tA3 = ROL32(A3, r); \
+ u32 tA4 = ROL32(A4, r); \
+ u32 tA5 = ROL32(A5, r); \
+ u32 tA6 = ROL32(A6, r); \
+ u32 tA7 = ROL32(A7, r); \
+ STEP_ELT(0, w0, fun, s, pp8b); \
+ STEP_ELT(1, w1, fun, s, pp8b); \
+ STEP_ELT(2, w2, fun, s, pp8b); \
+ STEP_ELT(3, w3, fun, s, pp8b); \
+ STEP_ELT(4, w4, fun, s, pp8b); \
+ STEP_ELT(5, w5, fun, s, pp8b); \
+ STEP_ELT(6, w6, fun, s, pp8b); \
+ STEP_ELT(7, w7, fun, s, pp8b); \
+ } while (0)
+
+#define SIMD_M3_0_0 0_
+#define SIMD_M3_1_0 1_
+#define SIMD_M3_2_0 2_
+#define SIMD_M3_3_0 0_
+#define SIMD_M3_4_0 1_
+#define SIMD_M3_5_0 2_
+#define SIMD_M3_6_0 0_
+#define SIMD_M3_7_0 1_
+
+#define SIMD_M3_0_1 1_
+#define SIMD_M3_1_1 2_
+#define SIMD_M3_2_1 0_
+#define SIMD_M3_3_1 1_
+#define SIMD_M3_4_1 2_
+#define SIMD_M3_5_1 0_
+#define SIMD_M3_6_1 1_
+#define SIMD_M3_7_1 2_
+
+#define SIMD_M3_0_2 2_
+#define SIMD_M3_1_2 0_
+#define SIMD_M3_2_2 1_
+#define SIMD_M3_3_2 2_
+#define SIMD_M3_4_2 0_
+#define SIMD_M3_5_2 1_
+#define SIMD_M3_6_2 2_
+#define SIMD_M3_7_2 0_
+
+#define M7_0_0 0_
+#define M7_1_0 1_
+#define M7_2_0 2_
+#define M7_3_0 3_
+#define M7_4_0 4_
+#define M7_5_0 5_
+#define M7_6_0 6_
+#define M7_7_0 0_
+
+#define M7_0_1 1_
+#define M7_1_1 2_
+#define M7_2_1 3_
+#define M7_3_1 4_
+#define M7_4_1 5_
+#define M7_5_1 6_
+#define M7_6_1 0_
+#define M7_7_1 1_
+
+#define M7_0_2 2_
+#define M7_1_2 3_
+#define M7_2_2 4_
+#define M7_3_2 5_
+#define M7_4_2 6_
+#define M7_5_2 0_
+#define M7_6_2 1_
+#define M7_7_2 2_
+
+#define M7_0_3 3_
+#define M7_1_3 4_
+#define M7_2_3 5_
+#define M7_3_3 6_
+#define M7_4_3 0_
+#define M7_5_3 1_
+#define M7_6_3 2_
+#define M7_7_3 3_
+
+#define STEP_BIG_(w, fun, r, s, pp8b) STEP_BIG w, fun, r, s, pp8b)
+
+#define ONE_ROUND_BIG(ri, isp, p0, p1, p2, p3) do { \
+ STEP_BIG_(WB_ ## ri ## 0, \
+ IF, p0, p1, XCAT(PP8_, M7_0_ ## isp)); \
+ STEP_BIG_(WB_ ## ri ## 1, \
+ IF, p1, p2, XCAT(PP8_, M7_1_ ## isp)); \
+ STEP_BIG_(WB_ ## ri ## 2, \
+ IF, p2, p3, XCAT(PP8_, M7_2_ ## isp)); \
+ STEP_BIG_(WB_ ## ri ## 3, \
+ IF, p3, p0, XCAT(PP8_, M7_3_ ## isp)); \
+ STEP_BIG_(WB_ ## ri ## 4, \
+ MAJ, p0, p1, XCAT(PP8_, M7_4_ ## isp)); \
+ STEP_BIG_(WB_ ## ri ## 5, \
+ MAJ, p1, p2, XCAT(PP8_, M7_5_ ## isp)); \
+ STEP_BIG_(WB_ ## ri ## 6, \
+ MAJ, p2, p3, XCAT(PP8_, M7_6_ ## isp)); \
+ STEP_BIG_(WB_ ## ri ## 7, \
+ MAJ, p3, p0, XCAT(PP8_, M7_7_ ## isp)); \
+ } while (0)
+
+//__constant__ static const s32 SIMD_Q_64[] = {
+// 4, 28, -80, -120, -47, -126, 45, -123, -92, -127, -70, 23, -23, -24, 40, -125, 101, 122, 34, -24, -119, 110, -121, -112, 32, 24, 51, 73, -117, -64, -21, 42, -60, 16, 5, 85, 107, 52, -44, -96, 42, 127, -18, -108, -47, 26, 91, 117, 112, 46, 87, 79, 126, -120, 65, -24, 121, 29, 118, -7, -53, 85, -98, -117, 32, 115, -47, -116, 63, 16, -108, 49, -119, 57, -110, 4, -76, -76, -42, -86, 58, 115, 4, 4, -83, -51, -37, 116, 32, 15, 36, -42, 73, -99, 94, 87, 60, -20, 67, 12, -76, 55, 117, -68, -82, -80, 93, -20, 92, -21, -128, -91, -11, 84, -28, 76, 94, -124, 37, 93, 17, -78, -106, -29, 88, -15, -47, 102, -4, -28, 80, 120, 47, 126, -45, 123, 92, 127, 70, -23, 23, 24, -40, 125, -101, -122, -34, 24, 119, -110, 121, 112, -32, -24, -51, -73, 117, 64, 21, -42, 60, -16, -5, -85, -107, -52, 44, 96, -42, -127, 18, 108, 47, -26, -91, -117, -112, -46, -87, -79, -126, 120, -65, 24, -121, -29, -118, 7, 53, -85, 98, 117, -32, -115, 47, 116, -63, -16, 108, -49, 119, -57, 110, -4, 76, 76, 42, 86, -58, -115, -4, -4, 83, 51, 37, -116, -32, -15, -36, 42, -73, 99, -94, -87, -60, 20, -67, -12, 76, -55, -117, 68, 82, 80, -93, 20, -92, 21, 128, 91, 11, -84, 28, -76, -94, 124, -37, -93, -17, 78, 106, 29, -88, 15, 47, -102
+//};
+__constant__ static const s32 SIMD_Q_80[] = {
+ -125, -101, 48, 8, 81, 2, -84, 5, 36, 1, 58, -106, 105, 104, -89, 3, -28, -7, -95, 104, 9, -19, 7, 16, -97, -105, -78, -56, 11, 64, 107, -87, 68, -113, -124, -44, -22, -77, 84, 32, -87, -2, 110, 20, 81, -103, -38, -12, -17, -83, -42, -50, -3, 8, -64, 104, -8, -100, -11, 121, 75, -44, 30, 11, -97, -14, 81, 12, -66, -113, 20, -80, 9, -72, 18, -125, 52, 52, 86, 42, -71, -14, -125, -125, 45, 77, 91, -13, -97, -114, -93, 86, -56, 29, -35, -42, -69, 108, -62, -117, 52, -74, -12, 60, 46, 48, -36, 108, -37, 107, 0, 37, 117, -45, 100, -53, -35, 4, -92, -36, -112, 50, 22, 99, -41, 113, 81, -27, 124, 100, -49, -9, -82, -3, 83, -6, -37, -2, -59, 105, -106, -105, 88, -4, 27, 6, 94, -105, -10, 18, -8, -17, 96, 104, 77, 55, -12, -65, -108, 86, -69, 112, 123, 43, 21, 76, -85, -33, 86, 1, -111, -21, -82, 102, 37, 11, 16, 82, 41, 49, 2, -9, 63, -105, 7, 99, 10, -122, -76, 43, -31, -12, 96, 13, -82, -13, 65, 112, -21, 79, -10, 71, -19, 124, -53, -53, -87, -43, 70, 13, 124, 124, -46, -78, -92, 12, 96, 113, 92, -87, 55, -30, 34, 41, 68, -109, 61, 116, -53, 73, 11, -61, -47, -49, 35, -109, 36, -108, -1, -38, -118, 44, -101, 52, 34, -5, 91, 35, 111, -51, -23, -100, 40, -114, -82, 26
+};
+
+__constant__ static uint32_t c_PaddedMessage80[20];
+
+__host__
+void x16_simd512_setBlock_80(void *pdata)
+{
+ cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
+}
+
+#define TPB_SIMD 128
+__global__
+__launch_bounds__(TPB_SIMD,1)
+static void x16_simd512_gpu_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_outputhash)
+{
+ const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ if (thread < threads)
+ {
+ uint32_t A[20];
+ #pragma unroll 10
+ for (int i=0; i < 20; i += 2)
+ AS_UINT2(&A[i]) = AS_UINT2(&c_PaddedMessage80[i]);
+ A[19] = cuda_swab32(startNonce + thread);
+
+ // simd
+ unsigned char x[128];
+ #pragma unroll
+ for (int i = 0; i < 20; i += 2)
+ AS_UINT2(&x[i*4]) = AS_UINT2(&A[i]);
+ #pragma unroll
+ for(int i = 80; i < 128; i+=4) AS_U32(&x[i]) = 0;
+
+ // SIMD_IV512
+ u32 A0 = 0x0BA16B95, A1 = 0x72F999AD, A2 = 0x9FECC2AE, A3 = 0xBA3264FC, A4 = 0x5E894929, A5 = 0x8E9F30E5, A6 = 0x2F1DAA37, A7 = 0xF0F2C558;
+ u32 B0 = 0xAC506643, B1 = 0xA90635A5, B2 = 0xE25B878B, B3 = 0xAAB7878F, B4 = 0x88817F7A, B5 = 0x0A02892B, B6 = 0x559A7550, B7 = 0x598F657E;
+ u32 C0 = 0x7EEF60A1, C1 = 0x6B70E3E8, C2 = 0x9C1714D1, C3 = 0xB958E2A8, C4 = 0xAB02675E, C5 = 0xED1C014F, C6 = 0xCD8D65BB, C7 = 0xFDB7A257;
+ u32 D0 = 0x09254899, D1 = 0xD699C7BC, D2 = 0x9019B6DC, D3 = 0x2B9022E4, D4 = 0x8FA14956, D5 = 0x21BF9BD3, D6 = 0xB94D0943, D7 = 0x6FFDDC22;
+
+ s32 q[256];
+ FFT256(0, 1, 0, ll1);
+
+ #pragma unroll
+ for (int i = 0; i < 256; i ++) {
+ s32 tq = q[i] + yoff_b_n[i];
+ tq = REDS2(tq);
+ tq = REDS1(tq);
+ tq = REDS1(tq);
+ q[i] = (tq <= 128 ? tq : tq - 257);
+ }
+
+ A0 ^= A[ 0];
+ A1 ^= A[ 1];
+ A2 ^= A[ 2];
+ A3 ^= A[ 3];
+ A4 ^= A[ 4];
+ A5 ^= A[ 5];
+ A6 ^= A[ 6];
+ A7 ^= A[ 7];
+ B0 ^= A[ 8];
+ B1 ^= A[ 9];
+ B2 ^= A[10];
+ B3 ^= A[11];
+ B4 ^= A[12];
+ B5 ^= A[13];
+ B6 ^= A[14];
+ B7 ^= A[15];
+ C0 ^= A[16];
+ C1 ^= A[17];
+ C2 ^= A[18];
+ C3 ^= A[19];
+
+ ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27);
+ ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7);
+ ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5);
+ ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25);
+
+ STEP_BIG(
+ C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC),
+ C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558),
+ IF, 4, 13, PP8_4_);
+
+ STEP_BIG(
+ C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F),
+ C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E),
+ IF, 13, 10, PP8_5_);
+
+ STEP_BIG(
+ C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8),
+ C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257),
+ IF, 10, 25, PP8_6_);
+
+ STEP_BIG(
+ C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4),
+ C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22),
+ IF, 25, 4, PP8_0_);
+
+ // Second round
+
+ u32 COPY_A0 = A0, COPY_A1 = A1, COPY_A2 = A2, COPY_A3 = A3, COPY_A4 = A4, COPY_A5 = A5, COPY_A6 = A6, COPY_A7 = A7;
+ u32 COPY_B0 = B0, COPY_B1 = B1, COPY_B2 = B2, COPY_B3 = B3, COPY_B4 = B4, COPY_B5 = B5, COPY_B6 = B6, COPY_B7 = B7;
+ u32 COPY_C0 = C0, COPY_C1 = C1, COPY_C2 = C2, COPY_C3 = C3, COPY_C4 = C4, COPY_C5 = C5, COPY_C6 = C6, COPY_C7 = C7;
+ u32 COPY_D0 = D0, COPY_D1 = D1, COPY_D2 = D2, COPY_D3 = D3, COPY_D4 = D4, COPY_D5 = D5, COPY_D6 = D6, COPY_D7 = D7;
+
+ #define q SIMD_Q_80
+
+ A0 ^= 0x280; // bitlen
+
+ ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27);
+ ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7);
+ ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5);
+ ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25);
+
+ STEP_BIG(
+ COPY_A0, COPY_A1, COPY_A2, COPY_A3,
+ COPY_A4, COPY_A5, COPY_A6, COPY_A7,
+ IF, 4, 13, PP8_4_);
+
+ STEP_BIG(
+ COPY_B0, COPY_B1, COPY_B2, COPY_B3,
+ COPY_B4, COPY_B5, COPY_B6, COPY_B7,
+ IF, 13, 10, PP8_5_);
+
+ STEP_BIG(
+ COPY_C0, COPY_C1, COPY_C2, COPY_C3,
+ COPY_C4, COPY_C5, COPY_C6, COPY_C7,
+ IF, 10, 25, PP8_6_);
+
+ STEP_BIG(
+ COPY_D0, COPY_D1, COPY_D2, COPY_D3,
+ COPY_D4, COPY_D5, COPY_D6, COPY_D7,
+ IF, 25, 4, PP8_0_);
+
+ #undef q
+
+ A[ 0] = A0;
+ A[ 1] = A1;
+ A[ 2] = A2;
+ A[ 3] = A3;
+ A[ 4] = A4;
+ A[ 5] = A5;
+ A[ 6] = A6;
+ A[ 7] = A7;
+ A[ 8] = B0;
+ A[ 9] = B1;
+ A[10] = B2;
+ A[11] = B3;
+ A[12] = B4;
+ A[13] = B5;
+ A[14] = B6;
+ A[15] = B7;
+
+ const uint64_t hashPosition = thread;
+ uint32_t *Hash = (uint32_t*)(&g_outputhash[(size_t)8 * hashPosition]);
+ #pragma unroll
+ for (int i=0; i < 16; i += 2)
+ *(uint2*)&Hash[i] = *(uint2*)&A[i];
+ }
+}
+
+/***************************************************/
+
+__host__
+void x16_simd512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash)
+{
+ const uint32_t tpb = 128;
+ const dim3 grid((threads + tpb - 1) / tpb);
+ const dim3 block(tpb);
+	x16_simd512_gpu_80 <<<grid, block>>> (threads, startNonce, (uint64_t*) d_hash);
+}
diff --git a/x16r/cuda_x16r.h b/x16r/cuda_x16r.h
new file mode 100644
index 0000000..976793c
--- /dev/null
+++ b/x16r/cuda_x16r.h
@@ -0,0 +1,75 @@
+#include "x11/cuda_x11.h"
+
+extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads);
+extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
+extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x13_fugue512_cpu_free(int thr_id);
+
+extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads);
+extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int flag);
+extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x15_whirlpool_cpu_free(int thr_id);
+
+extern void x17_sha512_cpu_init(int thr_id, uint32_t threads);
+extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash);
+
+extern void x17_haval256_cpu_init(int thr_id, uint32_t threads);
+extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, const int outlen);
+
+void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order);
+
+// ---- 80 bytes kernels
+
+void quark_bmw512_cpu_setBlock_80(void *pdata);
+void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order);
+
+void groestl512_setBlock_80(int thr_id, uint32_t *endiandata);
+void groestl512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
+
+void skein512_cpu_setBlock_80(void *pdata);
+void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int swap);
+
+void qubit_luffa512_cpu_init(int thr_id, uint32_t threads);
+void qubit_luffa512_cpu_setBlock_80(void *pdata);
+void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order);
+
+void jh512_setBlock_80(int thr_id, uint32_t *endiandata);
+void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
+
+void keccak512_setBlock_80(int thr_id, uint32_t *endiandata);
+void keccak512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
+
+void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata);
+void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
+
+void x11_shavite512_setBlock_80(void *pdata);
+void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order);
+
+void x16_shabal512_setBlock_80(void *pdata);
+void x16_shabal512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
+
+void x16_simd512_setBlock_80(void *pdata);
+void x16_simd512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
+
+void x16_echo512_cuda_init(int thr_id, const uint32_t threads);
+void x16_echo512_setBlock_80(void *pdata);
+void x16_echo512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
+
+void x16_hamsi512_setBlock_80(void *pdata);
+void x16_hamsi512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
+
+void x16_fugue512_cpu_init(int thr_id, uint32_t threads);
+void x16_fugue512_cpu_free(int thr_id);
+void x16_fugue512_setBlock_80(void *pdata);
+void x16_fugue512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
+
+void x16_whirlpool512_init(int thr_id, uint32_t threads);
+void x16_whirlpool512_setBlock_80(void* endiandata);
+void x16_whirlpool512_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
+
+void x16_sha512_setBlock_80(void *pdata);
+void x16_sha512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
diff --git a/x16r/x16r.cu b/x16r/x16r.cu
new file mode 100644
index 0000000..65c2155
--- /dev/null
+++ b/x16r/x16r.cu
@@ -0,0 +1,625 @@
+/**
+ * X16R algorithm (X16 with Randomized chain order)
+ *
+ * tpruvot 2018 - GPL code
+ */
+
+#include <stdio.h>
+#include <memory.h>
+#include <unistd.h>
+
+extern "C" {
+#include "sph/sph_blake.h"
+#include "sph/sph_bmw.h"
+#include "sph/sph_groestl.h"
+#include "sph/sph_skein.h"
+#include "sph/sph_jh.h"
+#include "sph/sph_keccak.h"
+
+#include "sph/sph_luffa.h"
+#include "sph/sph_cubehash.h"
+#include "sph/sph_shavite.h"
+#include "sph/sph_simd.h"
+#include "sph/sph_echo.h"
+
+#include "sph/sph_hamsi.h"
+#include "sph/sph_fugue.h"
+#include "sph/sph_shabal.h"
+#include "sph/sph_whirlpool.h"
+#include "sph/sph_sha2.h"
+}
+
+#include "miner.h"
+#include "cuda_helper.h"
+#include "cuda_x16r.h"
+
+static uint32_t *d_hash[MAX_GPUS];
+
+enum Algo {
+ BLAKE = 0,
+ BMW,
+ GROESTL,
+ JH,
+ KECCAK,
+ SKEIN,
+ LUFFA,
+ CUBEHASH,
+ SHAVITE,
+ SIMD,
+ ECHO,
+ HAMSI,
+ FUGUE,
+ SHABAL,
+ WHIRLPOOL,
+ SHA512,
+ HASH_FUNC_COUNT
+};
+
+static const char* algo_strings[] = {
+ "blake",
+ "bmw512",
+ "groestl",
+ "jh512",
+ "keccak",
+ "skein",
+ "luffa",
+ "cube",
+ "shavite",
+ "simd",
+ "echo",
+ "hamsi",
+ "fugue",
+ "shabal",
+ "whirlpool",
+ "sha512",
+ NULL
+};
+
+static __thread uint32_t s_ntime = UINT32_MAX;
+static __thread bool s_implemented = false;
+static __thread char hashOrder[HASH_FUNC_COUNT + 1] = { 0 };
+
+static void getAlgoString(const uint32_t* prevblock, char *output)
+{
+ char *sptr = output;
+ uint8_t* data = (uint8_t*)prevblock;
+
+ for (uint8_t j = 0; j < HASH_FUNC_COUNT; j++) {
+ uint8_t b = (15 - j) >> 1; // 16 ascii hex chars, reversed
+ uint8_t algoDigit = (j & 1) ? data[b] & 0xF : data[b] >> 4;
+ if (algoDigit >= 10)
+ sprintf(sptr, "%c", 'A' + (algoDigit - 10));
+ else
+ sprintf(sptr, "%u", (uint32_t) algoDigit);
+ sptr++;
+ }
+ *sptr = '\0';
+}
+
+// X16R CPU Hash (Validation)
+extern "C" void x16r_hash(void *output, const void *input)
+{
+ unsigned char _ALIGN(64) hash[128];
+
+ sph_blake512_context ctx_blake;
+ sph_bmw512_context ctx_bmw;
+ sph_groestl512_context ctx_groestl;
+ sph_jh512_context ctx_jh;
+ sph_keccak512_context ctx_keccak;
+ sph_skein512_context ctx_skein;
+ sph_luffa512_context ctx_luffa;
+ sph_cubehash512_context ctx_cubehash;
+ sph_shavite512_context ctx_shavite;
+ sph_simd512_context ctx_simd;
+ sph_echo512_context ctx_echo;
+ sph_hamsi512_context ctx_hamsi;
+ sph_fugue512_context ctx_fugue;
+ sph_shabal512_context ctx_shabal;
+ sph_whirlpool_context ctx_whirlpool;
+ sph_sha512_context ctx_sha512;
+
+ void *in = (void*) input;
+ int size = 80;
+
+ uint32_t *in32 = (uint32_t*) input;
+ getAlgoString(&in32[1], hashOrder);
+
+ for (int i = 0; i < 16; i++)
+ {
+ const char elem = hashOrder[i];
+ const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+
+ switch (algo) {
+ case BLAKE:
+ sph_blake512_init(&ctx_blake);
+ sph_blake512(&ctx_blake, in, size);
+ sph_blake512_close(&ctx_blake, hash);
+ break;
+ case BMW:
+ sph_bmw512_init(&ctx_bmw);
+ sph_bmw512(&ctx_bmw, in, size);
+ sph_bmw512_close(&ctx_bmw, hash);
+ break;
+ case GROESTL:
+ sph_groestl512_init(&ctx_groestl);
+ sph_groestl512(&ctx_groestl, in, size);
+ sph_groestl512_close(&ctx_groestl, hash);
+ break;
+ case SKEIN:
+ sph_skein512_init(&ctx_skein);
+ sph_skein512(&ctx_skein, in, size);
+ sph_skein512_close(&ctx_skein, hash);
+ break;
+ case JH:
+ sph_jh512_init(&ctx_jh);
+ sph_jh512(&ctx_jh, in, size);
+ sph_jh512_close(&ctx_jh, hash);
+ break;
+ case KECCAK:
+ sph_keccak512_init(&ctx_keccak);
+ sph_keccak512(&ctx_keccak, in, size);
+ sph_keccak512_close(&ctx_keccak, hash);
+ break;
+ case LUFFA:
+ sph_luffa512_init(&ctx_luffa);
+ sph_luffa512(&ctx_luffa, in, size);
+ sph_luffa512_close(&ctx_luffa, hash);
+ break;
+ case CUBEHASH:
+ sph_cubehash512_init(&ctx_cubehash);
+ sph_cubehash512(&ctx_cubehash, in, size);
+ sph_cubehash512_close(&ctx_cubehash, hash);
+ break;
+ case SHAVITE:
+ sph_shavite512_init(&ctx_shavite);
+ sph_shavite512(&ctx_shavite, in, size);
+ sph_shavite512_close(&ctx_shavite, hash);
+ break;
+ case SIMD:
+ sph_simd512_init(&ctx_simd);
+ sph_simd512(&ctx_simd, in, size);
+ sph_simd512_close(&ctx_simd, hash);
+ break;
+ case ECHO:
+ sph_echo512_init(&ctx_echo);
+ sph_echo512(&ctx_echo, in, size);
+ sph_echo512_close(&ctx_echo, hash);
+ break;
+ case HAMSI:
+ sph_hamsi512_init(&ctx_hamsi);
+ sph_hamsi512(&ctx_hamsi, in, size);
+ sph_hamsi512_close(&ctx_hamsi, hash);
+ break;
+ case FUGUE:
+ sph_fugue512_init(&ctx_fugue);
+ sph_fugue512(&ctx_fugue, in, size);
+ sph_fugue512_close(&ctx_fugue, hash);
+ break;
+ case SHABAL:
+ sph_shabal512_init(&ctx_shabal);
+ sph_shabal512(&ctx_shabal, in, size);
+ sph_shabal512_close(&ctx_shabal, hash);
+ break;
+ case WHIRLPOOL:
+ sph_whirlpool_init(&ctx_whirlpool);
+ sph_whirlpool(&ctx_whirlpool, in, size);
+ sph_whirlpool_close(&ctx_whirlpool, hash);
+ break;
+ case SHA512:
+ sph_sha512_init(&ctx_sha512);
+ sph_sha512(&ctx_sha512,(const void*) in, size);
+ sph_sha512_close(&ctx_sha512,(void*) hash);
+ break;
+ }
+ in = (void*) hash;
+ size = 64;
+ }
+ memcpy(output, hash, 32);
+}
+
+void whirlpool_midstate(void *state, const void *input)
+{
+ sph_whirlpool_context ctx;
+
+ sph_whirlpool_init(&ctx);
+ sph_whirlpool(&ctx, input, 64);
+
+ memcpy(state, ctx.state, 64);
+}
+
+static bool init[MAX_GPUS] = { 0 };
+
+//#define _DEBUG
+#define _DEBUG_PREFIX "x16r-"
+#include "cuda_debug.cuh"
+
+//static int algo80_tests[HASH_FUNC_COUNT] = { 0 };
+//static int algo64_tests[HASH_FUNC_COUNT] = { 0 };
+static int algo80_fails[HASH_FUNC_COUNT] = { 0 };
+
+extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+{
+ uint32_t *pdata = work->data;
+ uint32_t *ptarget = work->target;
+ const uint32_t first_nonce = pdata[19];
+ const int dev_id = device_map[thr_id];
+ int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 19;
+ if (strstr(device_name[dev_id], "GTX 1080")) intensity = 20;
+ uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
+ //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
+
+ if (!init[thr_id])
+ {
+ cudaSetDevice(device_map[thr_id]);
+ if (opt_cudaschedule == -1 && gpu_threads == 1) {
+ cudaDeviceReset();
+ // reduce cpu usage
+ cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+ }
+ gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
+
+ quark_blake512_cpu_init(thr_id, throughput);
+ quark_bmw512_cpu_init(thr_id, throughput);
+ quark_groestl512_cpu_init(thr_id, throughput);
+ quark_skein512_cpu_init(thr_id, throughput);
+ quark_jh512_cpu_init(thr_id, throughput);
+ quark_keccak512_cpu_init(thr_id, throughput);
+ qubit_luffa512_cpu_init(thr_id, throughput);
+ x11_luffa512_cpu_init(thr_id, throughput); // 64
+ x11_shavite512_cpu_init(thr_id, throughput);
+ x11_simd512_cpu_init(thr_id, throughput); // 64
+ x11_echo512_cpu_init(thr_id, throughput);
+ x16_echo512_cuda_init(thr_id, throughput);
+ x13_hamsi512_cpu_init(thr_id, throughput);
+ x13_fugue512_cpu_init(thr_id, throughput);
+ x16_fugue512_cpu_init(thr_id, throughput);
+ x14_shabal512_cpu_init(thr_id, throughput);
+ x15_whirlpool_cpu_init(thr_id, throughput, 0);
+ x16_whirlpool512_init(thr_id, throughput);
+ x17_sha512_cpu_init(thr_id, throughput);
+
+ CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
+
+ cuda_check_cpu_init(thr_id, throughput);
+
+ init[thr_id] = true;
+ }
+
+ if (opt_benchmark) {
+ ((uint32_t*)ptarget)[7] = 0x003f;
+ ((uint8_t*)pdata)[8] = 0x90; // hashOrder[0] = '9'; for simd 80 + blake512 64
+ //((uint8_t*)pdata)[8] = 0xA0; // hashOrder[0] = 'A'; for echo 80 + blake512 64
+ //((uint8_t*)pdata)[8] = 0xB0; // hashOrder[0] = 'B'; for hamsi 80 + blake512 64
+ //((uint8_t*)pdata)[8] = 0xC0; // hashOrder[0] = 'C'; for fugue 80 + blake512 64
+ //((uint8_t*)pdata)[8] = 0xE0; // hashOrder[0] = 'E'; for whirlpool 80 + blake512 64
+ }
+ uint32_t _ALIGN(64) endiandata[20];
+
+ for (int k=0; k < 19; k++)
+ be32enc(&endiandata[k], pdata[k]);
+
+ uint32_t ntime = swab32(pdata[17]);
+ if (s_ntime != ntime) {
+ getAlgoString(&endiandata[1], hashOrder);
+ s_ntime = ntime;
+ s_implemented = true;
+ if (opt_debug && !thr_id) applog(LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime);
+ }
+
+ if (!s_implemented) {
+ sleep(1);
+ return -1;
+ }
+
+ cuda_check_cpu_setTarget(ptarget);
+
+ char elem = hashOrder[0];
+ const uint8_t algo80 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+
+ switch (algo80) {
+ case BLAKE:
+ quark_blake512_cpu_setBlock_80(thr_id, endiandata);
+ break;
+ case BMW:
+ quark_bmw512_cpu_setBlock_80(endiandata);
+ break;
+ case GROESTL:
+ groestl512_setBlock_80(thr_id, endiandata);
+ break;
+ case JH:
+ jh512_setBlock_80(thr_id, endiandata);
+ break;
+ case KECCAK:
+ keccak512_setBlock_80(thr_id, endiandata);
+ break;
+ case SKEIN:
+ skein512_cpu_setBlock_80((void*)endiandata);
+ break;
+ case LUFFA:
+ qubit_luffa512_cpu_setBlock_80((void*)endiandata);
+ break;
+ case CUBEHASH:
+ cubehash512_setBlock_80(thr_id, endiandata);
+ break;
+ case SHAVITE:
+ x11_shavite512_setBlock_80((void*)endiandata);
+ break;
+ case SIMD:
+ x16_simd512_setBlock_80((void*)endiandata);
+ break;
+ case ECHO:
+ x16_echo512_setBlock_80((void*)endiandata);
+ break;
+ case HAMSI:
+ x16_hamsi512_setBlock_80((void*)endiandata);
+ break;
+ case FUGUE:
+ x16_fugue512_setBlock_80((void*)pdata);
+ break;
+ case SHABAL:
+ x16_shabal512_setBlock_80((void*)endiandata);
+ break;
+ case WHIRLPOOL:
+ x16_whirlpool512_setBlock_80((void*)endiandata);
+ break;
+ case SHA512:
+ x16_sha512_setBlock_80(endiandata);
+ break;
+ default: {
+ if (!thr_id)
+ applog(LOG_WARNING, "kernel %s %c unimplemented, order %s", algo_strings[algo80], elem, hashOrder);
+ s_implemented = false;
+ sleep(5);
+ return -1;
+ }
+ }
+
+ int warn = 0;
+
+ do {
+ int order = 0;
+
+ // Hash with CUDA
+
+ switch (algo80) {
+ case BLAKE:
+ quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+ TRACE("blake80:");
+ break;
+ case BMW:
+ quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+ TRACE("bmw80 :");
+ break;
+ case GROESTL:
+ groestl512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+ TRACE("grstl80:");
+ break;
+ case JH:
+ jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+ TRACE("jh51280:");
+ break;
+ case KECCAK:
+ keccak512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+ TRACE("kecck80:");
+ break;
+ case SKEIN:
+ skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++;
+ TRACE("skein80:");
+ break;
+ case LUFFA:
+ qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+ TRACE("luffa80:");
+ break;
+ case CUBEHASH:
+ cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+ TRACE("cube 80:");
+ break;
+ case SHAVITE:
+ x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+ TRACE("shavite:");
+ break;
+ case SIMD:
+ x16_simd512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+ TRACE("simd512:");
+ break;
+ case ECHO:
+ x16_echo512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+ TRACE("echo :");
+ break;
+ case HAMSI:
+ x16_hamsi512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+ TRACE("hamsi :");
+ break;
+ case FUGUE:
+ x16_fugue512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+ TRACE("fugue :");
+ break;
+ case SHABAL:
+ x16_shabal512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+ TRACE("shabal :");
+ break;
+ case WHIRLPOOL:
+ x16_whirlpool512_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+ TRACE("whirl :");
+ break;
+ case SHA512:
+ x16_sha512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+ TRACE("sha512 :");
+ break;
+ }
+
+ for (int i = 1; i < 16; i++)
+ {
+ const char elem = hashOrder[i];
+ const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+
+ switch (algo64) {
+ case BLAKE:
+ quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ TRACE("blake :");
+ break;
+ case BMW:
+ quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ TRACE("bmw :");
+ break;
+ case GROESTL:
+ quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ TRACE("groestl:");
+ break;
+ case JH:
+ quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ TRACE("jh512 :");
+ break;
+ case KECCAK:
+ quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ TRACE("keccak :");
+ break;
+ case SKEIN:
+ quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ TRACE("skein :");
+ break;
+ case LUFFA:
+ x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ TRACE("luffa :");
+ break;
+ case CUBEHASH:
+ x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ TRACE("cube :");
+ break;
+ case SHAVITE:
+ x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ TRACE("shavite:");
+ break;
+ case SIMD:
+ x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ TRACE("simd :");
+ break;
+ case ECHO:
+ x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ TRACE("echo :");
+ break;
+ case HAMSI:
+ x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ TRACE("hamsi :");
+ break;
+ case FUGUE:
+ x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ TRACE("fugue :");
+ break;
+ case SHABAL:
+ x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ TRACE("shabal :");
+ break;
+ case WHIRLPOOL:
+ x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+ TRACE("whirl :");
+ break;
+ case SHA512:
+ x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+ TRACE("sha512 :");
+ break;
+ }
+ }
+
+ *hashes_done = pdata[19] - first_nonce + throughput;
+
+ work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+#ifdef _DEBUG
+ uint32_t _ALIGN(64) dhash[8];
+ be32enc(&endiandata[19], pdata[19]);
+ x16r_hash(dhash, endiandata);
+ applog_hash(dhash);
+ return -1;
+#endif
+ if (work->nonces[0] != UINT32_MAX)
+ {
+ const uint32_t Htarg = ptarget[7];
+ uint32_t _ALIGN(64) vhash[8];
+ be32enc(&endiandata[19], work->nonces[0]);
+ x16r_hash(vhash, endiandata);
+
+ if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
+ work->valid_nonces = 1;
+ work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+ work_set_target_ratio(work, vhash);
+ if (work->nonces[1] != 0) {
+ be32enc(&endiandata[19], work->nonces[1]);
+ x16r_hash(vhash, endiandata);
+ bn_set_target_ratio(work, vhash, 1);
+ work->valid_nonces++;
+ pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+ } else {
+ pdata[19] = work->nonces[0] + 1; // cursor
+ }
+#if 0
+ gpulog(LOG_INFO, thr_id, "hash found with %s 80!", algo_strings[algo80]);
+
+ algo80_tests[algo80] += work->valid_nonces;
+ char oks64[128] = { 0 };
+ char oks80[128] = { 0 };
+ char fails[128] = { 0 };
+ for (int a = 0; a < HASH_FUNC_COUNT; a++) {
+ const char elem = hashOrder[a];
+ const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+ if (a > 0) algo64_tests[algo64] += work->valid_nonces;
+ sprintf(&oks64[strlen(oks64)], "|%X:%2d", a, algo64_tests[a] < 100 ? algo64_tests[a] : 99);
+ sprintf(&oks80[strlen(oks80)], "|%X:%2d", a, algo80_tests[a] < 100 ? algo80_tests[a] : 99);
+ sprintf(&fails[strlen(fails)], "|%X:%2d", a, algo80_fails[a] < 100 ? algo80_fails[a] : 99);
+ }
+ applog(LOG_INFO, "K64: %s", oks64);
+ applog(LOG_INFO, "K80: %s", oks80);
+ applog(LOG_ERR, "F80: %s", fails);
+#endif
+ return work->valid_nonces;
+ }
+ else if (vhash[7] > Htarg) {
+ // x11+ coins could do some random error, but not on retry
+ gpu_increment_reject(thr_id);
+ algo80_fails[algo80]++;
+ if (!warn) {
+ warn++;
+ pdata[19] = work->nonces[0] + 1;
+ continue;
+ } else {
+ if (!opt_quiet) gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU! %s %s",
+ work->nonces[0], algo_strings[algo80], hashOrder);
+ warn = 0;
+ }
+ }
+ }
+
+ if ((uint64_t)throughput + pdata[19] >= max_nonce) {
+ pdata[19] = max_nonce;
+ break;
+ }
+
+ pdata[19] += throughput;
+
+ } while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+
+ *hashes_done = pdata[19] - first_nonce;
+ return 0;
+}
+
+// cleanup
+extern "C" void free_x16r(int thr_id)
+{
+ if (!init[thr_id])
+ return;
+
+ cudaThreadSynchronize();
+
+ cudaFree(d_hash[thr_id]);
+
+ quark_blake512_cpu_free(thr_id);
+ quark_groestl512_cpu_free(thr_id);
+ x11_simd512_cpu_free(thr_id);
+ x13_fugue512_cpu_free(thr_id);
+ x16_fugue512_cpu_free(thr_id); // to merge with x13_fugue512 ?
+ x15_whirlpool_cpu_free(thr_id);
+
+ cuda_check_cpu_free(thr_id);
+
+ cudaDeviceSynchronize();
+ init[thr_id] = false;
+}
diff --git a/x17/cuda_x17_sha512.cu b/x17/cuda_x17_sha512.cu
index bebf17d..a0757d0 100644
--- a/x17/cuda_x17_sha512.cu
+++ b/x17/cuda_x17_sha512.cu
@@ -169,3 +169,80 @@ void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce,
x17_sha512_gpu_hash_64 <<<grid, block>>> (threads, (uint64_t*)d_hash);
}
+
+__constant__
+static uint64_t c_PaddedMessage80[10];
+
+__global__
+/*__launch_bounds__(256, 4)*/
+void x16_sha512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash)
+{
+ const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ if (thread < threads)
+ {
+ uint64_t W[80];
+ #pragma unroll
+ for (int i = 0; i < 9; i ++) {
+ W[i] = SWAP64(c_PaddedMessage80[i]);
+ }
+ const uint32_t nonce = startNonce + thread;
+ //((uint32_t*)W)[19] = cuda_swab32(nonce);
+ W[9] = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(nonce));
+ W[9] = cuda_swab64(W[9]);
+ W[10] = 0x8000000000000000;
+
+ #pragma unroll
+ for (int i = 11; i<15; i++) {
+ W[i] = 0U;
+ }
+ W[15] = 0x0000000000000280;
+
+ #pragma unroll 64
+ for (int i = 16; i < 80; i ++) {
+ W[i] = SSG5_1(W[i-2]) + W[i-7];
+ W[i] += SSG5_0(W[i-15]) + W[i-16];
+ }
+
+ const uint64_t IV512[8] = {
+ 0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
+ 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
+ 0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
+ 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
+ };
+
+ uint64_t r[8];
+ #pragma unroll
+ for (int i = 0; i < 8; i++) {
+ r[i] = IV512[i];
+ }
+
+ #pragma unroll
+ for (int i = 0; i < 80; i++) {
+ SHA3_STEP(c_WB, r, W, i&7, i);
+ }
+
+ const uint64_t hashPosition = thread;
+ uint64_t *pHash = &g_hash[hashPosition << 3];
+ #pragma unroll
+ for (int u = 0; u < 8; u ++) {
+ pHash[u] = SWAP64(r[u] + IV512[u]);
+ }
+ }
+}
+
+__host__
+void x16_sha512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash)
+{
+ const uint32_t threadsperblock = 256;
+
+ dim3 grid((threads + threadsperblock-1)/threadsperblock);
+ dim3 block(threadsperblock);
+
+ x16_sha512_gpu_hash_80 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash);
+}
+
+__host__
+void x16_sha512_setBlock_80(void *pdata)
+{
+ cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
+}
\ No newline at end of file