diff --git a/Makefile.am b/Makefile.am
index 01fc6b1..0d802ed 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -76,6 +76,8 @@ ccminer_SOURCES = elist.h miner.h compat.h \
 x13/hsr.cu x13/cuda_hsr_sm3.cu x13/sm3.c \
 x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \
 x15/whirlpool.cu x15/cuda_x15_whirlpool_sm3.cu \
+ x16r/x16r.cu x16r/cuda_x16_echo512.cu x16r/cuda_x16_fugue512.cu \
+ x16r/cuda_x16_shabal512.cu x16r/cuda_x16_simd512_80.cu \
 x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \
 x11/phi.cu x11/cuda_streebog_maxwell.cu \
 x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu
diff --git a/README.txt b/README.txt
index 327ce31..74d38d1 100644
--- a/README.txt
+++ b/README.txt
@@ -1,5 +1,5 @@
-ccminer 2.2.4 (Jan. 2018) "lyra2v2 and keccak improvements"
+ccminer 2.2.5 (Feb 2018) "x16r algo"
 ---------------------------------------------------------------
 ***************************************************************
@@ -122,6 +122,7 @@ its command line interface and options.
 x11 use to mine DarkCoin
 x14 use to mine X14Coin
 x15 use to mine Halcyon
+ x16r use to mine Raven
 x17 use to mine X17
 vanilla use to mine Vanilla (Blake256)
 veltor use to mine VeltorCoin
@@ -277,6 +278,9 @@ so we can more efficiently implement new algorithms
 using the latest hardware features.
 >>> RELEASE HISTORY <<<
+ Feb. 2018 v2.2.5
+ New x16r algo
+
 Jan. 04th 2017 v2.2.4
 Improve lyra2v2
 Higher keccak default intensity
diff --git a/algos.h b/algos.h
index c77b5e8..014b4cb 100644
--- a/algos.h
+++ b/algos.h
@@ -59,6 +59,7 @@ enum sha_algos {
 ALGO_X13,
 ALGO_X14,
 ALGO_X15,
+ ALGO_X16R,
 ALGO_X17,
 ALGO_VANILLA,
 ALGO_VELTOR,
@@ -128,6 +129,7 @@ static const char *algo_names[] = {
 "x13",
 "x14",
 "x15",
+ "x16r",
 "x17",
 "vanilla",
 "veltor",
diff --git a/bench.cpp b/bench.cpp
index baa999d..0fe248f 100644
--- a/bench.cpp
+++ b/bench.cpp
@@ -102,6 +102,7 @@ void algo_free_all(int thr_id)
 free_x13(thr_id);
 free_x14(thr_id);
 free_x15(thr_id);
+ free_x16r(thr_id);
 free_x17(thr_id);
 free_zr5(thr_id);
 free_scrypt(thr_id);
diff --git a/ccminer.cpp b/ccminer.cpp
index 6c6d33f..46957e7 100644
--- a/ccminer.cpp
+++ b/ccminer.cpp
@@ -293,6 +293,7 @@ Options:\n\
 x13 X13 (MaruCoin)\n\
 x14 X14\n\
 x15 X15\n\
+ x16r X16R (Raven)\n\
 x17 X17\n\
 wildkeccak Boolberry\n\
 zr5 ZR5 (ZiftrCoin)\n\
@@ -1705,6 +1706,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 case ALGO_LYRA2Z:
 case ALGO_TIMETRAVEL:
 case ALGO_BITCORE:
+ case ALGO_X16R:
 work_set_target(work, sctx->job.diff / (256.0 * opt_difficulty));
 break;
 case ALGO_KECCAK:
@@ -2499,6 +2501,9 @@ static void *miner_thread(void *userdata)
 case ALGO_X15:
 rc = scanhash_x15(thr_id, &work, max_nonce, &hashes_done);
 break;
+ case ALGO_X16R:
+ rc = scanhash_x16r(thr_id, &work, max_nonce, &hashes_done);
+ break;
 case ALGO_X17:
 rc = scanhash_x17(thr_id, &work, max_nonce, &hashes_done);
 break;
diff --git a/ccminer.vcxproj b/ccminer.vcxproj
index 2a37505..5fe11d7 100644
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@@ -269,6 +269,7 @@
+
 76
@@ -434,6 +435,12 @@
+
+
+
+
+
+
@@ -587,8 +594,7 @@
-
-
+
@@ -615,4 +621,4 @@
-
+
\ No newline at end of file
diff --git a/ccminer.vcxproj.filters b/ccminer.vcxproj.filters
index a508e22..a8f149e 100644
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@@ -58,6 +58,9 @@
 {a2403c22-6777-46ab-a55a-3fcc7386c974}
+
+ {55dfae6a-66ba-43e2-8ceb-98ee70cbdf16}
+
 {85dfae6a-66ca-4332-8cec-98ee70cbdf2f}
@@ -596,6 +599,9 @@
 Source Files\equi
+
+ Header Files\CUDA
+
@@ -967,6 +973,24 @@
Source Files\equi + + Source Files\CUDA\x15 + + + Source Files\CUDA\x16r + + + Source Files\CUDA\x16r + + + Source Files\CUDA\x16r + + + Source Files\CUDA\x16r + + + Source Files\CUDA\x16r + @@ -983,4 +1007,4 @@ Ressources - + \ No newline at end of file diff --git a/compat/ccminer-config.h b/compat/ccminer-config.h index d3aeabc..17efd4c 100644 --- a/compat/ccminer-config.h +++ b/compat/ccminer-config.h @@ -164,7 +164,7 @@ #define PACKAGE_URL "http://github.com/tpruvot/ccminer" /* Define to the version of this package. */ -#define PACKAGE_VERSION "2.2.4" +#define PACKAGE_VERSION "2.2.5" /* If using the C implementation of alloca, define if you know the direction of stack growth for your system; otherwise it will be diff --git a/miner.h b/miner.h index 0ad8523..86b1c1d 100644 --- a/miner.h +++ b/miner.h @@ -325,6 +325,7 @@ extern int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, unsig extern int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_x14(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_x15(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); +extern int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done); extern int scanhash_zr5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done); @@ -389,6 +390,7 @@ extern void free_x11(int thr_id); extern void free_x13(int thr_id); extern void free_x14(int thr_id); extern void free_x15(int thr_id); +extern void free_x16r(int thr_id); extern void free_x17(int thr_id); extern void free_zr5(int thr_id); //extern void free_sha256d(int thr_id); @@ -933,6 +935,7 @@ void x11hash(void *output, const void *input); void x13hash(void *output, const void *input); void x14hash(void *output, const void *input); void x15hash(void *output, const void *input); +void x16r_hash(void *output, const void *input); void x17hash(void *output, const void *input); void wildkeccak_hash(void *output, const void *input, uint64_t* scratchpad, uint64_t ssize); void zr5hash(void *output, const void *input); diff --git a/res/ccminer.rc b/res/ccminer.rc index 832e7be..fa9a8a9 100644 --- a/res/ccminer.rc +++ b/res/ccminer.rc @@ -60,8 +60,8 @@ IDI_ICON1 ICON "ccminer.ico" // VS_VERSION_INFO VERSIONINFO - FILEVERSION 2,2,4,0 - PRODUCTVERSION 2,2,4,0 + FILEVERSION 2,2,5,0 + PRODUCTVERSION 2,2,5,0 FILEFLAGSMASK 0x3fL #ifdef _DEBUG FILEFLAGS 0x21L @@ -76,10 +76,10 @@ BEGIN BEGIN BLOCK "040904e4" BEGIN - VALUE "FileVersion", "2.2.4" + VALUE "FileVersion", "2.2.5" VALUE "LegalCopyright", "Copyright (C) 2018" VALUE "ProductName", "ccminer" - VALUE "ProductVersion", "2.2.4" + VALUE "ProductVersion", "2.2.5" END END BLOCK "VarFileInfo" diff --git a/util.cpp b/util.cpp index 3141305..f5a922d 100644 --- a/util.cpp +++ b/util.cpp @@ -2325,6 +2325,9 @@ void print_hash_tests(void) x15hash(&hash[0], &buf[0]); printpfx("X15", hash); + x16r_hash(&hash[0], &buf[0]); + printpfx("X16r", hash); + x17hash(&hash[0], &buf[0]); printpfx("X17", hash); diff --git a/x13/cuda_x13_hamsi512.cu b/x13/cuda_x13_hamsi512.cu index 69070fb..3c21f7f 100644 --- a/x13/cuda_x13_hamsi512.cu +++ b/x13/cuda_x13_hamsi512.cu @@ -1,6 +1,6 @@ /* - * Quick Hamsi-512 for X13 - * by tsiv - 2014 + * Quick Hamsi-512 for X13 by tsiv - 2014 + * + Hamsi-512 80 by tpruvot - 2018 */ #include @@ -16,31 +16,17 @@ 
static __constant__ uint32_t d_alpha_f[32]; static __constant__ uint32_t d_T512[64][16]; static const uint32_t alpha_n[] = { - SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc), - SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), - SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc), - SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0), - SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0), - SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0), - SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00), - SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc), - SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0), - SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0), - SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0) + 0xff00f0f0, 0xccccaaaa, 0xf0f0cccc, 0xff00aaaa, 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, 0xf0f0ff00, + 0xf0f0cccc, 0xaaaaff00, 0xccccff00, 0xaaaaf0f0, 0xaaaaf0f0, 0xff00cccc, 0xccccf0f0, 0xff00aaaa, + 0xccccaaaa, 0xff00f0f0, 0xff00aaaa, 0xf0f0cccc, 0xf0f0ff00, 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, + 0xaaaaff00, 0xf0f0cccc, 0xaaaaf0f0, 0xccccff00, 0xff00cccc, 0xaaaaf0f0, 0xff00aaaa, 0xccccf0f0 }; static const uint32_t alpha_f[] = { - SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0), - SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), - SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0), - SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c), - SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c), - SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c), - SPH_C32(0xcaf9f9c0), SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9), - SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0), - SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c), - SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c), - SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c) + 0xcaf9639c, 0x0ff0f9c0, 0x639c0ff0, 0xcaf9f9c0, 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0, 0x639ccaf9, + 0x639c0ff0, 0xf9c0caf9, 0x0ff0caf9, 0xf9c0639c, 0xf9c0639c, 0xcaf90ff0, 0x0ff0639c, 0xcaf9f9c0, + 0x0ff0f9c0, 0xcaf9639c, 0xcaf9f9c0, 0x639c0ff0, 0x639ccaf9, 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0, + 0xf9c0caf9, 0x639c0ff0, 0xf9c0639c, 0x0ff0caf9, 0xcaf90ff0, 0xf9c0639c, 0xcaf9f9c0, 0x0ff0639c }; #define hamsi_s00 m0 @@ -200,390 +186,134 @@ static const uint32_t alpha_f[] = { static const uint32_t T512[64][16] = { - { SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000), - SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9), - SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030), - SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000), - SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984), - SPH_C32(0x9e69af68) }, - { SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000), - SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), - SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240), - SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000), - SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5), - SPH_C32(0x0c26f262) }, - { SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000), - SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78), - SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400), - SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000), - SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f), - SPH_C32(0xdc24e61f) }, - { SPH_C32(0x23671400), 
SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), - SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), - SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800), - SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000), - SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f), - SPH_C32(0x3daac2da) }, - { SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000), - SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1), - SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800), - SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000), - SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da), - SPH_C32(0x78cace29) }, - { SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000), - SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), - SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400), - SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000), - SPH_C32(0xea982435), SPH_C32(0x75b11115), SPH_C32(0x28b67247), - SPH_C32(0x2dd1f9ab) }, - { SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000), - SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745), - SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00), - SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000), - SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f), - SPH_C32(0xbf2c0be2) }, - { SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000), - SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93), - SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000), - SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000), - SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36), - SPH_C32(0x32219526) }, - { SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000), - SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae), - SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001), - SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000), - SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f), - SPH_C32(0xac8e6c88) }, - { SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000), - SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), - SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004), - SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000), - SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96), - SPH_C32(0x7b1bd6b9) }, - { SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000), - SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba), - SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000), - SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000), - SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604), - SPH_C32(0xf746c320) }, - { SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), - SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), - SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009), - SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000), - SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a), - SPH_C32(0x69505b3a) }, - { SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000), - SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25), - SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050), - SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000), - SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2), - SPH_C32(0x8a341574) }, - { SPH_C32(0xf46c0050), SPH_C32(0x96180000), 
SPH_C32(0x14a50000), - SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), - SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0), - SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000), - SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc), - SPH_C32(0x450360bf) }, - { SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000), - SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543), - SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060), - SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000), - SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d), - SPH_C32(0xf3d45758) }, - { SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), - SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), - SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110), - SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000), - SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25), - SPH_C32(0x925c44e9) }, - { SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000), - SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514), - SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000), - SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000), - SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315), - SPH_C32(0xa123ff9f) }, - { SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000), - SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860), - SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000), - SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000), - SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e), - SPH_C32(0x1568ff0f) }, - { SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000), - SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6), - SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000), - SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000), - SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616), - SPH_C32(0xc5c1eb3e) }, - { SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000), - SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145), - SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000), - SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000), - SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6), - SPH_C32(0x1af21fe1) }, - { SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000), - SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae), - SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000), - SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000), - SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17), - SPH_C32(0x857f3c2b) }, - { SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), - SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), - SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000), - SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000), - SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94), - SPH_C32(0x2ba05a55) }, - { SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000), - SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757), - SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001), - SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000), - SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba), - SPH_C32(0xfeabf254) }, - { SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000), - 
SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), - SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002), - SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000), - SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7), - SPH_C32(0xfe1cdc7f) }, - { SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000), - SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea), - SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000), - SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000), - SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea), - SPH_C32(0xb0a51834) }, - { SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), - SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), - SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000), - SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000), - SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae), - SPH_C32(0xa6b8c28d) }, - { SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000), - SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75), - SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000), - SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000), - SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156), - SPH_C32(0x3a4e99d7) }, - { SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), - SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), - SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000), - SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000), - SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6), - SPH_C32(0xe1844257) }, - { SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000), - SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512), - SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000), - SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000), - SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37), - SPH_C32(0x2c3b504e) }, - { SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000), - SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856), - SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000), - SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000), - SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4), - SPH_C32(0x524a0d59) }, - { SPH_C32(0x69510000), SPH_C32(0xd4e1009c), SPH_C32(0xc3230000), - SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc), - SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000), - SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000), - SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88), - SPH_C32(0x378dd173) }, - { SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), - SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f), - SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000), - SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000), - SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4), - SPH_C32(0x8b6c72bd) }, - { SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780), - SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418), - SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000), - SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000), - SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d), - SPH_C32(0x8e67b7fa) }, - { SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280), - SPH_C32(0x95f30000), 
SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), - SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000), - SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000), - SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec), - SPH_C32(0x443d3004) }, - { SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80), - SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924), - SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000), - SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000), - SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a), - SPH_C32(0xf4f6ea7b) }, - { SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300), - SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), - SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000), - SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000), - SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8), - SPH_C32(0x979961d0) }, - { SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380), - SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6), - SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000), - SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000), - SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812), - SPH_C32(0x98aa496e) }, - { SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180), - SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), - SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000), - SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000), - SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec), - SPH_C32(0x094e3198) }, - { SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000), - SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736), - SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000), - SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000), - SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76), - SPH_C32(0xe86cba2e) }, - { SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000), - SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431), - SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000), - SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000), - SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9), - SPH_C32(0x4b7eec55) }, - { SPH_C32(0x58430000), SPH_C32(0x807e0000), SPH_C32(0x78330001), - SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd), - SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000), - SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800), - SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429), - SPH_C32(0x1e7536a6) }, - { SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000), - SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), - SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000), - SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000), - SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46), - SPH_C32(0x24314f17) }, - { SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e), - SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d), - SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000), - SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000), - SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222), - SPH_C32(0x9075b1ce) }, - { SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a), - SPH_C32(0x970d0000), SPH_C32(0x437223ae), 
SPH_C32(0x48c76ea4), - SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000), - SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000), - SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa), - SPH_C32(0x9b6ef888) }, - { SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e), - SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167), - SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000), - SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000), - SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e), - SPH_C32(0xd8b61463) }, - { SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c), - SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), - SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000), - SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000), - SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2), - SPH_C32(0x3ea660f7) }, - { SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e), - SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce), - SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000), - SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000), - SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018), - SPH_C32(0x7f975691) }, - { SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), - SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), - SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000), - SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000), - SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd), - SPH_C32(0x2c94459e) }, - { SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000), - SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da), - SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000), - SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0), - SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c), - SPH_C32(0x56a7b19f) }, - { SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000), - SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), - SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000), - SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220), - SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8), - SPH_C32(0x81fdf908) }, - { SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000), - SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d), - SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000), - SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060), - SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06), - SPH_C32(0x5bd61539) }, - { SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000), - SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), - SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000), - SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480), - SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f), - SPH_C32(0x15b961e7) }, - { SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000), - SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6), - SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000), - SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800), - SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14), - SPH_C32(0x2a2c18f0) }, - { SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), - SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), - 
SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000), - SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000), - SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23), - SPH_C32(0x551e3d6e) }, - { SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000), - SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da), - SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000), - SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000), - SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254), - SPH_C32(0x33c5244f) }, - { SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000), - SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), - SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000), - SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800), - SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c), - SPH_C32(0x8a58e6a4) }, - { SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000), - SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f), - SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000), - SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002), - SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808), - SPH_C32(0xda878000) }, - { SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), - SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a), - SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000), - SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005), - SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb), - SPH_C32(0x3c5dfffe) }, - { SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000), - SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e), - SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000), - SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003), - SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752), - SPH_C32(0x7b1675d7) }, - { SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000), - SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), - SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000), - SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008), - SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3), - SPH_C32(0x2879ebac) }, - { SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000), - SPH_C32(0x64f30013), SPH_C32(0x257e86bf), SPH_C32(0x1311944e), - SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000), - SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001), - SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60), - SPH_C32(0xbe0a679e) }, - { SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000), - SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), - SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000), - SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012), - SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf), - SPH_C32(0x30aebcf7) }, - { SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000), - SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57), - SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000), - SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0), - SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03), - SPH_C32(0xc7ff60f0) }, - { SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000), - SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), - SPH_C32(0x27f83b03), 
SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000), - SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140), - SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877), - SPH_C32(0xe7e00a94) } +{ 0xef0b0270, 0x3afd0000, 0x5dae0000, 0x69490000, 0x9b0f3c06, 0x4405b5f9, 0x66140a51, 0x924f5d0a, // 0 + 0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000, 0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68 }, +{ 0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000, 0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68, + 0x26600240, 0xddd80000, 0x722a0000, 0x4f060000, 0x936667ff, 0x29f944ce, 0x368b63d5, 0x0c26f262 }, +{ 0x145a3c00, 0xb9e90000, 0x61270000, 0xf1610000, 0xce613d6c, 0xb0493d78, 0x47a96720, 0xe18e24c5, + 0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000, 0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f }, +{ 0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000, 0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f, + 0x373d2800, 0x71500000, 0x95e00000, 0x0a140000, 0xbdac1909, 0x48ef9831, 0x456d6d1f, 0x3daac2da }, +{ 0x54285c00, 0xeaed0000, 0xc5d60000, 0xa1c50000, 0xb3a26770, 0x94a5c4e1, 0x6bb0419d, 0x551b3782, + 0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000, 0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29 }, +{ 0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000, 0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29, + 0xc8934400, 0x5a3e0000, 0x57870000, 0x4c560000, 0xea982435, 0x75b11115, 0x28b67247, 0x2dd1f9ab }, +{ 0x29449c00, 0x64e70000, 0xf24b0000, 0xc2f30000, 0x0ede4e8f, 0x56c23745, 0xf3e04259, 0x8d0d9ec4, + 0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000, 0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2 }, +{ 0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000, 0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2, // 7 + 0x6f299000, 0x6c850000, 0x2f160000, 0x782e0000, 0x644c37cd, 0x12dd1cd6, 0xd26a8c36, 0x32219526 }, +{ 0xf6800005, 0x3443c000, 0x24070000, 0x8f3d0000, 0x21373bfb, 0x0ab8d5ae, 0xcdc58b19, 0xd795ba31, + 0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000, 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88 }, +{ 0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000, 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88, + 0x50ff0004, 0x45744000, 0x3dfb0000, 0x19e60000, 0x1bbc5606, 0xe1727b5d, 0xe1a8cc96, 0x7b1bd6b9 }, +{ 0xf7750009, 0xcf3cc000, 0xc3d60000, 0x04920000, 0x029519a9, 0xf8e836ba, 0x7a87f14e, 0x9e16981a, + 0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000, 0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320 }, +{ 0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000, 0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320, + 0x231f0009, 0x42f40000, 0x66790000, 0x4ebb0000, 0xfedb5bd3, 0x315cb0d6, 0xe2b1674a, 0x69505b3a }, +{ 0x774400f0, 0xf15a0000, 0xf5b20000, 0x34140000, 0x89377e8c, 0x5a8bec25, 0x0bc3cd1e, 0xcf3775cb, + 0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000, 0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574 }, +{ 0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000, 0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574, + 0x832800a0, 0x67420000, 0xe1170000, 0x370b0000, 0xcba30034, 0x3c34923c, 0x9767bdcc, 0x450360bf }, +{ 0xe8870170, 0x9d720000, 0x12db0000, 0xd4220000, 0xf2886b27, 0xa921e543, 0x4ef8b518, 0x618813b1, // 14 + 0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000, 0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758 }, +{ 0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000, 0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758, + 0x5cb00110, 0x913e0000, 0x44190000, 0x888c0000, 0x66dc7418, 0x921f1d66, 0x55ceea25, 0x925c44e9 }, +{ 0x0c720000, 0x49e50f00, 0x42790000, 0x5cea0000, 0x33aa301a, 0x15822514, 0x95a34b7b, 0xb44b0090, + 0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000, 
0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f }, +{ 0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000, 0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f, + 0xf2500000, 0xeebd0a00, 0x67a80000, 0xab8a0000, 0xba9b48c0, 0x0a56dd74, 0xdb73e86e, 0x1568ff0f }, +{ 0x45180000, 0xa5b51700, 0xf96a0000, 0x3b480000, 0x1ecc142c, 0x231395d6, 0x16bca6b0, 0xdf33f4df, + 0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000, 0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e }, +{ 0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000, 0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e, + 0xfd250000, 0xb3c41100, 0xcef00000, 0xcef90000, 0x3c4d7580, 0x8d5b6493, 0x7098b0a6, 0x1af21fe1 }, +{ 0x75a40000, 0xc28b2700, 0x94a40000, 0x90f50000, 0xfb7857e0, 0x49ce0bae, 0x1767c483, 0xaedf667e, + 0xd1660000, 0x1bbc0300, 0x9eec0000, 0xf6940000, 0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b }, +{ 0xd1660000, 0x1bbc0300, 0x9eec0000, 0xf6940000, 0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b, // 21 + 0xa4c20000, 0xd9372400, 0x0a480000, 0x66610000, 0xf87a12c7, 0x86bef75c, 0xa324df94, 0x2ba05a55 }, +{ 0x75c90003, 0x0e10c000, 0xd1200000, 0xbaea0000, 0x8bc42f3e, 0x8758b757, 0xbb28761d, 0x00b72e2b, + 0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000, 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254 }, +{ 0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000, 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254, + 0x9b060002, 0x61468000, 0x221e0000, 0x1d740000, 0x36715d27, 0x30495c92, 0xf11336a7, 0xfe1cdc7f }, +{ 0x86790000, 0x3f390002, 0xe19ae000, 0x98560000, 0x9565670e, 0x4e88c8ea, 0xd3dd4944, 0x161ddab9, + 0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000, 0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834 }, +{ 0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000, 0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834, + 0xb6ce0000, 0xdae90002, 0x156e8000, 0xda920000, 0xf6dd5a64, 0x36325c8a, 0xf272e8ae, 0xa6b8c28d }, +{ 0x14190000, 0x23ca003c, 0x50df0000, 0x44b60000, 0x1b6c67b0, 0x3cf3ac75, 0x61e610b0, 0xdbcadb80, + 0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000, 0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7 }, +{ 0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000, 0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7, + 0xf75a0000, 0x19840028, 0xa2190000, 0xeef80000, 0xc0722516, 0x19981260, 0x73dba1e6, 0xe1844257 }, +{ 0x54500000, 0x0671005c, 0x25ae0000, 0x6a1e0000, 0x2ea54edf, 0x664e8512, 0xbfba18c3, 0x7e715d17, // 28 + 0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000, 0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e }, +{ 0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000, 0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e, + 0xe8dd0000, 0xfa4a0044, 0x3c2d0000, 0xbb150000, 0x80bd361b, 0x24e81d44, 0xbfa8c2f4, 0x524a0d59 }, +{ 0x69510000, 0xd4e1009c, 0xc3230000, 0xac2f0000, 0xe4950bae, 0xcea415dc, 0x87ec287c, 0xbce1a3ce, + 0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000, 0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173 }, +{ 0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000, 0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173, + 0xaf220000, 0x7b6c0090, 0x67e20000, 0x8da20000, 0xc7841e29, 0xb7b744f3, 0x9ac484f4, 0x8b6c72bd }, +{ 0xcc140000, 0xa5630000, 0x5ab90780, 0x3b500000, 0x4bd013ff, 0x879b3418, 0x694348c1, 0xca5a87fe, + 0x819e0000, 0xec570000, 0x66320280, 0x95f30000, 0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa }, +{ 0x819e0000, 0xec570000, 0x66320280, 0x95f30000, 0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa, + 0x4d8a0000, 0x49340000, 0x3c8b0500, 0xaea30000, 0x16793bfd, 0xcf6f08a4, 0x8f19eaec, 0x443d3004 }, +{ 0x78230000, 0x12fc0000, 0xa93a0b80, 0x90a50000, 0x713e2879, 0x7ee98924, 0xf08ca062, 0x636f8bab, + 
0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000, 0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b }, +{ 0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000, 0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b, // 35 + 0x7a8c0000, 0xa5d40000, 0x13260880, 0xc63d0000, 0xcbb36daa, 0xfea14f43, 0x59d0b4f8, 0x979961d0 }, +{ 0xac480000, 0x1ba60000, 0x45fb1380, 0x03430000, 0x5a85316a, 0x1fb250b6, 0xfe72c7fe, 0x91e478f6, + 0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000, 0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e }, +{ 0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000, 0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e, + 0xb2060000, 0xc5690000, 0x28031200, 0x74670000, 0xb6c236f4, 0xeb1239f8, 0x33d1dfec, 0x094e3198 }, +{ 0xaec30000, 0x9c4f0001, 0x79d1e000, 0x2c150000, 0x45cc75b3, 0x6650b736, 0xab92f78f, 0xa312567b, + 0xdb250000, 0x09290000, 0x49aac000, 0x81e10000, 0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e }, +{ 0xdb250000, 0x09290000, 0x49aac000, 0x81e10000, 0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e, + 0x75e60000, 0x95660001, 0x307b2000, 0xadf40000, 0x8f321eea, 0x24298307, 0xe8c49cf9, 0x4b7eec55 }, +{ 0x58430000, 0x807e0000, 0x78330001, 0xc66b3800, 0xe7375cdc, 0x79ad3fdd, 0xac73fe6f, 0x3a4479b1, + 0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800, 0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6 }, +{ 0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800, 0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6, + 0x45190000, 0xab0c0000, 0x30be0001, 0x690a2000, 0xc2fc7219, 0xb1d4800d, 0x2dd1fa46, 0x24314f17 }, +{ 0xa53b0000, 0x14260000, 0x4e30001e, 0x7cae0000, 0x8f9e0dd5, 0x78dfaa3d, 0xf73168d8, 0x0b1b4946, // 42 + 0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000, 0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce }, +{ 0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000, 0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce, + 0xa2d60000, 0xa6760000, 0xc9440014, 0xeba30000, 0xccec2e7b, 0x3018c499, 0x03490afa, 0x9b6ef888 }, +{ 0x88980000, 0x1f940000, 0x7fcf002e, 0xfb4e0000, 0xf158079a, 0x61ae9167, 0xa895706c, 0xe6107494, + 0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000, 0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463 }, +{ 0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000, 0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463, + 0x835a0000, 0xc4f70000, 0x01470022, 0xeec80000, 0x60a54f69, 0x142f2a24, 0x5cf534f2, 0x3ea660f7 }, +{ 0x52500000, 0x29540000, 0x6a61004e, 0xf0ff0000, 0x9a317eec, 0x452341ce, 0xcf568fe5, 0x5303130f, + 0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000, 0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691 }, +{ 0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000, 0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691, + 0x01dd0000, 0x80a80000, 0xf4960048, 0xa6000000, 0x90d57ea2, 0xd7e68c37, 0x6612cffd, 0x2c94459e }, +{ 0xe6280000, 0x4c4b0000, 0xa8550000, 0xd3d002e0, 0xd86130b8, 0x98a7b0da, 0x289506b4, 0xd75a4897, + 0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0, 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f }, +{ 0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0, 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f, // 49 + 0x16ed0000, 0x15680000, 0xedd70000, 0x325d0220, 0xe30c3689, 0x5a4ae643, 0xe375f8a8, 0x81fdf908 }, +{ 0xb4310000, 0x77330000, 0xb15d0000, 0x7fd004e0, 0x78a26138, 0xd116c35d, 0xd256d489, 0x4e6f74de, + 0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060, 0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539 }, +{ 0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060, 0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539, + 0x57370000, 0xcaf20000, 0x364e0000, 0xc0220480, 0x56186b22, 0x5ca3f40c, 0xa1937f8f, 0x15b961e7 }, +{ 0x02f20000, 0xa2810000, 0x873f0000, 0xe36c7800, 
0x1e1d74ef, 0x073d2bd6, 0xc4c23237, 0x7f32259e, + 0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800, 0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0 }, +{ 0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800, 0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0, + 0xb82f0000, 0xb12c0000, 0x30d80000, 0x14445000, 0xc15860a2, 0x3127e8ec, 0x2e98bf23, 0x551e3d6e }, +{ 0x1e6c0000, 0xc4420000, 0x8a2e0000, 0xbcb6b800, 0x2c4413b6, 0x8bfdd3da, 0x6a0c1bc8, 0xb99dc2eb, + 0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000, 0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f }, +{ 0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000, 0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f, + 0x8c3a0000, 0xda980000, 0x607f0000, 0x54078800, 0x85714513, 0x6006b243, 0xdb50399c, 0x8a58e6a4 }, +{ 0x033d0000, 0x08b30000, 0xf33a0000, 0x3ac20007, 0x51298a50, 0x6b6e661f, 0x0ea5cfe3, 0xe6da7ffe, // 56 + 0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002, 0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000 }, +{ 0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002, 0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000, + 0xabe70000, 0x9e0d0000, 0xaf270000, 0x3d180005, 0x2c4f1fd3, 0x74f61695, 0xb5c347eb, 0x3c5dfffe }, +{ 0x01930000, 0xe7820000, 0xedfb0000, 0xcf0c000b, 0x8dd08d58, 0xbca3b42e, 0x063661e1, 0x536f9e7b, + 0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003, 0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7 }, +{ 0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003, 0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7, + 0x93bb0000, 0x3b070000, 0xba010000, 0x99d00008, 0x3739ae4e, 0xe64c1722, 0x96f896b3, 0x2879ebac }, +{ 0x5fa80000, 0x56030000, 0x43ae0000, 0x64f30013, 0x257e86bf, 0x1311944e, 0x541e95bf, 0x8ea4db69, + 0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001, 0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e }, +{ 0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001, 0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e, + 0x5fec0000, 0x294b0000, 0x99d20000, 0x4ed00012, 0x1ed34f73, 0xbaa708c9, 0x57140bdf, 0x30aebcf7 }, +{ 0xee930000, 0xd6070000, 0x92c10000, 0x2b9801e0, 0x9451287c, 0x3b6cfb57, 0x45312374, 0x201f6a64, + 0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0, 0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0 }, +{ 0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0, 0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0, + 0x95bb0000, 0x81450000, 0x3b240000, 0x48db0140, 0x0a8a6c53, 0x56f56eec, 0x62c91877, 0xe7e00a94 } }; __global__ @@ -598,12 +328,12 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t * uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3]; unsigned char *h1 = (unsigned char *)Hash; - uint32_t c0 = SPH_C32(0x73746565), c1 = SPH_C32(0x6c706172), c2 = SPH_C32(0x6b204172), c3 = SPH_C32(0x656e6265); - uint32_t c4 = SPH_C32(0x72672031), c5 = SPH_C32(0x302c2062), c6 = SPH_C32(0x75732032), c7 = SPH_C32(0x3434362c); - uint32_t c8 = SPH_C32(0x20422d33), c9 = SPH_C32(0x30303120), cA = SPH_C32(0x4c657576), cB = SPH_C32(0x656e2d48); - uint32_t cC = SPH_C32(0x65766572), cD = SPH_C32(0x6c65652c), cE = SPH_C32(0x2042656c), cF = SPH_C32(0x6769756d); - uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, mD, mE, mF; + uint32_t c0 = 0x73746565, c1 = 0x6c706172, c2 = 0x6b204172, c3 = 0x656e6265; + uint32_t c4 = 0x72672031, c5 = 0x302c2062, c6 = 0x75732032, c7 = 0x3434362c; + uint32_t c8 = 0x20422d33, c9 = 0x30303120, cA = 0x4c657576, cB = 0x656e2d48; + uint32_t cC = 0x65766572, cD = 0x6c65652c, cE = 0x2042656c, cF = 0x6769756d; uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF }; + uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, 
mD, mE, mF;
 uint32_t *tp, db, dm;
 for(int i = 0; i < 64; i += 8) {
@@ -637,16 +367,16 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *
 T_BIG;
 }
+ // precomputed for 64 bytes blocks ?
 tp = &d_T512[0][0] + 112;
-
- m0 = *(tp+ 0); m1 = *(tp+ 1);
- m2 = *(tp+ 2); m3 = *(tp+ 3);
- m4 = *(tp+ 4); m5 = *(tp+ 5);
- m6 = *(tp+ 6); m7 = *(tp+ 7);
- m8 = *(tp+ 8); m9 = *(tp+ 9);
- mA = *(tp+10); mB = *(tp+11);
- mC = *(tp+12); mD = *(tp+13);
- mE = *(tp+14); mF = *(tp+15);
+ m0 = tp[ 0]; m1 = tp[ 1];
+ m2 = tp[ 2]; m3 = tp[ 3];
+ m4 = tp[ 4]; m5 = tp[ 5];
+ m6 = tp[ 6]; m7 = tp[ 7];
+ m8 = tp[ 8]; m9 = tp[ 9];
+ mA = tp[10]; mB = tp[11];
+ mC = tp[12]; mD = tp[13];
+ mE = tp[14]; mF = tp[15];
 for( int r = 0; r < 6; r += 2 ) {
 ROUND_BIG(r, d_alpha_n);
@@ -655,15 +385,14 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *
 T_BIG;
 tp = &d_T512[0][0] + 784;
-
- m0 = *(tp+ 0); m1 = *(tp+ 1);
- m2 = *(tp+ 2); m3 = *(tp+ 3);
- m4 = *(tp+ 4); m5 = *(tp+ 5);
- m6 = *(tp+ 6); m7 = *(tp+ 7);
- m8 = *(tp+ 8); m9 = *(tp+ 9);
- mA = *(tp+10); mB = *(tp+11);
- mC = *(tp+12); mD = *(tp+13);
- mE = *(tp+14); mF = *(tp+15);
+ m0 = tp[ 0]; m1 = tp[ 1];
+ m2 = tp[ 2]; m3 = tp[ 3];
+ m4 = tp[ 4]; m5 = tp[ 5];
+ m6 = tp[ 6]; m7 = tp[ 7];
+ m8 = tp[ 8]; m9 = tp[ 9];
+ mA = tp[10]; mB = tp[11];
+ mC = tp[12]; mD = tp[13];
+ mE = tp[14]; mF = tp[15];
 for( int r = 0; r < 12; r += 2 ) {
 ROUND_BIG(r, d_alpha_f);
@@ -696,3 +425,127 @@ void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce
 x13_hamsi512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
 //MyStreamSynchronize(NULL, order, thr_id);
 }
+
+__constant__ static uint64_t c_PaddedMessage80[10];
+
+__host__
+void x16_hamsi512_setBlock_80(void *pdata)
+{
+ cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
+}
+
+__global__
+void x16_hamsi512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash)
+{
+ uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+ if (thread < threads)
+ {
+ unsigned char h1[80];
+ #pragma unroll
+ for (int i = 0; i < 10; i++)
+ ((uint2*)h1)[i] = ((uint2*)c_PaddedMessage80)[i];
+ //((uint64_t*)h1)[9] = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(startNonce + thread));
+ ((uint32_t*)h1)[19] = cuda_swab32(startNonce + thread);
+
+ uint32_t c0 = 0x73746565, c1 = 0x6c706172, c2 = 0x6b204172, c3 = 0x656e6265;
+ uint32_t c4 = 0x72672031, c5 = 0x302c2062, c6 = 0x75732032, c7 = 0x3434362c;
+ uint32_t c8 = 0x20422d33, c9 = 0x30303120, cA = 0x4c657576, cB = 0x656e2d48;
+ uint32_t cC = 0x65766572, cD = 0x6c65652c, cE = 0x2042656c, cF = 0x6769756d;
+ uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
+ uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, mD, mE, mF;
+ uint32_t *tp, db, dm;
+
+ for(int i = 0; i < 80; i += 8)
+ {
+ m0 = 0; m1 = 0; m2 = 0; m3 = 0; m4 = 0; m5 = 0; m6 = 0; m7 = 0;
+ m8 = 0; m9 = 0; mA = 0; mB = 0; mC = 0; mD = 0; mE = 0; mF = 0;
+ tp = &d_T512[0][0];
+
+ #pragma unroll
+ for (int u = 0; u < 8; u++) {
+ db = h1[i + u];
+ #pragma unroll 2
+ for (int v = 0; v < 8; v++, db >>= 1) {
+ dm = -(uint32_t)(db & 1);
+ m0 ^= dm & tp[ 0]; m1 ^= dm & tp[ 1];
+ m2 ^= dm & tp[ 2]; m3 ^= dm & tp[ 3];
+ m4 ^= dm & tp[ 4]; m5 ^= dm & tp[ 5];
+ m6 ^= dm & tp[ 6]; m7 ^= dm & tp[ 7];
+ m8 ^= dm & tp[ 8]; m9 ^= dm & tp[ 9];
+ mA ^= dm & tp[10]; mB ^= dm & tp[11];
+ mC ^= dm & tp[12]; mD ^= dm & tp[13];
+ mE ^= dm & tp[14]; mF ^= dm & tp[15];
+ tp += 16;
+ }
+ }
+
+ #pragma unroll
+ for (int r = 0; r < 6; r++) {
+ ROUND_BIG(r, d_alpha_n);
+ }
+ T_BIG;
+ }
+
+ #define INPUT_BIG { \
+ m0 = 0; m1 = 0; m2 = 0; m3 = 0; m4 = 0; m5 = 0; m6 = 0; m7 = 0; \
+ m8 = 0; m9 = 0; mA = 0; mB = 0; mC = 0; mD = 0; mE = 0; mF = 0; \
+ tp = &d_T512[0][0]; \
+ for (int u = 0; u < 8; u++) { \
+ db = endtag[u]; \
+ for (int v = 0; v < 8; v++, db >>= 1) { \
+ dm = -(uint32_t)(db & 1); \
+ m0 ^= dm & tp[ 0]; m1 ^= dm & tp[ 1]; \
+ m2 ^= dm & tp[ 2]; m3 ^= dm & tp[ 3]; \
+ m4 ^= dm & tp[ 4]; m5 ^= dm & tp[ 5]; \
+ m6 ^= dm & tp[ 6]; m7 ^= dm & tp[ 7]; \
+ m8 ^= dm & tp[ 8]; m9 ^= dm & tp[ 9]; \
+ mA ^= dm & tp[10]; mB ^= dm & tp[11]; \
+ mC ^= dm & tp[12]; mD ^= dm & tp[13]; \
+ mE ^= dm & tp[14]; mF ^= dm & tp[15]; \
+ tp += 16; \
+ } \
+ } \
+ }
+
+ // close
+ uint8_t endtag[8] = { 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
+ INPUT_BIG;
+
+ #pragma unroll
+ for (int r = 0; r < 6; r++) {
+ ROUND_BIG(r, d_alpha_n);
+ }
+ T_BIG;
+
+ endtag[0] = endtag[1] = 0x00;
+ endtag[6] = 0x02;
+ endtag[7] = 0x80;
+ INPUT_BIG;
+
+ // PF_BIG
+ #pragma unroll
+ for(int r = 0; r < 12; r++) {
+ ROUND_BIG(r, d_alpha_f);
+ }
+ T_BIG;
+
+ uint64_t hashPosition = thread;
+ uint32_t *Hash = (uint32_t*)&g_hash[hashPosition << 3];
+ #pragma unroll 16
+ for(int i = 0; i < 16; i++)
+ Hash[i] = cuda_swab32(h[i]);
+
+ #undef INPUT_BIG
+ }
+}
+
+__host__
+void x16_hamsi512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash)
+{
+ const uint32_t threadsperblock = 128;
+
+ dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+ dim3 block(threadsperblock);
+
+ x16_hamsi512_gpu_hash_80 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash);
+}
diff --git a/x15/cuda_x15_whirlpool_sm3.cu b/x15/cuda_x15_whirlpool_sm3.cu
index e2df3dc..3110a69 100644
--- a/x15/cuda_x15_whirlpool_sm3.cu
+++ b/x15/cuda_x15_whirlpool_sm3.cu
@@ -1998,7 +1998,7 @@ const int i0, const int i1, const int i2, const int i3, const int i4, const int
 __global__
-void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash, int swab)
+void oldwhirlpool_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, void *outputHash, int swab)
 {
 __shared__ uint64_t sharedMemory[2048];
@@ -2014,7 +2014,8 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
 sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x];
 #endif
 }
- __threadfence_block(); // ensure shared mem is ready
+ //__threadfence_block(); // ensure shared mem is ready
+ __syncthreads();
 uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 if (thread < threads)
@@ -2028,7 +2029,8 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
 uint64_t state[8];
 #pragma unroll 8
 for (int i=0; i < 8; i++) {
- state[i] = c_PaddedMessage80[i];
+ //state[i] = c_PaddedMessage80[i];
+ AS_UINT2(&state[i]) = AS_UINT2(&c_PaddedMessage80[i]);
 }
 #else
 #pragma unroll 8
@@ -2050,6 +2052,7 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
 state[i] = xor1(n[i],c_PaddedMessage80[i]);
 }
 #endif
+
 /// round 2 ///////
 //////////////////////////////////
 n[0] = c_PaddedMessage80[8]; //read data
@@ -2331,7 +2334,7 @@ extern uint32_t whirlpool512_finalhash_64(int thr_id, uint32_t threads, uint32_t
 }
 __host__
-void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash)
+void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNonce,
uint32_t *d_outputHash)
 {
 dim3 grid((threads + threadsperblock-1) / threadsperblock);
 dim3 block(threadsperblock);
@@ -2339,7 +2342,7 @@ void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNounce
 if (threads < 256)
 applog(LOG_WARNING, "whirlpool requires a minimum of 256 threads to fetch constant tables!");
- oldwhirlpool_gpu_hash_80<<<grid, block>>>(threads, startNounce, d_outputHash, 1);
+ oldwhirlpool_gpu_hash_80<<<grid, block>>>(threads, startNonce, d_outputHash, 1);
 }
 extern void whirl_midstate(void *state, const void *input);
@@ -2363,3 +2366,54 @@ void whirlpool512_setBlock_80_sm3(void *pdata, const void *ptarget)
 cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 128, 0, cudaMemcpyHostToDevice);
 cudaMemcpyToSymbol(pTarget, ptarget, 32, 0, cudaMemcpyHostToDevice);
 }
+
+// ------------------------------------------------------------------------------------------------
+
+__host__
+void x16_whirlpool512_init(int thr_id, uint32_t threads)
+{
+ cudaMemcpyToSymbol(InitVector_RC, plain_RC, sizeof(plain_RC), 0, cudaMemcpyHostToDevice);
+ cudaMemcpyToSymbol(mixTob0Tox, plain_T0, sizeof(plain_T0), 0, cudaMemcpyHostToDevice);
+#if USE_ALL_TABLES
+ cudaMemcpyToSymbol(mixTob1Tox, plain_T1, (256 * 8), 0, cudaMemcpyHostToDevice);
+ cudaMemcpyToSymbol(mixTob2Tox, plain_T2, (256 * 8), 0, cudaMemcpyHostToDevice);
+ cudaMemcpyToSymbol(mixTob3Tox, plain_T3, (256 * 8), 0, cudaMemcpyHostToDevice);
+ cudaMemcpyToSymbol(mixTob4Tox, plain_T4, (256 * 8), 0, cudaMemcpyHostToDevice);
+ cudaMemcpyToSymbol(mixTob5Tox, plain_T5, (256 * 8), 0, cudaMemcpyHostToDevice);
+ cudaMemcpyToSymbol(mixTob6Tox, plain_T6, (256 * 8), 0, cudaMemcpyHostToDevice);
+ cudaMemcpyToSymbol(mixTob7Tox, plain_T7, (256 * 8), 0, cudaMemcpyHostToDevice);
+#endif
+}
+
+extern void whirlpool_midstate(void *state, const void *input);
+
+__host__
+void x16_whirlpool512_setBlock_80(void *pdata)
+{
+ unsigned char PaddedMessage[128];
+
+ memcpy(PaddedMessage, pdata, 80);
+ memset(PaddedMessage + 80, 0, 48);
+ PaddedMessage[80] = 0x80; /* ending */
+
+#if HOST_MIDSTATE
+ // compute constant first block
+ unsigned char midstate[64] = { 0 };
+ whirlpool_midstate(midstate, pdata);
+ memcpy(PaddedMessage, midstate, 64);
+#endif
+
+ cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 128, 0, cudaMemcpyHostToDevice);
+}
+
+__host__
+void x16_whirlpool512_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_outputHash)
+{
+ dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+ dim3 block(threadsperblock);
+
+ if (threads < 256)
+ applog(LOG_WARNING, "whirlpool requires a minimum of 256 threads to fetch constant tables!");
+
+ oldwhirlpool_gpu_hash_80 <<<grid, block>>> (threads, startNonce, d_outputHash, 1);
+}
diff --git a/x16r/cuda_x16_echo512.cu b/x16r/cuda_x16_echo512.cu
new file mode 100644
index 0000000..5e6013d
--- /dev/null
+++ b/x16r/cuda_x16_echo512.cu
@@ -0,0 +1,214 @@
+/**
+ * echo512-80 cuda kernel for X16R algorithm
+ *
+ * tpruvot 2018 - GPL code
+ */
+
+#include
+#include
+
+#include "cuda_helper.h"
+
+extern __device__ __device_builtin__ void __threadfence_block(void);
+
+#include "../x11/cuda_x11_aes.cuh"
+
+__device__ __forceinline__ void AES_2ROUND(const uint32_t* __restrict__ sharedMemory,
+ uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3,
+ uint32_t &k0)
+{
+ uint32_t y0, y1, y2, y3;
+
+ aes_round(sharedMemory,
+ x0, x1, x2, x3,
+ k0,
+ y0, y1, y2, y3);
+
+ aes_round(sharedMemory,
+ y0, y1, y2, y3,
+ x0, x1, x2, x3);
+
+ k0++;
+}
+
+__device__
+static void echo_round(uint32_t* const sharedMemory,
uint32_t *W, uint32_t &k0) +{ + // Big Sub Words + #pragma unroll 16 + for (int idx = 0; idx < 16; idx++) { + AES_2ROUND(sharedMemory, W[(idx << 2) + 0], W[(idx << 2) + 1], W[(idx << 2) + 2], W[(idx << 2) + 3], k0); + } + + // Shift Rows + #pragma unroll 4 + for (int i = 0; i < 4; i++) + { + uint32_t t[4]; + /// 1, 5, 9, 13 + t[0] = W[i + 4]; + t[1] = W[i + 8]; + t[2] = W[i + 24]; + t[3] = W[i + 60]; + + W[i + 4] = W[i + 20]; + W[i + 8] = W[i + 40]; + W[i + 24] = W[i + 56]; + W[i + 60] = W[i + 44]; + + W[i + 20] = W[i + 36]; + W[i + 40] = t[1]; + W[i + 56] = t[2]; + W[i + 44] = W[i + 28]; + + W[i + 28] = W[i + 12]; + W[i + 12] = t[3]; + W[i + 36] = W[i + 52]; + W[i + 52] = t[0]; + } + + // Mix Columns + #pragma unroll 4 + for (int i = 0; i < 4; i++) + { + #pragma unroll 4 + for (int idx = 0; idx < 64; idx += 16) + { + uint32_t a[4]; + a[0] = W[idx + i]; + a[1] = W[idx + i + 4]; + a[2] = W[idx + i + 8]; + a[3] = W[idx + i + 12]; + + uint32_t ab = a[0] ^ a[1]; + uint32_t bc = a[1] ^ a[2]; + uint32_t cd = a[2] ^ a[3]; + + uint32_t t, t2, t3; + t = (ab & 0x80808080); + t2 = (bc & 0x80808080); + t3 = (cd & 0x80808080); + + uint32_t abx = (t >> 7) * 27U ^ ((ab^t) << 1); + uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1); + uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1); + + W[idx + i] = bc ^ a[3] ^ abx; + W[idx + i + 4] = a[0] ^ cd ^ bcx; + W[idx + i + 8] = ab ^ a[3] ^ cdx; + W[idx + i + 12] = ab ^ a[2] ^ (abx ^ bcx ^ cdx); + } + } +} + +__device__ __forceinline__ +void cuda_echo_round_80(uint32_t *const __restrict__ sharedMemory, uint32_t *const __restrict__ data, const uint32_t nonce, uint32_t *hash) +{ + uint32_t h[29]; // <= 127 bytes input + + #pragma unroll 8 + for (int i = 0; i < 18; i += 2) + AS_UINT2(&h[i]) = AS_UINT2(&data[i]); + h[18] = data[18]; + h[19] = cuda_swab32(nonce); + h[20] = 0x80; + h[21] = h[22] = h[23] = h[24] = h[25] = h[26] = 0; + //((uint8_t*)h)[80] = 0x80; + //((uint8_t*)h)[128-17] = 0x02; + //((uint8_t*)h)[128-16] = 0x80; + //((uint8_t*)h)[128-15] = 0x02; + h[27] = 0x2000000; + h[28] = 0x280; + //h[29] = h[30] = h[31] = 0; + + uint32_t k0 = 640; // bitlen + uint32_t W[64]; + + #pragma unroll 8 + for (int i = 0; i < 32; i+=4) { + W[i] = 512; // L + W[i+1] = 0; // H + W[i+2] = 0; // X + W[i+3] = 0; + } + + uint32_t Z[16]; + #pragma unroll + for (int i = 0; i<16; i++) Z[i] = W[i]; + #pragma unroll + for (int i = 32; i<61; i++) W[i] = h[i - 32]; + #pragma unroll + for (int i = 61; i<64; i++) W[i] = 0; + + for (int i = 0; i < 10; i++) + echo_round(sharedMemory, W, k0); + + #pragma unroll 16 + for (int i = 0; i < 16; i++) { + Z[i] ^= h[i] ^ W[i] ^ W[i + 32]; + } + + #pragma unroll 8 + for (int i = 0; i < 16; i += 2) + AS_UINT2(&hash[i]) = AS_UINT2(&Z[i]); +} + +__device__ __forceinline__ +void echo_gpu_init(uint32_t *const __restrict__ sharedMemory) +{ + /* each thread startup will fill a uint32 */ + if (threadIdx.x < 128) { + sharedMemory[threadIdx.x] = d_AES0[threadIdx.x]; + sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x]; + sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x]; + sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x]; + + sharedMemory[threadIdx.x + 64 * 2] = d_AES0[threadIdx.x + 64 * 2]; + sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2]; + sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2]; + sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2]; + } +} + +__host__ +void x16_echo512_cuda_init(int thr_id, const uint32_t threads) +{ + aes_cpu_init(thr_id); +} + +__constant__ static 
uint32_t c_PaddedMessage80[20]; + +__host__ +void x16_echo512_setBlock_80(void *endiandata) +{ + cudaMemcpyToSymbol(c_PaddedMessage80, endiandata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice); +} + +__global__ __launch_bounds__(128, 7) /* will force 72 registers */ +void x16_echo512_gpu_hash_80(uint32_t threads, uint32_t startNonce, uint64_t *g_hash) +{ + __shared__ uint32_t sharedMemory[1024]; + + echo_gpu_init(sharedMemory); + __threadfence_block(); + + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint64_t hashPosition = thread; + uint32_t *pHash = (uint32_t*)&g_hash[hashPosition<<3]; + + cuda_echo_round_80(sharedMemory, c_PaddedMessage80, startNonce + thread, pHash); + } +} + +__host__ +void x16_echo512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash) +{ + const uint32_t threadsperblock = 128; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + x16_echo512_gpu_hash_80<<>>(threads, startNonce, (uint64_t*)d_hash); +} diff --git a/x16r/cuda_x16_fugue512.cu b/x16r/cuda_x16_fugue512.cu new file mode 100644 index 0000000..7f3438c --- /dev/null +++ b/x16r/cuda_x16_fugue512.cu @@ -0,0 +1,467 @@ + +#include +#include + +#define TPB 256 + +/* + * fugue512-80 x16r kernel implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2018 tpruvot + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + */ + +#ifdef __INTELLISENSE__ +#define __byte_perm(x, y, m) (x|y) +#define tex1Dfetch(t, n) (n) +#define __CUDACC__ +#include +#endif + +// store allocated textures device addresses +static unsigned int* d_textures[MAX_GPUS][1]; + +#define mixtab0(x) mixtabs[(x)] +#define mixtab1(x) mixtabs[(x)+256] +#define mixtab2(x) mixtabs[(x)+512] +#define mixtab3(x) mixtabs[(x)+768] + +static texture mixTab0Tex; + +static const uint32_t mixtab0[] = { + 0x63633297, 0x7c7c6feb, 0x77775ec7, 0x7b7b7af7, 0xf2f2e8e5, 0x6b6b0ab7, 0x6f6f16a7, 0xc5c56d39, + 0x303090c0, 0x01010704, 0x67672e87, 0x2b2bd1ac, 0xfefeccd5, 0xd7d71371, 0xabab7c9a, 0x767659c3, + 0xcaca4005, 0x8282a33e, 0xc9c94909, 0x7d7d68ef, 0xfafad0c5, 0x5959947f, 0x4747ce07, 0xf0f0e6ed, + 0xadad6e82, 0xd4d41a7d, 0xa2a243be, 0xafaf608a, 0x9c9cf946, 0xa4a451a6, 0x727245d3, 0xc0c0762d, + 0xb7b728ea, 0xfdfdc5d9, 0x9393d47a, 0x2626f298, 0x363682d8, 0x3f3fbdfc, 0xf7f7f3f1, 0xcccc521d, + 0x34348cd0, 0xa5a556a2, 0xe5e58db9, 0xf1f1e1e9, 0x71714cdf, 0xd8d83e4d, 0x313197c4, 0x15156b54, + 0x04041c10, 0xc7c76331, 0x2323e98c, 0xc3c37f21, 0x18184860, 0x9696cf6e, 0x05051b14, 0x9a9aeb5e, + 0x0707151c, 0x12127e48, 0x8080ad36, 0xe2e298a5, 0xebeba781, 0x2727f59c, 0xb2b233fe, 0x757550cf, + 0x09093f24, 0x8383a43a, 0x2c2cc4b0, 0x1a1a4668, 0x1b1b416c, 0x6e6e11a3, 0x5a5a9d73, 0xa0a04db6, + 0x5252a553, 0x3b3ba1ec, 0xd6d61475, 0xb3b334fa, 0x2929dfa4, 0xe3e39fa1, 0x2f2fcdbc, 0x8484b126, + 0x5353a257, 0xd1d10169, 0x00000000, 0xededb599, 0x2020e080, 0xfcfcc2dd, 0xb1b13af2, 0x5b5b9a77, + 0x6a6a0db3, 0xcbcb4701, 0xbebe17ce, 0x3939afe4, 0x4a4aed33, 0x4c4cff2b, 0x5858937b, 0xcfcf5b11, + 0xd0d0066d, 0xefefbb91, 0xaaaa7b9e, 0xfbfbd7c1, 0x4343d217, 0x4d4df82f, 0x333399cc, 0x8585b622, + 0x4545c00f, 0xf9f9d9c9, 0x02020e08, 0x7f7f66e7, 0x5050ab5b, 0x3c3cb4f0, 0x9f9ff04a, 0xa8a87596, + 0x5151ac5f, 0xa3a344ba, 0x4040db1b, 0x8f8f800a, 0x9292d37e, 0x9d9dfe42, 0x3838a8e0, 0xf5f5fdf9, + 0xbcbc19c6, 0xb6b62fee, 0xdada3045, 0x2121e784, 0x10107040, 0xffffcbd1, 0xf3f3efe1, 0xd2d20865, + 0xcdcd5519, 0x0c0c2430, 0x1313794c, 0xececb29d, 0x5f5f8667, 0x9797c86a, 0x4444c70b, 0x1717655c, + 0xc4c46a3d, 0xa7a758aa, 0x7e7e61e3, 0x3d3db3f4, 0x6464278b, 0x5d5d886f, 0x19194f64, 0x737342d7, + 0x60603b9b, 0x8181aa32, 0x4f4ff627, 0xdcdc225d, 0x2222ee88, 0x2a2ad6a8, 0x9090dd76, 0x88889516, + 0x4646c903, 0xeeeebc95, 0xb8b805d6, 0x14146c50, 0xdede2c55, 0x5e5e8163, 0x0b0b312c, 0xdbdb3741, + 0xe0e096ad, 0x32329ec8, 0x3a3aa6e8, 0x0a0a3628, 0x4949e43f, 0x06061218, 0x2424fc90, 0x5c5c8f6b, + 0xc2c27825, 0xd3d30f61, 0xacac6986, 0x62623593, 0x9191da72, 0x9595c662, 0xe4e48abd, 0x797974ff, + 0xe7e783b1, 0xc8c84e0d, 0x373785dc, 0x6d6d18af, 0x8d8d8e02, 0xd5d51d79, 0x4e4ef123, 0xa9a97292, + 0x6c6c1fab, 0x5656b943, 0xf4f4fafd, 0xeaeaa085, 0x6565208f, 0x7a7a7df3, 0xaeae678e, 0x08083820, + 0xbaba0bde, 0x787873fb, 0x2525fb94, 0x2e2ecab8, 0x1c1c5470, 0xa6a65fae, 0xb4b421e6, 0xc6c66435, + 0xe8e8ae8d, 0xdddd2559, 0x747457cb, 0x1f1f5d7c, 0x4b4bea37, 0xbdbd1ec2, 0x8b8b9c1a, 0x8a8a9b1e, + 0x70704bdb, 0x3e3ebaf8, 0xb5b526e2, 0x66662983, 0x4848e33b, 0x0303090c, 0xf6f6f4f5, 0x0e0e2a38, + 0x61613c9f, 0x35358bd4, 0x5757be47, 0xb9b902d2, 0x8686bf2e, 0xc1c17129, 0x1d1d5374, 0x9e9ef74e, + 0xe1e191a9, 0xf8f8decd, 0x9898e556, 0x11117744, 0x696904bf, 0xd9d93949, 0x8e8e870e, 0x9494c166, + 0x9b9bec5a, 0x1e1e5a78, 0x8787b82a, 0xe9e9a989, 0xcece5c15, 0x5555b04f, 0x2828d8a0, 0xdfdf2b51, + 0x8c8c8906, 0xa1a14ab2, 0x89899212, 0x0d0d2334, 0xbfbf10ca, 0xe6e684b5, 0x4242d513, 
0x686803bb, + 0x4141dc1f, 0x9999e252, 0x2d2dc3b4, 0x0f0f2d3c, 0xb0b03df6, 0x5454b74b, 0xbbbb0cda, 0x16166258 +}; + +#define TIX4(q, x00, x01, x04, x07, x08, x22, x24, x27, x30) { \ + x22 ^= x00; \ + x00 = (q); \ + x08 ^= x00; \ + x01 ^= x24; \ + x04 ^= x27; \ + x07 ^= x30; \ +} + +#define CMIX36(x00, x01, x02, x04, x05, x06, x18, x19, x20) { \ + x00 ^= x04; \ + x01 ^= x05; \ + x02 ^= x06; \ + x18 ^= x04; \ + x19 ^= x05; \ + x20 ^= x06; \ +} + +#define SMIX(x0, x1, x2, x3) { \ + uint32_t tmp; \ + uint32_t r0 = 0; \ + uint32_t r1 = 0; \ + uint32_t r2 = 0; \ + uint32_t r3 = 0; \ + uint32_t c0 = mixtab0(x0 >> 24); \ + tmp = mixtab1((x0 >> 16) & 0xFF); \ + c0 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2((x0 >> 8) & 0xFF); \ + c0 ^= tmp; \ + r2 ^= tmp; \ + tmp = mixtab3(x0 & 0xFF); \ + c0 ^= tmp; \ + r3 ^= tmp; \ + tmp = mixtab0(x1 >> 24); \ + uint32_t c1 = tmp; \ + r0 ^= tmp; \ + tmp = mixtab1((x1 >> 16) & 0xFF); \ + c1 ^= tmp; \ + tmp = mixtab2((x1 >> 8) & 0xFF); \ + c1 ^= tmp; \ + r2 ^= tmp; \ + tmp = mixtab3(x1 & 0xFF); \ + c1 ^= tmp; \ + r3 ^= tmp; \ + tmp = mixtab0(x2 >> 24); \ + uint32_t c2 = tmp; \ + r0 ^= tmp; \ + tmp = mixtab1((x2 >> 16) & 0xFF); \ + c2 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2((x2 >> 8) & 0xFF); \ + c2 ^= tmp; \ + tmp = mixtab3(x2 & 0xFF); \ + c2 ^= tmp; \ + r3 ^= tmp; \ + tmp = mixtab0(x3 >> 24); \ + uint32_t c3 = tmp; \ + r0 ^= tmp; \ + tmp = mixtab1((x3 >> 16) & 0xFF); \ + c3 ^= tmp; \ + r1 ^= tmp; \ + tmp = mixtab2((x3 >> 8) & 0xFF); \ + c3 ^= tmp; \ + r2 ^= tmp; \ + tmp = mixtab3(x3 & 0xFF); \ + c3 ^= tmp; \ + x0 = ((c0 ^ r0) & 0xFF000000) | ((c1 ^ r1) & 0x00FF0000) \ + | ((c2 ^ r2) & 0x0000FF00) | ((c3 ^ r3) & 0x000000FF); \ + x1 = ((c1 ^ (r0 << 8)) & 0xFF000000) | ((c2 ^ (r1 << 8)) & 0x00FF0000) \ + | ((c3 ^ (r2 << 8)) & 0x0000FF00) | ((c0 ^ (r3 >> 24)) & 0x000000FF); \ + x2 = ((c2 ^ (r0 << 16)) & 0xFF000000) | ((c3 ^ (r1 << 16)) & 0x00FF0000) \ + | ((c0 ^ (r2 >> 16)) & 0x0000FF00) | ((c1 ^ (r3 >> 16)) & 0x000000FF); \ + x3 = ((c3 ^ (r0 << 24)) & 0xFF000000) | ((c0 ^ (r1 >> 8)) & 0x00FF0000) \ + | ((c1 ^ (r2 >> 8)) & 0x0000FF00) | ((c2 ^ (r3 >> 8)) & 0x000000FF); \ +} + +#define SUB_ROR3 { \ + B33 = S33, B34 = S34, B35 = S35; \ + S35 = S32; S34 = S31; S33 = S30; S32 = S29; S31 = S28; S30 = S27; S29 = S26; S28 = S25; S27 = S24; \ + S26 = S23; S25 = S22; S24 = S21; S23 = S20; S22 = S19; S21 = S18; S20 = S17; S19 = S16; S18 = S15; \ + S17 = S14; S16 = S13; S15 = S12; S14 = S11; S13 = S10; S12 = S09; S11 = S08; S10 = S07; S09 = S06; \ + S08 = S05; S07 = S04; S06 = S03; S05 = S02; S04 = S01; S03 = S00; S02 = B35; S01 = B34; S00 = B33; \ +} + +#define SUB_ROR8 { \ + B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \ + S35 = S27; S34 = S26; S33 = S25; S32 = S24; S31 = S23; S30 = S22; S29 = S21; S28 = S20; S27 = S19; \ + S26 = S18; S25 = S17; S24 = S16; S23 = S15; S22 = S14; S21 = S13; S20 = S12; S19 = S11; S18 = S10; \ + S17 = S09; S16 = S08; S15 = S07; S14 = S06; S13 = S05; S12 = S04; S11 = S03; S10 = S02; S09 = S01; \ + S08 = S00; S07 = B35; S06 = B34; S05 = B33; S04 = B32; S03 = B31; S02 = B30; S01 = B29; S00 = B28; \ +} + +#define SUB_ROR9 { \ + B27 = S27, B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \ + S35 = S26; S34 = S25; S33 = S24; S32 = S23; S31 = S22; S30 = S21; S29 = S20; S28 = S19; S27 = S18; \ + S26 = S17; S25 = S16; S24 = S15; S23 = S14; S22 = S13; S21 = S12; S20 = S11; S19 = S10; S18 = S09; \ + S17 = S08; S16 = S07; S15 = S06; S14 = S05; S13 = S04; S12 = S03; S11 = S02; S10 = 
S01; S09 = S00; \ + S08 = B35; S07 = B34; S06 = B33; S05 = B32; S04 = B31; S03 = B30; S02 = B29; S01 = B28; S00 = B27; \ +} + +#define SUB_ROR9_3 { \ + SUB_ROR3; SUB_ROR3; SUB_ROR3; \ +} + +#define SUB_ROR12 { /* to fix */ \ + B24 = S00; B25 = S01; B26 = S02; B27 = S03; B28 = S04; B29 = S05; B30 = S06; B31 = S07; B32 = S08; B33 = S09; B34 = S10; B35 = S11; \ + S00 = S12; S01 = S13; S02 = S14; S03 = S15; S04 = S16; S05 = S17; S06 = S18; S07 = S19; S08 = S20; S09 = S21; S10 = S22; S11 = S23; \ + S12 = S24; S13 = S25; S14 = S26; S15 = S27; S16 = S28; S17 = S29; S18 = S30; S19 = S31; S20 = S32; S21 = S33; S22 = S34; S23 = S35; \ + S24 = B24; S25 = B25; S26 = B26; S27 = B27; S28 = B28; S29 = B29; S30 = B30; S31 = B31; S32 = B32; S33 = B33; S34 = B34; S35 = B35; \ +} + +#define FUGUE512_3(x, y, z) { \ + TIX4(x, S00, S01, S04, S07, S08, S22, S24, S27, S30); \ + CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \ + SMIX(S33, S34, S35, S00); \ + CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \ + SMIX(S30, S31, S32, S33); \ + CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \ + SMIX(S27, S28, S29, S30); \ + CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \ + SMIX(S24, S25, S26, S27); \ + \ + TIX4(y, S24, S25, S28, S31, S32, S10, S12, S15, S18); \ + CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); \ + SMIX(S21, S22, S23, S24); \ + CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); \ + SMIX(S18, S19, S20, S21); \ + CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); \ + SMIX(S15, S16, S17, S18); \ + CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); \ + SMIX(S12, S13, S14, S15); \ + \ + TIX4(z, S12, S13, S16, S19, S20, S34, S00, S03, S06); \ + CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); \ + SMIX(S09, S10, S11, S12); \ + CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); \ + SMIX(S06, S07, S08, S09); \ + CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); \ + SMIX(S03, S04, S05, S06); \ + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); \ + SMIX(S00, S01, S02, S03); \ +} + +#define FUGUE512_F(w, x, y, z) { \ + TIX4(w, S00, S01, S04, S07, S08, S22, S24, S27, S30); \ + CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \ + SMIX(S33, S34, S35, S00); \ + CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \ + SMIX(S30, S31, S32, S33); \ + CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \ + SMIX(S27, S28, S29, S30); \ + CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \ + SMIX(S24, S25, S26, S27); \ + \ + TIX4(x, S24, S25, S28, S31, S32, S10, S12, S15, S18); \ + CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); \ + SMIX(S21, S22, S23, S24); \ + CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); \ + SMIX(S18, S19, S20, S21); \ + CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); \ + SMIX(S15, S16, S17, S18); \ + CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); \ + SMIX(S12, S13, S14, S15); \ + \ + TIX4(y, S12, S13, S16, S19, S20, S34, S00, S03, S06); \ + CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); \ + SMIX(S09, S10, S11, S12); \ + CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); \ + SMIX(S06, S07, S08, S09); \ + CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); \ + SMIX(S03, S04, S05, S06); \ + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); \ + SMIX(S00, S01, S02, S03); \ + \ + TIX4(z, S00, S01, S04, S07, S08, S22, S24, S27, S30); \ + CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \ + SMIX(S33, S34, S35, S00); \ + CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \ + SMIX(S30, S31, S32, S33); \ + CMIX36(S27, S28, 
S29, S31, S32, S33, S09, S10, S11); \ + SMIX(S27, S28, S29, S30); \ + CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \ + SMIX(S24, S25, S26, S27); \ +} + +#undef ROL8 +#ifdef __CUDA_ARCH__ +__device__ __forceinline__ +uint32_t ROL8(const uint32_t a) { + return __byte_perm(a, 0, 0x2103); +} +__device__ __forceinline__ +uint32_t ROR8(const uint32_t a) { + return __byte_perm(a, 0, 0x0321); +} +__device__ __forceinline__ +uint32_t ROL16(const uint32_t a) { + return __byte_perm(a, 0, 0x1032); +} +#else +#define ROL8(u) ROTL32(u, 8) +#define ROR8(u) ROTR32(u, 8) +#define ROL16(u) ROTL32(u,16) +#endif + +//#define AS_UINT4(addr) *((uint4*)(addr)) + +__constant__ static uint64_t c_PaddedMessage80[10]; + +__host__ +void x16_fugue512_setBlock_80(void *pdata) +{ + cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice); +} + +/***************************************************/ + +__global__ +__launch_bounds__(TPB) +void x16_fugue512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash) +{ + __shared__ uint32_t mixtabs[1024]; + + // load shared mem (with 256 threads) + const uint32_t thr = threadIdx.x & 0xFF; + const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr); + mixtabs[thr] = tmp; + mixtabs[thr+256] = ROR8(tmp); + mixtabs[thr+512] = ROL16(tmp); + mixtabs[thr+768] = ROL8(tmp); +#if TPB <= 256 + if (blockDim.x < 256) { + const uint32_t thr = (threadIdx.x + 0x80) & 0xFF; + const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr); + mixtabs[thr] = tmp; + mixtabs[thr + 256] = ROR8(tmp); + mixtabs[thr + 512] = ROL16(tmp); + mixtabs[thr + 768] = ROL8(tmp); + } +#endif + + __syncthreads(); + + uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t Data[20]; + + #pragma unroll + for(int i = 0; i < 10; i++) + AS_UINT2(&Data[i * 2]) = AS_UINT2(&c_PaddedMessage80[i]); + Data[19] = (startNonce + thread); + + uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09, S10, S11; + uint32_t S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22, S23; + uint32_t S24, S25, S26, S27, S28, S29, S30, S31, S32, S33, S34, S35; + //uint32_t B24, B25, B26, + uint32_t B27, B28, B29, B30, B31, B32, B33, B34, B35; + //const uint64_t bc = 640 bits to hash + //const uint32_t bclo = (uint32_t)(bc); + //const uint32_t bchi = (uint32_t)(bc >> 32); + + S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = 0; + S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0; + S20 = 0x8807a57e; S21 = 0xe616af75; S22 = 0xc5d3e4db; S23 = 0xac9ab027; + S24 = 0xd915f117; S25 = 0xb6eecc54; S26 = 0x06e8020b; S27 = 0x4a92efd1; + S28 = 0xaac6e2c9; S29 = 0xddb21398; S30 = 0xcae65838; S31 = 0x437f203f; + S32 = 0x25ea78e7; S33 = 0x951fddd6; S34 = 0xda6ed11d; S35 = 0xe13e3567; + + FUGUE512_3((Data[ 0]), (Data[ 1]), (Data[ 2])); + FUGUE512_3((Data[ 3]), (Data[ 4]), (Data[ 5])); + FUGUE512_3((Data[ 6]), (Data[ 7]), (Data[ 8])); + FUGUE512_3((Data[ 9]), (Data[10]), (Data[11])); + FUGUE512_3((Data[12]), (Data[13]), (Data[14])); + FUGUE512_3((Data[15]), (Data[16]), (Data[17])); + FUGUE512_F((Data[18]), (Data[19]), 0/*bchi*/, (80*8)/*bclo*/); + + // rotate right state by 3 dwords (S00 = S33, S03 = S00) + SUB_ROR3; + SUB_ROR9; + + #pragma unroll 32 + for (int i = 0; i < 32; i++) { + SUB_ROR3; + CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); + SMIX(S00, S01, S02, S03); + } + #pragma unroll 13 + for (int i = 0; i < 13; i++) { + S04 ^= S00; + S09 ^= S00; + S18 ^= S00; + S27 ^= S00; + SUB_ROR9; + SMIX(S00, S01, S02, S03); + S04 ^= S00; + 
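+			// note: each of the 13 closing rounds folds S00 into four state words,
+			// then runs SUB_ROR9 (SUB_ROR8 on the last sub-step) and SMIX; the xored
+			// indices drift one slot per sub-step:
+			// (S04,S09,S18,S27) -> (S04,S10,S18,S27) -> (S04,S10,S19,S27) -> (S04,S10,S19,S28).
+			// This appears to mirror the final-round schedule of the reference
+			// sph fugue512 close function.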
S10 ^= S00; + S18 ^= S00; + S27 ^= S00; + SUB_ROR9; + SMIX(S00, S01, S02, S03); + S04 ^= S00; + S10 ^= S00; + S19 ^= S00; + S27 ^= S00; + SUB_ROR9; + SMIX(S00, S01, S02, S03); + S04 ^= S00; + S10 ^= S00; + S19 ^= S00; + S28 ^= S00; + SUB_ROR8; + SMIX(S00, S01, S02, S03); + } + S04 ^= S00; + S09 ^= S00; + S18 ^= S00; + S27 ^= S00; + + Data[ 0] = cuda_swab32(S01); + Data[ 1] = cuda_swab32(S02); + Data[ 2] = cuda_swab32(S03); + Data[ 3] = cuda_swab32(S04); + Data[ 4] = cuda_swab32(S09); + Data[ 5] = cuda_swab32(S10); + Data[ 6] = cuda_swab32(S11); + Data[ 7] = cuda_swab32(S12); + Data[ 8] = cuda_swab32(S18); + Data[ 9] = cuda_swab32(S19); + Data[10] = cuda_swab32(S20); + Data[11] = cuda_swab32(S21); + Data[12] = cuda_swab32(S27); + Data[13] = cuda_swab32(S28); + Data[14] = cuda_swab32(S29); + Data[15] = cuda_swab32(S30); + + const size_t hashPosition = thread; + uint64_t* pHash = &g_hash[hashPosition << 3]; + #pragma unroll 4 + for(int i = 0; i < 4; i++) + AS_UINT4(&pHash[i * 2]) = AS_UINT4(&Data[i * 4]); + } +} + +#define texDef(id, texname, texmem, texsource, texsize) { \ + unsigned int *texmem; \ + cudaMalloc(&texmem, texsize); \ + d_textures[thr_id][id] = texmem; \ + cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \ + texname.normalized = 0; \ + texname.filterMode = cudaFilterModePoint; \ + texname.addressMode[0] = cudaAddressModeClamp; \ + { cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(); \ + cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \ + } \ +} + +__host__ +void x16_fugue512_cpu_init(int thr_id, uint32_t threads) +{ + texDef(0, mixTab0Tex, mixTab0m, mixtab0, sizeof(uint32_t)*256); +} + +__host__ +void x16_fugue512_cpu_free(int thr_id) +{ + cudaFree(d_textures[thr_id][0]); +} + +__host__ +void x16_fugue512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash) +{ + const uint32_t threadsperblock = TPB; + + dim3 grid((threads + threadsperblock-1)/threadsperblock); + dim3 block(threadsperblock); + + x16_fugue512_gpu_hash_80 <<>> (threads, startNonce, (uint64_t*)d_hash); +} diff --git a/x16r/cuda_x16_shabal512.cu b/x16r/cuda_x16_shabal512.cu new file mode 100644 index 0000000..ca00b50 --- /dev/null +++ b/x16r/cuda_x16_shabal512.cu @@ -0,0 +1,350 @@ +/* +* Shabal-512 for X16R +* tpruvot 2018, based on alexis x14 and xevan kernlx code +*/ + +#include +#include +#include + +typedef uint32_t sph_u32; + +#define C32(x) (x) +#define T32(x) (x) + +#define INPUT_BLOCK_ADD do { \ + B0 = T32(B0 + M0); \ + B1 = T32(B1 + M1); \ + B2 = T32(B2 + M2); \ + B3 = T32(B3 + M3); \ + B4 = T32(B4 + M4); \ + B5 = T32(B5 + M5); \ + B6 = T32(B6 + M6); \ + B7 = T32(B7 + M7); \ + B8 = T32(B8 + M8); \ + B9 = T32(B9 + M9); \ + BA = T32(BA + MA); \ + BB = T32(BB + MB); \ + BC = T32(BC + MC); \ + BD = T32(BD + MD); \ + BE = T32(BE + ME); \ + BF = T32(BF + MF); \ + } while (0) + +#define INPUT_BLOCK_SUB do { \ + C0 = T32(C0 - M0); \ + C1 = T32(C1 - M1); \ + C2 = T32(C2 - M2); \ + C3 = T32(C3 - M3); \ + C4 = T32(C4 - M4); \ + C5 = T32(C5 - M5); \ + C6 = T32(C6 - M6); \ + C7 = T32(C7 - M7); \ + C8 = T32(C8 - M8); \ + C9 = T32(C9 - M9); \ + CA = T32(CA - MA); \ + CB = T32(CB - MB); \ + CC = T32(CC - MC); \ + CD = T32(CD - MD); \ + CE = T32(CE - ME); \ + CF = T32(CF - MF); \ + } while (0) + +#define XOR_W do { \ + A00 ^= Wlow; \ + A01 ^= Whigh; \ + } while (0) + +#define SWAP(v1, v2) do { \ + sph_u32 tmp = (v1); \ + (v1) = (v2); \ + (v2) = tmp; \ + } while (0) + +#define SWAP_BC do { \ + SWAP(B0, C0); \ + SWAP(B1, C1); \ + SWAP(B2, C2); 
\ + SWAP(B3, C3); \ + SWAP(B4, C4); \ + SWAP(B5, C5); \ + SWAP(B6, C6); \ + SWAP(B7, C7); \ + SWAP(B8, C8); \ + SWAP(B9, C9); \ + SWAP(BA, CA); \ + SWAP(BB, CB); \ + SWAP(BC, CC); \ + SWAP(BD, CD); \ + SWAP(BE, CE); \ + SWAP(BF, CF); \ + } while (0) + +#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) do { \ + xa0 = T32((xa0 \ + ^ (((xa1 << 15) | (xa1 >> 17)) * 5U) \ + ^ xc) * 3U) \ + ^ xb1 ^ (xb2 & ~xb3) ^ xm; \ + xb0 = T32(~(((xb0 << 1) | (xb0 >> 31)) ^ xa0)); \ + } while (0) + +#define PERM_STEP_0 do { \ + PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \ + } while (0) + +#define PERM_STEP_1 do { \ + PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \ + } while (0) + +#define PERM_STEP_2 do { \ + PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \ + PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \ + PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \ + PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \ + PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \ + PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \ + PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \ + PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \ + PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \ + PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \ + PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \ + PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \ + PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \ + PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \ + PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \ + PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \ + } while (0) + +#define APPLY_P do { \ + B0 = T32(B0 << 17) | (B0 >> 15); \ + B1 = T32(B1 << 17) | (B1 >> 15); \ + B2 = T32(B2 << 17) | (B2 >> 15); \ + B3 = T32(B3 << 17) | (B3 >> 15); \ + B4 = T32(B4 << 17) | (B4 >> 15); \ + B5 = T32(B5 << 17) | (B5 >> 15); \ + B6 = T32(B6 << 17) | (B6 >> 15); \ + B7 = T32(B7 << 17) | (B7 >> 15); \ + B8 = T32(B8 << 17) | (B8 >> 15); \ + B9 = T32(B9 << 17) | (B9 >> 15); \ + BA = T32(BA << 17) | (BA >> 15); \ + BB = T32(BB << 17) | (BB >> 15); \ + BC = T32(BC << 17) | (BC >> 15); \ + BD = T32(BD << 17) | (BD >> 15); \ + BE = T32(BE << 17) | (BE >> 15); \ + BF = T32(BF << 17) | (BF >> 15); \ + 
PERM_STEP_0; \ + PERM_STEP_1; \ + PERM_STEP_2; \ + A0B = T32(A0B + C6); \ + A0A = T32(A0A + C5); \ + A09 = T32(A09 + C4); \ + A08 = T32(A08 + C3); \ + A07 = T32(A07 + C2); \ + A06 = T32(A06 + C1); \ + A05 = T32(A05 + C0); \ + A04 = T32(A04 + CF); \ + A03 = T32(A03 + CE); \ + A02 = T32(A02 + CD); \ + A01 = T32(A01 + CC); \ + A00 = T32(A00 + CB); \ + A0B = T32(A0B + CA); \ + A0A = T32(A0A + C9); \ + A09 = T32(A09 + C8); \ + A08 = T32(A08 + C7); \ + A07 = T32(A07 + C6); \ + A06 = T32(A06 + C5); \ + A05 = T32(A05 + C4); \ + A04 = T32(A04 + C3); \ + A03 = T32(A03 + C2); \ + A02 = T32(A02 + C1); \ + A01 = T32(A01 + C0); \ + A00 = T32(A00 + CF); \ + A0B = T32(A0B + CE); \ + A0A = T32(A0A + CD); \ + A09 = T32(A09 + CC); \ + A08 = T32(A08 + CB); \ + A07 = T32(A07 + CA); \ + A06 = T32(A06 + C9); \ + A05 = T32(A05 + C8); \ + A04 = T32(A04 + C7); \ + A03 = T32(A03 + C6); \ + A02 = T32(A02 + C5); \ + A01 = T32(A01 + C4); \ + A00 = T32(A00 + C3); \ + } while (0) + +#define INCR_W do { \ + if ((Wlow = T32(Wlow + 1)) == 0) \ + Whigh = T32(Whigh + 1); \ + } while (0) + +__constant__ static const sph_u32 A_init_512[] = { + C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632), + C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B), + C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F) +}; + +__constant__ static const sph_u32 B_init_512[] = { + C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640), + C32(0xEB6F56C7), C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08), + C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E), + C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B) +}; + +__constant__ static const sph_u32 C_init_512[] = { + C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359), + C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780), + C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A), + C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969) +}; + +__constant__ static uint32_t c_PaddedMessage80[20]; + +__host__ +void x16_shabal512_setBlock_80(void *pdata) +{ + cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice); +} + +#define TPB_SHABAL 256 + +__global__ __launch_bounds__(TPB_SHABAL, 2) +void x16_shabal512_gpu_hash_80(uint32_t threads, const uint32_t startNonce, uint32_t *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + + uint32_t B[] = { + 0xC1099CB7, 0x07B385F3, 0xE7442C26, 0xCC8AD640, 0xEB6F56C7, 0x1EA81AA9, 0x73B9D314, 0x1DE85D08, + 0x48910A5A, 0x893B22DB, 0xC5A0DF44, 0xBBC4324E, 0x72D2F240, 0x75941D99, 0x6D8BDE82, 0xA1A7502B + }; + uint32_t M[16]; + + if (thread < threads) + { + // todo: try __ldc + *(uint2x4*)&M[0] = *(uint2x4*)&c_PaddedMessage80[0]; + *(uint2x4*)&M[8] = *(uint2x4*)&c_PaddedMessage80[8]; + + sph_u32 A00 = A_init_512[0], A01 = A_init_512[1], A02 = A_init_512[ 2], A03 = A_init_512[ 3]; + sph_u32 A04 = A_init_512[4], A05 = A_init_512[5], A06 = A_init_512[ 6], A07 = A_init_512[ 7]; + sph_u32 A08 = A_init_512[8], A09 = A_init_512[9], A0A = A_init_512[10], A0B = A_init_512[11]; + + sph_u32 B0 = B_init_512[ 0], B1 = B_init_512[ 1], B2 = B_init_512[ 2], B3 = B_init_512 [3]; + sph_u32 B4 = B_init_512[ 4], B5 = B_init_512[ 5], B6 = B_init_512[ 6], B7 = B_init_512[ 7]; + sph_u32 B8 = B_init_512[ 8], B9 = B_init_512[ 9], BA = B_init_512[10], BB = B_init_512[11]; + sph_u32 BC = B_init_512[12], BD = B_init_512[13], BE = B_init_512[14], BF = B_init_512[15]; + + sph_u32 C0 = 
C_init_512[ 0], C1 = C_init_512[ 1], C2 = C_init_512[ 2], C3 = C_init_512[ 3]; + sph_u32 C4 = C_init_512[ 4], C5 = C_init_512[ 5], C6 = C_init_512[ 6], C7 = C_init_512[ 7]; + sph_u32 C8 = C_init_512[ 8], C9 = C_init_512[ 9], CA = C_init_512[10], CB = C_init_512[11]; + sph_u32 CC = C_init_512[12], CD = C_init_512[13], CE = C_init_512[14], CF = C_init_512[15]; + + sph_u32 M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF; + sph_u32 Wlow = 1, Whigh = 0; + + M0 = M[ 0]; + M1 = M[ 1]; + M2 = M[ 2]; + M3 = M[ 3]; + M4 = M[ 4]; + M5 = M[ 5]; + M6 = M[ 6]; + M7 = M[ 7]; + M8 = M[ 8]; + M9 = M[ 9]; + MA = M[10]; + MB = M[11]; + MC = M[12]; + MD = M[13]; + ME = M[14]; + MF = M[15]; + + INPUT_BLOCK_ADD; + XOR_W; + APPLY_P; + INPUT_BLOCK_SUB; + SWAP_BC; + INCR_W; + + M0 = c_PaddedMessage80[16]; + M1 = c_PaddedMessage80[17]; + M2 = c_PaddedMessage80[18]; + M3 = cuda_swab32(startNonce + thread); + M4 = 0x80; + M5 = M6 = M7 = M8 = M9 = MA = MB = MC = MD = ME = MF = 0; + + INPUT_BLOCK_ADD; + XOR_W; + APPLY_P; + + for (unsigned i = 0; i < 3; i++) { + SWAP_BC; + XOR_W; + APPLY_P; + } + + B[ 0] = B0; + B[ 1] = B1; + B[ 2] = B2; + B[ 3] = B3; + B[ 4] = B4; + B[ 5] = B5; + B[ 6] = B6; + B[ 7] = B7; + B[ 8] = B8; + B[ 9] = B9; + B[10] = BA; + B[11] = BB; + B[12] = BC; + B[13] = BD; + B[14] = BE; + B[15] = BF; + + // output + uint64_t hashPosition = thread; + uint32_t *Hash = &g_hash[hashPosition << 4]; + *(uint2x4*)&Hash[0] = *(uint2x4*)&B[0]; + *(uint2x4*)&Hash[8] = *(uint2x4*)&B[8]; + } +} + +__host__ +void x16_shabal512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash) +{ + const uint32_t threadsperblock = TPB_SHABAL; + + dim3 grid((threads + threadsperblock - 1) / threadsperblock); + dim3 block(threadsperblock); + + x16_shabal512_gpu_hash_80 <<>>(threads, startNonce, d_hash); +} diff --git a/x16r/cuda_x16_simd512_80.cu b/x16r/cuda_x16_simd512_80.cu new file mode 100644 index 0000000..142180a --- /dev/null +++ b/x16r/cuda_x16_simd512_80.cu @@ -0,0 +1,1836 @@ +/** + * SIMD512 CUDA IMPLEMENTATION based on sph simd code + * tpruvot 2018 (with the help of kernelx xevan code) + */ + +#include +#include +#include + +#ifdef __INTELLISENSE__ +/* just for vstudio code colors */ +#define __CUDA_ARCH__ 500 +#define __byte_perm(x, y, m) (x|y) +#endif + +#define TPB50_1 128 +#define TPB50_2 128 +#define TPB52_1 128 +#define TPB52_2 128 + +#define sph_u32 uint32_t +#define sph_s32 int32_t +typedef uint32_t u32; +typedef int32_t s32; + +#define C32 SPH_C32 +#define T32 SPH_T32 +#define ROL32 ROTL32 +#define XCAT(x, y) XCAT_(x, y) +#define XCAT_(x, y) x ## y + +/* + * The powers of 41 modulo 257. We use exponents from 0 to 255, inclusive. 
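+ *
+ * For reference, the table can be regenerated host-side with a loop of the
+ * form below (illustrative sketch only, using a hypothetical alpha[] array):
+ *
+ *   uint32_t v = 1;
+ *   for (int i = 0; i < 256; i++) { alpha[i] = v; v = (v * 41) % 257; }
+ *
+ * i.e. alpha_tab[i] == 41^i mod 257; the FFT butterflies below multiply by
+ * these factors before reducing with REDS2.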
+ */ +__constant__ static const s32 alpha_tab[] = { + 1, 41, 139, 45, 46, 87, 226, 14, 60, 147, 116, 130, 190, 80, 196, 69, + 2, 82, 21, 90, 92, 174, 195, 28, 120, 37, 232, 3, 123, 160, 135, 138, + 4, 164, 42, 180, 184, 91, 133, 56, 240, 74, 207, 6, 246, 63, 13, 19, + 8, 71, 84, 103, 111, 182, 9, 112, 223, 148, 157, 12, 235, 126, 26, 38, + 16, 142, 168, 206, 222, 107, 18, 224, 189, 39, 57, 24, 213, 252, 52, 76, + 32, 27, 79, 155, 187, 214, 36, 191, 121, 78, 114, 48, 169, 247, 104, 152, + 64, 54, 158, 53, 117, 171, 72, 125, 242, 156, 228, 96, 81, 237, 208, 47, + 128, 108, 59, 106, 234, 85, 144, 250, 227, 55, 199, 192, 162, 217, 159, 94, + 256, 216, 118, 212, 211, 170, 31, 243, 197, 110, 141, 127, 67, 177, 61, 188, + 255, 175, 236, 167, 165, 83, 62, 229, 137, 220, 25, 254, 134, 97, 122, 119, + 253, 93, 215, 77, 73, 166, 124, 201, 17, 183, 50, 251, 11, 194, 244, 238, + 249, 186, 173, 154, 146, 75, 248, 145, 34, 109, 100, 245, 22, 131, 231, 219, + 241, 115, 89, 51, 35, 150, 239, 33, 68, 218, 200, 233, 44, 5, 205, 181, + 225, 230, 178, 102, 70, 43, 221, 66, 136, 179, 143, 209, 88, 10, 153, 105, + 193, 203, 99, 204, 140, 86, 185, 132, 15, 101, 29, 161, 176, 20, 49, 210, + 129, 149, 198, 151, 23, 172, 113, 7, 30, 202, 58, 65, 95, 40, 98, 163 +}; + +/* + * Ranges: + * REDS1: from -32768..98302 to -383..383 + * REDS2: from -2^31..2^31-1 to -32768..98302 + */ +#define REDS1(x) (((x) & 0x00FF) - ((x) >> 8)) +#define REDS2(x) (((x) & 0xFFFF) + ((x) >> 16)) + +/* + * If, upon entry, the values of q[] are all in the -N..N range (where + * N >= 98302) then the new values of q[] are in the -2N..2N range. + * + * Since alpha_tab[v] <= 256, maximum allowed range is for N = 8388608. + */ +#define FFT_LOOP_16_8(rb) do { \ + s32 m = q[(rb)]; \ + s32 n = q[(rb) + 16]; \ + q[(rb)] = m + n; \ + q[(rb) + 16] = m - n; \ + s32 t; \ + m = q[(rb) + 0 + 1]; \ + n = q[(rb) + 0 + 1 + 16]; \ + t = REDS2(n * alpha_tab[0 + 1 * 8]); \ + q[(rb) + 0 + 1] = m + t; \ + q[(rb) + 0 + 1 + 16] = m - t; \ + m = q[(rb) + 0 + 2]; \ + n = q[(rb) + 0 + 2 + 16]; \ + t = REDS2(n * alpha_tab[0 + 2 * 8]); \ + q[(rb) + 0 + 2] = m + t; \ + q[(rb) + 0 + 2 + 16] = m - t; \ + m = q[(rb) + 0 + 3]; \ + n = q[(rb) + 0 + 3 + 16]; \ + t = REDS2(n * alpha_tab[0 + 3 * 8]); \ + q[(rb) + 0 + 3] = m + t; \ + q[(rb) + 0 + 3 + 16] = m - t; \ + \ + m = q[(rb) + 4 + 0]; \ + n = q[(rb) + 4 + 0 + 16]; \ + t = REDS2(n * alpha_tab[32 + 0 * 8]); \ + q[(rb) + 4 + 0] = m + t; \ + q[(rb) + 4 + 0 + 16] = m - t; \ + m = q[(rb) + 4 + 1]; \ + n = q[(rb) + 4 + 1 + 16]; \ + t = REDS2(n * alpha_tab[32 + 1 * 8]); \ + q[(rb) + 4 + 1] = m + t; \ + q[(rb) + 4 + 1 + 16] = m - t; \ + m = q[(rb) + 4 + 2]; \ + n = q[(rb) + 4 + 2 + 16]; \ + t = REDS2(n * alpha_tab[32 + 2 * 8]); \ + q[(rb) + 4 + 2] = m + t; \ + q[(rb) + 4 + 2 + 16] = m - t; \ + m = q[(rb) + 4 + 3]; \ + n = q[(rb) + 4 + 3 + 16]; \ + t = REDS2(n * alpha_tab[32 + 3 * 8]); \ + q[(rb) + 4 + 3] = m + t; \ + q[(rb) + 4 + 3 + 16] = m - t; \ + \ + m = q[(rb) + 8 + 0]; \ + n = q[(rb) + 8 + 0 + 16]; \ + t = REDS2(n * alpha_tab[64 + 0 * 8]); \ + q[(rb) + 8 + 0] = m + t; \ + q[(rb) + 8 + 0 + 16] = m - t; \ + m = q[(rb) + 8 + 1]; \ + n = q[(rb) + 8 + 1 + 16]; \ + t = REDS2(n * alpha_tab[64 + 1 * 8]); \ + q[(rb) + 8 + 1] = m + t; \ + q[(rb) + 8 + 1 + 16] = m - t; \ + m = q[(rb) + 8 + 2]; \ + n = q[(rb) + 8 + 2 + 16]; \ + t = REDS2(n * alpha_tab[64 + 2 * 8]); \ + q[(rb) + 8 + 2] = m + t; \ + q[(rb) + 8 + 2 + 16] = m - t; \ + m = q[(rb) + 8 + 3]; \ + n = q[(rb) + 8 + 3 + 16]; \ + t = REDS2(n * alpha_tab[64 + 3 * 8]); \ + q[(rb) + 8 + 
3] = m + t; \ + q[(rb) + 8 + 3 + 16] = m - t; \ + \ + m = q[(rb) + 12 + 0]; \ + n = q[(rb) + 12 + 0 + 16]; \ + t = REDS2(n * alpha_tab[96 + 0 * 8]); \ + q[(rb) + 12 + 0] = m + t; \ + q[(rb) + 12 + 0 + 16] = m - t; \ + m = q[(rb) + 12 + 1]; \ + n = q[(rb) + 12 + 1 + 16]; \ + t = REDS2(n * alpha_tab[96 + 1 * 8]); \ + q[(rb) + 12 + 1] = m + t; \ + q[(rb) + 12 + 1 + 16] = m - t; \ + m = q[(rb) + 12 + 2]; \ + n = q[(rb) + 12 + 2 + 16]; \ + t = REDS2(n * alpha_tab[96 + 2 * 8]); \ + q[(rb) + 12 + 2] = m + t; \ + q[(rb) + 12 + 2 + 16] = m - t; \ + m = q[(rb) + 12 + 3]; \ + n = q[(rb) + 12 + 3 + 16]; \ + t = REDS2(n * alpha_tab[96 + 3 * 8]); \ + q[(rb) + 12 + 3] = m + t; \ + q[(rb) + 12 + 3 + 16] = m - t; \ + } while (0) + +#define FFT_LOOP_32_4(rb) do { \ + s32 m = q[(rb)]; \ + s32 n = q[(rb) + 32]; \ + q[(rb)] = m + n; \ + q[(rb) + 32] = m - n; \ + s32 t; \ + m = q[(rb) + 0 + 1]; \ + n = q[(rb) + 0 + 1 + 32]; \ + t = REDS2(n * alpha_tab[0 + 1 * 4]); \ + q[(rb) + 0 + 1] = m + t; \ + q[(rb) + 0 + 1 + 32] = m - t; \ + m = q[(rb) + 0 + 2]; \ + n = q[(rb) + 0 + 2 + 32]; \ + t = REDS2(n * alpha_tab[0 + 2 * 4]); \ + q[(rb) + 0 + 2] = m + t; \ + q[(rb) + 0 + 2 + 32] = m - t; \ + m = q[(rb) + 0 + 3]; \ + n = q[(rb) + 0 + 3 + 32]; \ + t = REDS2(n * alpha_tab[0 + 3 * 4]); \ + q[(rb) + 0 + 3] = m + t; \ + q[(rb) + 0 + 3 + 32] = m - t; \ + \ + m = q[(rb) + 4 + 0]; \ + n = q[(rb) + 4 + 0 + 32]; \ + t = REDS2(n * alpha_tab[16 + 0 * 4]); \ + q[(rb) + 4 + 0] = m + t; \ + q[(rb) + 4 + 0 + 32] = m - t; \ + m = q[(rb) + 4 + 1]; \ + n = q[(rb) + 4 + 1 + 32]; \ + t = REDS2(n * alpha_tab[16 + 1 * 4]); \ + q[(rb) + 4 + 1] = m + t; \ + q[(rb) + 4 + 1 + 32] = m - t; \ + m = q[(rb) + 4 + 2]; \ + n = q[(rb) + 4 + 2 + 32]; \ + t = REDS2(n * alpha_tab[16 + 2 * 4]); \ + q[(rb) + 4 + 2] = m + t; \ + q[(rb) + 4 + 2 + 32] = m - t; \ + m = q[(rb) + 4 + 3]; \ + n = q[(rb) + 4 + 3 + 32]; \ + t = REDS2(n * alpha_tab[16 + 3 * 4]); \ + q[(rb) + 4 + 3] = m + t; \ + q[(rb) + 4 + 3 + 32] = m - t; \ + \ + m = q[(rb) + 8 + 0]; \ + n = q[(rb) + 8 + 0 + 32]; \ + t = REDS2(n * alpha_tab[32 + 0 * 4]); \ + q[(rb) + 8 + 0] = m + t; \ + q[(rb) + 8 + 0 + 32] = m - t; \ + m = q[(rb) + 8 + 1]; \ + n = q[(rb) + 8 + 1 + 32]; \ + t = REDS2(n * alpha_tab[32 + 1 * 4]); \ + q[(rb) + 8 + 1] = m + t; \ + q[(rb) + 8 + 1 + 32] = m - t; \ + m = q[(rb) + 8 + 2]; \ + n = q[(rb) + 8 + 2 + 32]; \ + t = REDS2(n * alpha_tab[32 + 2 * 4]); \ + q[(rb) + 8 + 2] = m + t; \ + q[(rb) + 8 + 2 + 32] = m - t; \ + m = q[(rb) + 8 + 3]; \ + n = q[(rb) + 8 + 3 + 32]; \ + t = REDS2(n * alpha_tab[32 + 3 * 4]); \ + q[(rb) + 8 + 3] = m + t; \ + q[(rb) + 8 + 3 + 32] = m - t; \ + \ + m = q[(rb) + 12 + 0]; \ + n = q[(rb) + 12 + 0 + 32]; \ + t = REDS2(n * alpha_tab[48 + 0 * 4]); \ + q[(rb) + 12 + 0] = m + t; \ + q[(rb) + 12 + 0 + 32] = m - t; \ + m = q[(rb) + 12 + 1]; \ + n = q[(rb) + 12 + 1 + 32]; \ + t = REDS2(n * alpha_tab[48 + 1 * 4]); \ + q[(rb) + 12 + 1] = m + t; \ + q[(rb) + 12 + 1 + 32] = m - t; \ + m = q[(rb) + 12 + 2]; \ + n = q[(rb) + 12 + 2 + 32]; \ + t = REDS2(n * alpha_tab[48 + 2 * 4]); \ + q[(rb) + 12 + 2] = m + t; \ + q[(rb) + 12 + 2 + 32] = m - t; \ + m = q[(rb) + 12 + 3]; \ + n = q[(rb) + 12 + 3 + 32]; \ + t = REDS2(n * alpha_tab[48 + 3 * 4]); \ + q[(rb) + 12 + 3] = m + t; \ + q[(rb) + 12 + 3 + 32] = m - t; \ + \ + m = q[(rb) + 16 + 0]; \ + n = q[(rb) + 16 + 0 + 32]; \ + t = REDS2(n * alpha_tab[64 + 0 * 4]); \ + q[(rb) + 16 + 0] = m + t; \ + q[(rb) + 16 + 0 + 32] = m - t; \ + m = q[(rb) + 16 + 1]; \ + n = q[(rb) + 16 + 1 + 32]; \ + t = REDS2(n * alpha_tab[64 + 1 * 4]); \ 
+ q[(rb) + 16 + 1] = m + t; \ + q[(rb) + 16 + 1 + 32] = m - t; \ + m = q[(rb) + 16 + 2]; \ + n = q[(rb) + 16 + 2 + 32]; \ + t = REDS2(n * alpha_tab[64 + 2 * 4]); \ + q[(rb) + 16 + 2] = m + t; \ + q[(rb) + 16 + 2 + 32] = m - t; \ + m = q[(rb) + 16 + 3]; \ + n = q[(rb) + 16 + 3 + 32]; \ + t = REDS2(n * alpha_tab[64 + 3 * 4]); \ + q[(rb) + 16 + 3] = m + t; \ + q[(rb) + 16 + 3 + 32] = m - t; \ + \ + m = q[(rb) + 20 + 0]; \ + n = q[(rb) + 20 + 0 + 32]; \ + t = REDS2(n * alpha_tab[80 + 0 * 4]); \ + q[(rb) + 20 + 0] = m + t; \ + q[(rb) + 20 + 0 + 32] = m - t; \ + m = q[(rb) + 20 + 1]; \ + n = q[(rb) + 20 + 1 + 32]; \ + t = REDS2(n * alpha_tab[80 + 1 * 4]); \ + q[(rb) + 20 + 1] = m + t; \ + q[(rb) + 20 + 1 + 32] = m - t; \ + m = q[(rb) + 20 + 2]; \ + n = q[(rb) + 20 + 2 + 32]; \ + t = REDS2(n * alpha_tab[80 + 2 * 4]); \ + q[(rb) + 20 + 2] = m + t; \ + q[(rb) + 20 + 2 + 32] = m - t; \ + m = q[(rb) + 20 + 3]; \ + n = q[(rb) + 20 + 3 + 32]; \ + t = REDS2(n * alpha_tab[80 + 3 * 4]); \ + q[(rb) + 20 + 3] = m + t; \ + q[(rb) + 20 + 3 + 32] = m - t; \ + \ + m = q[(rb) + 24 + 0]; \ + n = q[(rb) + 24 + 0 + 32]; \ + t = REDS2(n * alpha_tab[96 + 0 * 4]); \ + q[(rb) + 24 + 0] = m + t; \ + q[(rb) + 24 + 0 + 32] = m - t; \ + m = q[(rb) + 24 + 1]; \ + n = q[(rb) + 24 + 1 + 32]; \ + t = REDS2(n * alpha_tab[96 + 1 * 4]); \ + q[(rb) + 24 + 1] = m + t; \ + q[(rb) + 24 + 1 + 32] = m - t; \ + m = q[(rb) + 24 + 2]; \ + n = q[(rb) + 24 + 2 + 32]; \ + t = REDS2(n * alpha_tab[96 + 2 * 4]); \ + q[(rb) + 24 + 2] = m + t; \ + q[(rb) + 24 + 2 + 32] = m - t; \ + m = q[(rb) + 24 + 3]; \ + n = q[(rb) + 24 + 3 + 32]; \ + t = REDS2(n * alpha_tab[96 + 3 * 4]); \ + q[(rb) + 24 + 3] = m + t; \ + q[(rb) + 24 + 3 + 32] = m - t; \ + \ + m = q[(rb) + 28 + 0]; \ + n = q[(rb) + 28 + 0 + 32]; \ + t = REDS2(n * alpha_tab[112 + 0 * 4]); \ + q[(rb) + 28 + 0] = m + t; \ + q[(rb) + 28 + 0 + 32] = m - t; \ + m = q[(rb) + 28 + 1]; \ + n = q[(rb) + 28 + 1 + 32]; \ + t = REDS2(n * alpha_tab[112 + 1 * 4]); \ + q[(rb) + 28 + 1] = m + t; \ + q[(rb) + 28 + 1 + 32] = m - t; \ + m = q[(rb) + 28 + 2]; \ + n = q[(rb) + 28 + 2 + 32]; \ + t = REDS2(n * alpha_tab[112 + 2 * 4]); \ + q[(rb) + 28 + 2] = m + t; \ + q[(rb) + 28 + 2 + 32] = m - t; \ + m = q[(rb) + 28 + 3]; \ + n = q[(rb) + 28 + 3 + 32]; \ + t = REDS2(n * alpha_tab[112 + 3 * 4]); \ + q[(rb) + 28 + 3] = m + t; \ + q[(rb) + 28 + 3 + 32] = m - t; \ + } while (0) + +#define FFT_LOOP_64_2(rb) do { \ + s32 m = q[(rb)]; \ + s32 n = q[(rb) + 64]; \ + q[(rb)] = m + n; \ + q[(rb) + 64] = m - n; \ + s32 t; \ + m = q[(rb) + 0 + 1]; \ + n = q[(rb) + 0 + 1 + 64]; \ + t = REDS2(n * alpha_tab[0 + 1 * 2]); \ + q[(rb) + 0 + 1] = m + t; \ + q[(rb) + 0 + 1 + 64] = m - t; \ + m = q[(rb) + 0 + 2]; \ + n = q[(rb) + 0 + 2 + 64]; \ + t = REDS2(n * alpha_tab[0 + 2 * 2]); \ + q[(rb) + 0 + 2] = m + t; \ + q[(rb) + 0 + 2 + 64] = m - t; \ + m = q[(rb) + 0 + 3]; \ + n = q[(rb) + 0 + 3 + 64]; \ + t = REDS2(n * alpha_tab[0 + 3 * 2]); \ + q[(rb) + 0 + 3] = m + t; \ + q[(rb) + 0 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 4 + 0]; \ + n = q[(rb) + 4 + 0 + 64]; \ + t = REDS2(n * alpha_tab[8 + 0 * 2]); \ + q[(rb) + 4 + 0] = m + t; \ + q[(rb) + 4 + 0 + 64] = m - t; \ + m = q[(rb) + 4 + 1]; \ + n = q[(rb) + 4 + 1 + 64]; \ + t = REDS2(n * alpha_tab[8 + 1 * 2]); \ + q[(rb) + 4 + 1] = m + t; \ + q[(rb) + 4 + 1 + 64] = m - t; \ + m = q[(rb) + 4 + 2]; \ + n = q[(rb) + 4 + 2 + 64]; \ + t = REDS2(n * alpha_tab[8 + 2 * 2]); \ + q[(rb) + 4 + 2] = m + t; \ + q[(rb) + 4 + 2 + 64] = m - t; \ + m = q[(rb) + 4 + 3]; \ + n = q[(rb) + 4 + 3 + 64]; \ + t = 
REDS2(n * alpha_tab[8 + 3 * 2]); \ + q[(rb) + 4 + 3] = m + t; \ + q[(rb) + 4 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 8 + 0]; \ + n = q[(rb) + 8 + 0 + 64]; \ + t = REDS2(n * alpha_tab[16 + 0 * 2]); \ + q[(rb) + 8 + 0] = m + t; \ + q[(rb) + 8 + 0 + 64] = m - t; \ + m = q[(rb) + 8 + 1]; \ + n = q[(rb) + 8 + 1 + 64]; \ + t = REDS2(n * alpha_tab[16 + 1 * 2]); \ + q[(rb) + 8 + 1] = m + t; \ + q[(rb) + 8 + 1 + 64] = m - t; \ + m = q[(rb) + 8 + 2]; \ + n = q[(rb) + 8 + 2 + 64]; \ + t = REDS2(n * alpha_tab[16 + 2 * 2]); \ + q[(rb) + 8 + 2] = m + t; \ + q[(rb) + 8 + 2 + 64] = m - t; \ + m = q[(rb) + 8 + 3]; \ + n = q[(rb) + 8 + 3 + 64]; \ + t = REDS2(n * alpha_tab[16 + 3 * 2]); \ + q[(rb) + 8 + 3] = m + t; \ + q[(rb) + 8 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 12 + 0]; \ + n = q[(rb) + 12 + 0 + 64]; \ + t = REDS2(n * alpha_tab[24 + 0 * 2]); \ + q[(rb) + 12 + 0] = m + t; \ + q[(rb) + 12 + 0 + 64] = m - t; \ + m = q[(rb) + 12 + 1]; \ + n = q[(rb) + 12 + 1 + 64]; \ + t = REDS2(n * alpha_tab[24 + 1 * 2]); \ + q[(rb) + 12 + 1] = m + t; \ + q[(rb) + 12 + 1 + 64] = m - t; \ + m = q[(rb) + 12 + 2]; \ + n = q[(rb) + 12 + 2 + 64]; \ + t = REDS2(n * alpha_tab[24 + 2 * 2]); \ + q[(rb) + 12 + 2] = m + t; \ + q[(rb) + 12 + 2 + 64] = m - t; \ + m = q[(rb) + 12 + 3]; \ + n = q[(rb) + 12 + 3 + 64]; \ + t = REDS2(n * alpha_tab[24 + 3 * 2]); \ + q[(rb) + 12 + 3] = m + t; \ + q[(rb) + 12 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 16 + 0]; \ + n = q[(rb) + 16 + 0 + 64]; \ + t = REDS2(n * alpha_tab[32 + 0 * 2]); \ + q[(rb) + 16 + 0] = m + t; \ + q[(rb) + 16 + 0 + 64] = m - t; \ + m = q[(rb) + 16 + 1]; \ + n = q[(rb) + 16 + 1 + 64]; \ + t = REDS2(n * alpha_tab[32 + 1 * 2]); \ + q[(rb) + 16 + 1] = m + t; \ + q[(rb) + 16 + 1 + 64] = m - t; \ + m = q[(rb) + 16 + 2]; \ + n = q[(rb) + 16 + 2 + 64]; \ + t = REDS2(n * alpha_tab[32 + 2 * 2]); \ + q[(rb) + 16 + 2] = m + t; \ + q[(rb) + 16 + 2 + 64] = m - t; \ + m = q[(rb) + 16 + 3]; \ + n = q[(rb) + 16 + 3 + 64]; \ + t = REDS2(n * alpha_tab[32 + 3 * 2]); \ + q[(rb) + 16 + 3] = m + t; \ + q[(rb) + 16 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 20 + 0]; \ + n = q[(rb) + 20 + 0 + 64]; \ + t = REDS2(n * alpha_tab[40 + 0 * 2]); \ + q[(rb) + 20 + 0] = m + t; \ + q[(rb) + 20 + 0 + 64] = m - t; \ + m = q[(rb) + 20 + 1]; \ + n = q[(rb) + 20 + 1 + 64]; \ + t = REDS2(n * alpha_tab[40 + 1 * 2]); \ + q[(rb) + 20 + 1] = m + t; \ + q[(rb) + 20 + 1 + 64] = m - t; \ + m = q[(rb) + 20 + 2]; \ + n = q[(rb) + 20 + 2 + 64]; \ + t = REDS2(n * alpha_tab[40 + 2 * 2]); \ + q[(rb) + 20 + 2] = m + t; \ + q[(rb) + 20 + 2 + 64] = m - t; \ + m = q[(rb) + 20 + 3]; \ + n = q[(rb) + 20 + 3 + 64]; \ + t = REDS2(n * alpha_tab[40 + 3 * 2]); \ + q[(rb) + 20 + 3] = m + t; \ + q[(rb) + 20 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 24 + 0]; \ + n = q[(rb) + 24 + 0 + 64]; \ + t = REDS2(n * alpha_tab[48 + 0 * 2]); \ + q[(rb) + 24 + 0] = m + t; \ + q[(rb) + 24 + 0 + 64] = m - t; \ + m = q[(rb) + 24 + 1]; \ + n = q[(rb) + 24 + 1 + 64]; \ + t = REDS2(n * alpha_tab[48 + 1 * 2]); \ + q[(rb) + 24 + 1] = m + t; \ + q[(rb) + 24 + 1 + 64] = m - t; \ + m = q[(rb) + 24 + 2]; \ + n = q[(rb) + 24 + 2 + 64]; \ + t = REDS2(n * alpha_tab[48 + 2 * 2]); \ + q[(rb) + 24 + 2] = m + t; \ + q[(rb) + 24 + 2 + 64] = m - t; \ + m = q[(rb) + 24 + 3]; \ + n = q[(rb) + 24 + 3 + 64]; \ + t = REDS2(n * alpha_tab[48 + 3 * 2]); \ + q[(rb) + 24 + 3] = m + t; \ + q[(rb) + 24 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 28 + 0]; \ + n = q[(rb) + 28 + 0 + 64]; \ + t = REDS2(n * alpha_tab[56 + 0 * 2]); \ + q[(rb) + 28 + 0] = m + t; \ + q[(rb) + 28 + 0 + 64] = m - t; \ + m 
= q[(rb) + 28 + 1]; \ + n = q[(rb) + 28 + 1 + 64]; \ + t = REDS2(n * alpha_tab[56 + 1 * 2]); \ + q[(rb) + 28 + 1] = m + t; \ + q[(rb) + 28 + 1 + 64] = m - t; \ + m = q[(rb) + 28 + 2]; \ + n = q[(rb) + 28 + 2 + 64]; \ + t = REDS2(n * alpha_tab[56 + 2 * 2]); \ + q[(rb) + 28 + 2] = m + t; \ + q[(rb) + 28 + 2 + 64] = m - t; \ + m = q[(rb) + 28 + 3]; \ + n = q[(rb) + 28 + 3 + 64]; \ + t = REDS2(n * alpha_tab[56 + 3 * 2]); \ + q[(rb) + 28 + 3] = m + t; \ + q[(rb) + 28 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 32 + 0]; \ + n = q[(rb) + 32 + 0 + 64]; \ + t = REDS2(n * alpha_tab[64 + 0 * 2]); \ + q[(rb) + 32 + 0] = m + t; \ + q[(rb) + 32 + 0 + 64] = m - t; \ + m = q[(rb) + 32 + 1]; \ + n = q[(rb) + 32 + 1 + 64]; \ + t = REDS2(n * alpha_tab[64 + 1 * 2]); \ + q[(rb) + 32 + 1] = m + t; \ + q[(rb) + 32 + 1 + 64] = m - t; \ + m = q[(rb) + 32 + 2]; \ + n = q[(rb) + 32 + 2 + 64]; \ + t = REDS2(n * alpha_tab[64 + 2 * 2]); \ + q[(rb) + 32 + 2] = m + t; \ + q[(rb) + 32 + 2 + 64] = m - t; \ + m = q[(rb) + 32 + 3]; \ + n = q[(rb) + 32 + 3 + 64]; \ + t = REDS2(n * alpha_tab[64 + 3 * 2]); \ + q[(rb) + 32 + 3] = m + t; \ + q[(rb) + 32 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 36 + 0]; \ + n = q[(rb) + 36 + 0 + 64]; \ + t = REDS2(n * alpha_tab[72 + 0 * 2]); \ + q[(rb) + 36 + 0] = m + t; \ + q[(rb) + 36 + 0 + 64] = m - t; \ + m = q[(rb) + 36 + 1]; \ + n = q[(rb) + 36 + 1 + 64]; \ + t = REDS2(n * alpha_tab[72 + 1 * 2]); \ + q[(rb) + 36 + 1] = m + t; \ + q[(rb) + 36 + 1 + 64] = m - t; \ + m = q[(rb) + 36 + 2]; \ + n = q[(rb) + 36 + 2 + 64]; \ + t = REDS2(n * alpha_tab[72 + 2 * 2]); \ + q[(rb) + 36 + 2] = m + t; \ + q[(rb) + 36 + 2 + 64] = m - t; \ + m = q[(rb) + 36 + 3]; \ + n = q[(rb) + 36 + 3 + 64]; \ + t = REDS2(n * alpha_tab[72 + 3 * 2]); \ + q[(rb) + 36 + 3] = m + t; \ + q[(rb) + 36 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 40 + 0]; \ + n = q[(rb) + 40 + 0 + 64]; \ + t = REDS2(n * alpha_tab[80 + 0 * 2]); \ + q[(rb) + 40 + 0] = m + t; \ + q[(rb) + 40 + 0 + 64] = m - t; \ + m = q[(rb) + 40 + 1]; \ + n = q[(rb) + 40 + 1 + 64]; \ + t = REDS2(n * alpha_tab[80 + 1 * 2]); \ + q[(rb) + 40 + 1] = m + t; \ + q[(rb) + 40 + 1 + 64] = m - t; \ + m = q[(rb) + 40 + 2]; \ + n = q[(rb) + 40 + 2 + 64]; \ + t = REDS2(n * alpha_tab[80 + 2 * 2]); \ + q[(rb) + 40 + 2] = m + t; \ + q[(rb) + 40 + 2 + 64] = m - t; \ + m = q[(rb) + 40 + 3]; \ + n = q[(rb) + 40 + 3 + 64]; \ + t = REDS2(n * alpha_tab[80 + 3 * 2]); \ + q[(rb) + 40 + 3] = m + t; \ + q[(rb) + 40 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 44 + 0]; \ + n = q[(rb) + 44 + 0 + 64]; \ + t = REDS2(n * alpha_tab[88 + 0 * 2]); \ + q[(rb) + 44 + 0] = m + t; \ + q[(rb) + 44 + 0 + 64] = m - t; \ + m = q[(rb) + 44 + 1]; \ + n = q[(rb) + 44 + 1 + 64]; \ + t = REDS2(n * alpha_tab[88 + 1 * 2]); \ + q[(rb) + 44 + 1] = m + t; \ + q[(rb) + 44 + 1 + 64] = m - t; \ + m = q[(rb) + 44 + 2]; \ + n = q[(rb) + 44 + 2 + 64]; \ + t = REDS2(n * alpha_tab[88 + 2 * 2]); \ + q[(rb) + 44 + 2] = m + t; \ + q[(rb) + 44 + 2 + 64] = m - t; \ + m = q[(rb) + 44 + 3]; \ + n = q[(rb) + 44 + 3 + 64]; \ + t = REDS2(n * alpha_tab[88 + 3 * 2]); \ + q[(rb) + 44 + 3] = m + t; \ + q[(rb) + 44 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 48 + 0]; \ + n = q[(rb) + 48 + 0 + 64]; \ + t = REDS2(n * alpha_tab[96 + 0 * 2]); \ + q[(rb) + 48 + 0] = m + t; \ + q[(rb) + 48 + 0 + 64] = m - t; \ + m = q[(rb) + 48 + 1]; \ + n = q[(rb) + 48 + 1 + 64]; \ + t = REDS2(n * alpha_tab[96 + 1 * 2]); \ + q[(rb) + 48 + 1] = m + t; \ + q[(rb) + 48 + 1 + 64] = m - t; \ + m = q[(rb) + 48 + 2]; \ + n = q[(rb) + 48 + 2 + 64]; \ + t = REDS2(n * alpha_tab[96 + 2 * 
2]); \ + q[(rb) + 48 + 2] = m + t; \ + q[(rb) + 48 + 2 + 64] = m - t; \ + m = q[(rb) + 48 + 3]; \ + n = q[(rb) + 48 + 3 + 64]; \ + t = REDS2(n * alpha_tab[96 + 3 * 2]); \ + q[(rb) + 48 + 3] = m + t; \ + q[(rb) + 48 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 52 + 0]; \ + n = q[(rb) + 52 + 0 + 64]; \ + t = REDS2(n * alpha_tab[104 + 0 * 2]); \ + q[(rb) + 52 + 0] = m + t; \ + q[(rb) + 52 + 0 + 64] = m - t; \ + m = q[(rb) + 52 + 1]; \ + n = q[(rb) + 52 + 1 + 64]; \ + t = REDS2(n * alpha_tab[104 + 1 * 2]); \ + q[(rb) + 52 + 1] = m + t; \ + q[(rb) + 52 + 1 + 64] = m - t; \ + m = q[(rb) + 52 + 2]; \ + n = q[(rb) + 52 + 2 + 64]; \ + t = REDS2(n * alpha_tab[104 + 2 * 2]); \ + q[(rb) + 52 + 2] = m + t; \ + q[(rb) + 52 + 2 + 64] = m - t; \ + m = q[(rb) + 52 + 3]; \ + n = q[(rb) + 52 + 3 + 64]; \ + t = REDS2(n * alpha_tab[104 + 3 * 2]); \ + q[(rb) + 52 + 3] = m + t; \ + q[(rb) + 52 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 56 + 0]; \ + n = q[(rb) + 56 + 0 + 64]; \ + t = REDS2(n * alpha_tab[112 + 0 * 2]); \ + q[(rb) + 56 + 0] = m + t; \ + q[(rb) + 56 + 0 + 64] = m - t; \ + m = q[(rb) + 56 + 1]; \ + n = q[(rb) + 56 + 1 + 64]; \ + t = REDS2(n * alpha_tab[112 + 1 * 2]); \ + q[(rb) + 56 + 1] = m + t; \ + q[(rb) + 56 + 1 + 64] = m - t; \ + m = q[(rb) + 56 + 2]; \ + n = q[(rb) + 56 + 2 + 64]; \ + t = REDS2(n * alpha_tab[112 + 2 * 2]); \ + q[(rb) + 56 + 2] = m + t; \ + q[(rb) + 56 + 2 + 64] = m - t; \ + m = q[(rb) + 56 + 3]; \ + n = q[(rb) + 56 + 3 + 64]; \ + t = REDS2(n * alpha_tab[112 + 3 * 2]); \ + q[(rb) + 56 + 3] = m + t; \ + q[(rb) + 56 + 3 + 64] = m - t; \ + \ + m = q[(rb) + 60 + 0]; \ + n = q[(rb) + 60 + 0 + 64]; \ + t = REDS2(n * alpha_tab[120 + 0 * 2]); \ + q[(rb) + 60 + 0] = m + t; \ + q[(rb) + 60 + 0 + 64] = m - t; \ + m = q[(rb) + 60 + 1]; \ + n = q[(rb) + 60 + 1 + 64]; \ + t = REDS2(n * alpha_tab[120 + 1 * 2]); \ + q[(rb) + 60 + 1] = m + t; \ + q[(rb) + 60 + 1 + 64] = m - t; \ + m = q[(rb) + 60 + 2]; \ + n = q[(rb) + 60 + 2 + 64]; \ + t = REDS2(n * alpha_tab[120 + 2 * 2]); \ + q[(rb) + 60 + 2] = m + t; \ + q[(rb) + 60 + 2 + 64] = m - t; \ + m = q[(rb) + 60 + 3]; \ + n = q[(rb) + 60 + 3 + 64]; \ + t = REDS2(n * alpha_tab[120 + 3 * 2]); \ + q[(rb) + 60 + 3] = m + t; \ + q[(rb) + 60 + 3 + 64] = m - t; \ + } while (0) + +#define FFT_LOOP_128_1(rb) do { \ + s32 m = q[(rb)]; \ + s32 n = q[(rb) + 128]; \ + q[(rb)] = m + n; \ + q[(rb) + 128] = m - n; \ + s32 t; \ + m = q[(rb) + 0 + 1]; \ + n = q[(rb) + 0 + 1 + 128]; \ + t = REDS2(n * alpha_tab[0 + 1 * 1]); \ + q[(rb) + 0 + 1] = m + t; \ + q[(rb) + 0 + 1 + 128] = m - t; \ + m = q[(rb) + 0 + 2]; \ + n = q[(rb) + 0 + 2 + 128]; \ + t = REDS2(n * alpha_tab[0 + 2 * 1]); \ + q[(rb) + 0 + 2] = m + t; \ + q[(rb) + 0 + 2 + 128] = m - t; \ + m = q[(rb) + 0 + 3]; \ + n = q[(rb) + 0 + 3 + 128]; \ + t = REDS2(n * alpha_tab[0 + 3 * 1]); \ + q[(rb) + 0 + 3] = m + t; \ + q[(rb) + 0 + 3 + 128] = m - t; \ + m = q[(rb) + 4 + 0]; \ + n = q[(rb) + 4 + 0 + 128]; \ + t = REDS2(n * alpha_tab[4 + 0 * 1]); \ + q[(rb) + 4 + 0] = m + t; \ + q[(rb) + 4 + 0 + 128] = m - t; \ + m = q[(rb) + 4 + 1]; \ + n = q[(rb) + 4 + 1 + 128]; \ + t = REDS2(n * alpha_tab[4 + 1 * 1]); \ + q[(rb) + 4 + 1] = m + t; \ + q[(rb) + 4 + 1 + 128] = m - t; \ + m = q[(rb) + 4 + 2]; \ + n = q[(rb) + 4 + 2 + 128]; \ + t = REDS2(n * alpha_tab[4 + 2 * 1]); \ + q[(rb) + 4 + 2] = m + t; \ + q[(rb) + 4 + 2 + 128] = m - t; \ + m = q[(rb) + 4 + 3]; \ + n = q[(rb) + 4 + 3 + 128]; \ + t = REDS2(n * alpha_tab[4 + 3 * 1]); \ + q[(rb) + 4 + 3] = m + t; \ + q[(rb) + 4 + 3 + 128] = m - t; \ + m = q[(rb) + 8 + 0]; \ + n = q[(rb) + 8 
+ 0 + 128]; \ + t = REDS2(n * alpha_tab[8 + 0 * 1]); \ + q[(rb) + 8 + 0] = m + t; \ + q[(rb) + 8 + 0 + 128] = m - t; \ + m = q[(rb) + 8 + 1]; \ + n = q[(rb) + 8 + 1 + 128]; \ + t = REDS2(n * alpha_tab[8 + 1 * 1]); \ + q[(rb) + 8 + 1] = m + t; \ + q[(rb) + 8 + 1 + 128] = m - t; \ + m = q[(rb) + 8 + 2]; \ + n = q[(rb) + 8 + 2 + 128]; \ + t = REDS2(n * alpha_tab[8 + 2 * 1]); \ + q[(rb) + 8 + 2] = m + t; \ + q[(rb) + 8 + 2 + 128] = m - t; \ + m = q[(rb) + 8 + 3]; \ + n = q[(rb) + 8 + 3 + 128]; \ + t = REDS2(n * alpha_tab[8 + 3 * 1]); \ + q[(rb) + 8 + 3] = m + t; \ + q[(rb) + 8 + 3 + 128] = m - t; \ + m = q[(rb) + 12 + 0]; \ + n = q[(rb) + 12 + 0 + 128]; \ + t = REDS2(n * alpha_tab[12 + 0 * 1]); \ + q[(rb) + 12 + 0] = m + t; \ + q[(rb) + 12 + 0 + 128] = m - t; \ + m = q[(rb) + 12 + 1]; \ + n = q[(rb) + 12 + 1 + 128]; \ + t = REDS2(n * alpha_tab[12 + 1 * 1]); \ + q[(rb) + 12 + 1] = m + t; \ + q[(rb) + 12 + 1 + 128] = m - t; \ + m = q[(rb) + 12 + 2]; \ + n = q[(rb) + 12 + 2 + 128]; \ + t = REDS2(n * alpha_tab[12 + 2 * 1]); \ + q[(rb) + 12 + 2] = m + t; \ + q[(rb) + 12 + 2 + 128] = m - t; \ + m = q[(rb) + 12 + 3]; \ + n = q[(rb) + 12 + 3 + 128]; \ + t = REDS2(n * alpha_tab[12 + 3 * 1]); \ + q[(rb) + 12 + 3] = m + t; \ + q[(rb) + 12 + 3 + 128] = m - t; \ + m = q[(rb) + 16 + 0]; \ + n = q[(rb) + 16 + 0 + 128]; \ + t = REDS2(n * alpha_tab[16 + 0 * 1]); \ + q[(rb) + 16 + 0] = m + t; \ + q[(rb) + 16 + 0 + 128] = m - t; \ + m = q[(rb) + 16 + 1]; \ + n = q[(rb) + 16 + 1 + 128]; \ + t = REDS2(n * alpha_tab[16 + 1 * 1]); \ + q[(rb) + 16 + 1] = m + t; \ + q[(rb) + 16 + 1 + 128] = m - t; \ + m = q[(rb) + 16 + 2]; \ + n = q[(rb) + 16 + 2 + 128]; \ + t = REDS2(n * alpha_tab[16 + 2 * 1]); \ + q[(rb) + 16 + 2] = m + t; \ + q[(rb) + 16 + 2 + 128] = m - t; \ + m = q[(rb) + 16 + 3]; \ + n = q[(rb) + 16 + 3 + 128]; \ + t = REDS2(n * alpha_tab[16 + 3 * 1]); \ + q[(rb) + 16 + 3] = m + t; \ + q[(rb) + 16 + 3 + 128] = m - t; \ + m = q[(rb) + 20 + 0]; \ + n = q[(rb) + 20 + 0 + 128]; \ + t = REDS2(n * alpha_tab[20 + 0 * 1]); \ + q[(rb) + 20 + 0] = m + t; \ + q[(rb) + 20 + 0 + 128] = m - t; \ + m = q[(rb) + 20 + 1]; \ + n = q[(rb) + 20 + 1 + 128]; \ + t = REDS2(n * alpha_tab[20 + 1 * 1]); \ + q[(rb) + 20 + 1] = m + t; \ + q[(rb) + 20 + 1 + 128] = m - t; \ + m = q[(rb) + 20 + 2]; \ + n = q[(rb) + 20 + 2 + 128]; \ + t = REDS2(n * alpha_tab[20 + 2 * 1]); \ + q[(rb) + 20 + 2] = m + t; \ + q[(rb) + 20 + 2 + 128] = m - t; \ + m = q[(rb) + 20 + 3]; \ + n = q[(rb) + 20 + 3 + 128]; \ + t = REDS2(n * alpha_tab[20 + 3 * 1]); \ + q[(rb) + 20 + 3] = m + t; \ + q[(rb) + 20 + 3 + 128] = m - t; \ + m = q[(rb) + 24 + 0]; \ + n = q[(rb) + 24 + 0 + 128]; \ + t = REDS2(n * alpha_tab[24 + 0 * 1]); \ + q[(rb) + 24 + 0] = m + t; \ + q[(rb) + 24 + 0 + 128] = m - t; \ + m = q[(rb) + 24 + 1]; \ + n = q[(rb) + 24 + 1 + 128]; \ + t = REDS2(n * alpha_tab[24 + 1 * 1]); \ + q[(rb) + 24 + 1] = m + t; \ + q[(rb) + 24 + 1 + 128] = m - t; \ + m = q[(rb) + 24 + 2]; \ + n = q[(rb) + 24 + 2 + 128]; \ + t = REDS2(n * alpha_tab[24 + 2 * 1]); \ + q[(rb) + 24 + 2] = m + t; \ + q[(rb) + 24 + 2 + 128] = m - t; \ + m = q[(rb) + 24 + 3]; \ + n = q[(rb) + 24 + 3 + 128]; \ + t = REDS2(n * alpha_tab[24 + 3 * 1]); \ + q[(rb) + 24 + 3] = m + t; \ + q[(rb) + 24 + 3 + 128] = m - t; \ + m = q[(rb) + 28 + 0]; \ + n = q[(rb) + 28 + 0 + 128]; \ + t = REDS2(n * alpha_tab[28 + 0 * 1]); \ + q[(rb) + 28 + 0] = m + t; \ + q[(rb) + 28 + 0 + 128] = m - t; \ + m = q[(rb) + 28 + 1]; \ + n = q[(rb) + 28 + 1 + 128]; \ + t = REDS2(n * alpha_tab[28 + 1 * 1]); \ + q[(rb) + 28 + 1] = m + t; \ 
+ q[(rb) + 28 + 1 + 128] = m - t; \ + m = q[(rb) + 28 + 2]; \ + n = q[(rb) + 28 + 2 + 128]; \ + t = REDS2(n * alpha_tab[28 + 2 * 1]); \ + q[(rb) + 28 + 2] = m + t; \ + q[(rb) + 28 + 2 + 128] = m - t; \ + m = q[(rb) + 28 + 3]; \ + n = q[(rb) + 28 + 3 + 128]; \ + t = REDS2(n * alpha_tab[28 + 3 * 1]); \ + q[(rb) + 28 + 3] = m + t; \ + q[(rb) + 28 + 3 + 128] = m - t; \ + m = q[(rb) + 32 + 0]; \ + n = q[(rb) + 32 + 0 + 128]; \ + t = REDS2(n * alpha_tab[32 + 0 * 1]); \ + q[(rb) + 32 + 0] = m + t; \ + q[(rb) + 32 + 0 + 128] = m - t; \ + m = q[(rb) + 32 + 1]; \ + n = q[(rb) + 32 + 1 + 128]; \ + t = REDS2(n * alpha_tab[32 + 1 * 1]); \ + q[(rb) + 32 + 1] = m + t; \ + q[(rb) + 32 + 1 + 128] = m - t; \ + m = q[(rb) + 32 + 2]; \ + n = q[(rb) + 32 + 2 + 128]; \ + t = REDS2(n * alpha_tab[32 + 2 * 1]); \ + q[(rb) + 32 + 2] = m + t; \ + q[(rb) + 32 + 2 + 128] = m - t; \ + m = q[(rb) + 32 + 3]; \ + n = q[(rb) + 32 + 3 + 128]; \ + t = REDS2(n * alpha_tab[32 + 3 * 1]); \ + q[(rb) + 32 + 3] = m + t; \ + q[(rb) + 32 + 3 + 128] = m - t; \ + m = q[(rb) + 36 + 0]; \ + n = q[(rb) + 36 + 0 + 128]; \ + t = REDS2(n * alpha_tab[36 + 0 * 1]); \ + q[(rb) + 36 + 0] = m + t; \ + q[(rb) + 36 + 0 + 128] = m - t; \ + m = q[(rb) + 36 + 1]; \ + n = q[(rb) + 36 + 1 + 128]; \ + t = REDS2(n * alpha_tab[36 + 1 * 1]); \ + q[(rb) + 36 + 1] = m + t; \ + q[(rb) + 36 + 1 + 128] = m - t; \ + m = q[(rb) + 36 + 2]; \ + n = q[(rb) + 36 + 2 + 128]; \ + t = REDS2(n * alpha_tab[36 + 2 * 1]); \ + q[(rb) + 36 + 2] = m + t; \ + q[(rb) + 36 + 2 + 128] = m - t; \ + m = q[(rb) + 36 + 3]; \ + n = q[(rb) + 36 + 3 + 128]; \ + t = REDS2(n * alpha_tab[36 + 3 * 1]); \ + q[(rb) + 36 + 3] = m + t; \ + q[(rb) + 36 + 3 + 128] = m - t; \ + m = q[(rb) + 40 + 0]; \ + n = q[(rb) + 40 + 0 + 128]; \ + t = REDS2(n * alpha_tab[40 + 0 * 1]); \ + q[(rb) + 40 + 0] = m + t; \ + q[(rb) + 40 + 0 + 128] = m - t; \ + m = q[(rb) + 40 + 1]; \ + n = q[(rb) + 40 + 1 + 128]; \ + t = REDS2(n * alpha_tab[40 + 1 * 1]); \ + q[(rb) + 40 + 1] = m + t; \ + q[(rb) + 40 + 1 + 128] = m - t; \ + m = q[(rb) + 40 + 2]; \ + n = q[(rb) + 40 + 2 + 128]; \ + t = REDS2(n * alpha_tab[40 + 2 * 1]); \ + q[(rb) + 40 + 2] = m + t; \ + q[(rb) + 40 + 2 + 128] = m - t; \ + m = q[(rb) + 40 + 3]; \ + n = q[(rb) + 40 + 3 + 128]; \ + t = REDS2(n * alpha_tab[40 + 3 * 1]); \ + q[(rb) + 40 + 3] = m + t; \ + q[(rb) + 40 + 3 + 128] = m - t; \ + m = q[(rb) + 44 + 0]; \ + n = q[(rb) + 44 + 0 + 128]; \ + t = REDS2(n * alpha_tab[44 + 0 * 1]); \ + q[(rb) + 44 + 0] = m + t; \ + q[(rb) + 44 + 0 + 128] = m - t; \ + m = q[(rb) + 44 + 1]; \ + n = q[(rb) + 44 + 1 + 128]; \ + t = REDS2(n * alpha_tab[44 + 1 * 1]); \ + q[(rb) + 44 + 1] = m + t; \ + q[(rb) + 44 + 1 + 128] = m - t; \ + m = q[(rb) + 44 + 2]; \ + n = q[(rb) + 44 + 2 + 128]; \ + t = REDS2(n * alpha_tab[44 + 2 * 1]); \ + q[(rb) + 44 + 2] = m + t; \ + q[(rb) + 44 + 2 + 128] = m - t; \ + m = q[(rb) + 44 + 3]; \ + n = q[(rb) + 44 + 3 + 128]; \ + t = REDS2(n * alpha_tab[44 + 3 * 1]); \ + q[(rb) + 44 + 3] = m + t; \ + q[(rb) + 44 + 3 + 128] = m - t; \ + m = q[(rb) + 48 + 0]; \ + n = q[(rb) + 48 + 0 + 128]; \ + t = REDS2(n * alpha_tab[48 + 0 * 1]); \ + q[(rb) + 48 + 0] = m + t; \ + q[(rb) + 48 + 0 + 128] = m - t; \ + m = q[(rb) + 48 + 1]; \ + n = q[(rb) + 48 + 1 + 128]; \ + t = REDS2(n * alpha_tab[48 + 1 * 1]); \ + q[(rb) + 48 + 1] = m + t; \ + q[(rb) + 48 + 1 + 128] = m - t; \ + m = q[(rb) + 48 + 2]; \ + n = q[(rb) + 48 + 2 + 128]; \ + t = REDS2(n * alpha_tab[48 + 2 * 1]); \ + q[(rb) + 48 + 2] = m + t; \ + q[(rb) + 48 + 2 + 128] = m - t; \ + m = q[(rb) + 48 + 3]; \ + n = 
q[(rb) + 48 + 3 + 128]; \ + t = REDS2(n * alpha_tab[48 + 3 * 1]); \ + q[(rb) + 48 + 3] = m + t; \ + q[(rb) + 48 + 3 + 128] = m - t; \ + m = q[(rb) + 52 + 0]; \ + n = q[(rb) + 52 + 0 + 128]; \ + t = REDS2(n * alpha_tab[52 + 0 * 1]); \ + q[(rb) + 52 + 0] = m + t; \ + q[(rb) + 52 + 0 + 128] = m - t; \ + m = q[(rb) + 52 + 1]; \ + n = q[(rb) + 52 + 1 + 128]; \ + t = REDS2(n * alpha_tab[52 + 1 * 1]); \ + q[(rb) + 52 + 1] = m + t; \ + q[(rb) + 52 + 1 + 128] = m - t; \ + m = q[(rb) + 52 + 2]; \ + n = q[(rb) + 52 + 2 + 128]; \ + t = REDS2(n * alpha_tab[52 + 2 * 1]); \ + q[(rb) + 52 + 2] = m + t; \ + q[(rb) + 52 + 2 + 128] = m - t; \ + m = q[(rb) + 52 + 3]; \ + n = q[(rb) + 52 + 3 + 128]; \ + t = REDS2(n * alpha_tab[52 + 3 * 1]); \ + q[(rb) + 52 + 3] = m + t; \ + q[(rb) + 52 + 3 + 128] = m - t; \ + m = q[(rb) + 56 + 0]; \ + n = q[(rb) + 56 + 0 + 128]; \ + t = REDS2(n * alpha_tab[56 + 0 * 1]); \ + q[(rb) + 56 + 0] = m + t; \ + q[(rb) + 56 + 0 + 128] = m - t; \ + m = q[(rb) + 56 + 1]; \ + n = q[(rb) + 56 + 1 + 128]; \ + t = REDS2(n * alpha_tab[56 + 1 * 1]); \ + q[(rb) + 56 + 1] = m + t; \ + q[(rb) + 56 + 1 + 128] = m - t; \ + m = q[(rb) + 56 + 2]; \ + n = q[(rb) + 56 + 2 + 128]; \ + t = REDS2(n * alpha_tab[56 + 2 * 1]); \ + q[(rb) + 56 + 2] = m + t; \ + q[(rb) + 56 + 2 + 128] = m - t; \ + m = q[(rb) + 56 + 3]; \ + n = q[(rb) + 56 + 3 + 128]; \ + t = REDS2(n * alpha_tab[56 + 3 * 1]); \ + q[(rb) + 56 + 3] = m + t; \ + q[(rb) + 56 + 3 + 128] = m - t; \ + m = q[(rb) + 60 + 0]; \ + n = q[(rb) + 60 + 0 + 128]; \ + t = REDS2(n * alpha_tab[60 + 0 * 1]); \ + q[(rb) + 60 + 0] = m + t; \ + q[(rb) + 60 + 0 + 128] = m - t; \ + m = q[(rb) + 60 + 1]; \ + n = q[(rb) + 60 + 1 + 128]; \ + t = REDS2(n * alpha_tab[60 + 1 * 1]); \ + q[(rb) + 60 + 1] = m + t; \ + q[(rb) + 60 + 1 + 128] = m - t; \ + m = q[(rb) + 60 + 2]; \ + n = q[(rb) + 60 + 2 + 128]; \ + t = REDS2(n * alpha_tab[60 + 2 * 1]); \ + q[(rb) + 60 + 2] = m + t; \ + q[(rb) + 60 + 2 + 128] = m - t; \ + m = q[(rb) + 60 + 3]; \ + n = q[(rb) + 60 + 3 + 128]; \ + t = REDS2(n * alpha_tab[60 + 3 * 1]); \ + q[(rb) + 60 + 3] = m + t; \ + q[(rb) + 60 + 3 + 128] = m - t; \ + m = q[(rb) + 64 + 0]; \ + n = q[(rb) + 64 + 0 + 128]; \ + t = REDS2(n * alpha_tab[64 + 0 * 1]); \ + q[(rb) + 64 + 0] = m + t; \ + q[(rb) + 64 + 0 + 128] = m - t; \ + m = q[(rb) + 64 + 1]; \ + n = q[(rb) + 64 + 1 + 128]; \ + t = REDS2(n * alpha_tab[64 + 1 * 1]); \ + q[(rb) + 64 + 1] = m + t; \ + q[(rb) + 64 + 1 + 128] = m - t; \ + m = q[(rb) + 64 + 2]; \ + n = q[(rb) + 64 + 2 + 128]; \ + t = REDS2(n * alpha_tab[64 + 2 * 1]); \ + q[(rb) + 64 + 2] = m + t; \ + q[(rb) + 64 + 2 + 128] = m - t; \ + m = q[(rb) + 64 + 3]; \ + n = q[(rb) + 64 + 3 + 128]; \ + t = REDS2(n * alpha_tab[64 + 3 * 1]); \ + q[(rb) + 64 + 3] = m + t; \ + q[(rb) + 64 + 3 + 128] = m - t; \ + m = q[(rb) + 68 + 0]; \ + n = q[(rb) + 68 + 0 + 128]; \ + t = REDS2(n * alpha_tab[68 + 0 * 1]); \ + q[(rb) + 68 + 0] = m + t; \ + q[(rb) + 68 + 0 + 128] = m - t; \ + m = q[(rb) + 68 + 1]; \ + n = q[(rb) + 68 + 1 + 128]; \ + t = REDS2(n * alpha_tab[68 + 1 * 1]); \ + q[(rb) + 68 + 1] = m + t; \ + q[(rb) + 68 + 1 + 128] = m - t; \ + m = q[(rb) + 68 + 2]; \ + n = q[(rb) + 68 + 2 + 128]; \ + t = REDS2(n * alpha_tab[68 + 2 * 1]); \ + q[(rb) + 68 + 2] = m + t; \ + q[(rb) + 68 + 2 + 128] = m - t; \ + m = q[(rb) + 68 + 3]; \ + n = q[(rb) + 68 + 3 + 128]; \ + t = REDS2(n * alpha_tab[68 + 3 * 1]); \ + q[(rb) + 68 + 3] = m + t; \ + q[(rb) + 68 + 3 + 128] = m - t; \ + m = q[(rb) + 72 + 0]; \ + n = q[(rb) + 72 + 0 + 128]; \ + t = REDS2(n * alpha_tab[72 + 0 * 1]); \ 
+ q[(rb) + 72 + 0] = m + t; \ + q[(rb) + 72 + 0 + 128] = m - t; \ + m = q[(rb) + 72 + 1]; \ + n = q[(rb) + 72 + 1 + 128]; \ + t = REDS2(n * alpha_tab[72 + 1 * 1]); \ + q[(rb) + 72 + 1] = m + t; \ + q[(rb) + 72 + 1 + 128] = m - t; \ + m = q[(rb) + 72 + 2]; \ + n = q[(rb) + 72 + 2 + 128]; \ + t = REDS2(n * alpha_tab[72 + 2 * 1]); \ + q[(rb) + 72 + 2] = m + t; \ + q[(rb) + 72 + 2 + 128] = m - t; \ + m = q[(rb) + 72 + 3]; \ + n = q[(rb) + 72 + 3 + 128]; \ + t = REDS2(n * alpha_tab[72 + 3 * 1]); \ + q[(rb) + 72 + 3] = m + t; \ + q[(rb) + 72 + 3 + 128] = m - t; \ + m = q[(rb) + 76 + 0]; \ + n = q[(rb) + 76 + 0 + 128]; \ + t = REDS2(n * alpha_tab[76 + 0 * 1]); \ + q[(rb) + 76 + 0] = m + t; \ + q[(rb) + 76 + 0 + 128] = m - t; \ + m = q[(rb) + 76 + 1]; \ + n = q[(rb) + 76 + 1 + 128]; \ + t = REDS2(n * alpha_tab[76 + 1 * 1]); \ + q[(rb) + 76 + 1] = m + t; \ + q[(rb) + 76 + 1 + 128] = m - t; \ + m = q[(rb) + 76 + 2]; \ + n = q[(rb) + 76 + 2 + 128]; \ + t = REDS2(n * alpha_tab[76 + 2 * 1]); \ + q[(rb) + 76 + 2] = m + t; \ + q[(rb) + 76 + 2 + 128] = m - t; \ + m = q[(rb) + 76 + 3]; \ + n = q[(rb) + 76 + 3 + 128]; \ + t = REDS2(n * alpha_tab[76 + 3 * 1]); \ + q[(rb) + 76 + 3] = m + t; \ + q[(rb) + 76 + 3 + 128] = m - t; \ + m = q[(rb) + 80 + 0]; \ + n = q[(rb) + 80 + 0 + 128]; \ + t = REDS2(n * alpha_tab[80 + 0 * 1]); \ + q[(rb) + 80 + 0] = m + t; \ + q[(rb) + 80 + 0 + 128] = m - t; \ + m = q[(rb) + 80 + 1]; \ + n = q[(rb) + 80 + 1 + 128]; \ + t = REDS2(n * alpha_tab[80 + 1 * 1]); \ + q[(rb) + 80 + 1] = m + t; \ + q[(rb) + 80 + 1 + 128] = m - t; \ + m = q[(rb) + 80 + 2]; \ + n = q[(rb) + 80 + 2 + 128]; \ + t = REDS2(n * alpha_tab[80 + 2 * 1]); \ + q[(rb) + 80 + 2] = m + t; \ + q[(rb) + 80 + 2 + 128] = m - t; \ + m = q[(rb) + 80 + 3]; \ + n = q[(rb) + 80 + 3 + 128]; \ + t = REDS2(n * alpha_tab[80 + 3 * 1]); \ + q[(rb) + 80 + 3] = m + t; \ + q[(rb) + 80 + 3 + 128] = m - t; \ + m = q[(rb) + 84 + 0]; \ + n = q[(rb) + 84 + 0 + 128]; \ + t = REDS2(n * alpha_tab[84 + 0 * 1]); \ + q[(rb) + 84 + 0] = m + t; \ + q[(rb) + 84 + 0 + 128] = m - t; \ + m = q[(rb) + 84 + 1]; \ + n = q[(rb) + 84 + 1 + 128]; \ + t = REDS2(n * alpha_tab[84 + 1 * 1]); \ + q[(rb) + 84 + 1] = m + t; \ + q[(rb) + 84 + 1 + 128] = m - t; \ + m = q[(rb) + 84 + 2]; \ + n = q[(rb) + 84 + 2 + 128]; \ + t = REDS2(n * alpha_tab[84 + 2 * 1]); \ + q[(rb) + 84 + 2] = m + t; \ + q[(rb) + 84 + 2 + 128] = m - t; \ + m = q[(rb) + 84 + 3]; \ + n = q[(rb) + 84 + 3 + 128]; \ + t = REDS2(n * alpha_tab[84 + 3 * 1]); \ + q[(rb) + 84 + 3] = m + t; \ + q[(rb) + 84 + 3 + 128] = m - t; \ + m = q[(rb) + 88 + 0]; \ + n = q[(rb) + 88 + 0 + 128]; \ + t = REDS2(n * alpha_tab[88 + 0 * 1]); \ + q[(rb) + 88 + 0] = m + t; \ + q[(rb) + 88 + 0 + 128] = m - t; \ + m = q[(rb) + 88 + 1]; \ + n = q[(rb) + 88 + 1 + 128]; \ + t = REDS2(n * alpha_tab[88 + 1 * 1]); \ + q[(rb) + 88 + 1] = m + t; \ + q[(rb) + 88 + 1 + 128] = m - t; \ + m = q[(rb) + 88 + 2]; \ + n = q[(rb) + 88 + 2 + 128]; \ + t = REDS2(n * alpha_tab[88 + 2 * 1]); \ + q[(rb) + 88 + 2] = m + t; \ + q[(rb) + 88 + 2 + 128] = m - t; \ + m = q[(rb) + 88 + 3]; \ + n = q[(rb) + 88 + 3 + 128]; \ + t = REDS2(n * alpha_tab[88 + 3 * 1]); \ + q[(rb) + 88 + 3] = m + t; \ + q[(rb) + 88 + 3 + 128] = m - t; \ + m = q[(rb) + 92 + 0]; \ + n = q[(rb) + 92 + 0 + 128]; \ + t = REDS2(n * alpha_tab[92 + 0 * 1]); \ + q[(rb) + 92 + 0] = m + t; \ + q[(rb) + 92 + 0 + 128] = m - t; \ + m = q[(rb) + 92 + 1]; \ + n = q[(rb) + 92 + 1 + 128]; \ + t = REDS2(n * alpha_tab[92 + 1 * 1]); \ + q[(rb) + 92 + 1] = m + t; \ + q[(rb) + 92 + 1 + 128] = m - t; \ + 
m = q[(rb) + 92 + 2]; \ + n = q[(rb) + 92 + 2 + 128]; \ + t = REDS2(n * alpha_tab[92 + 2 * 1]); \ + q[(rb) + 92 + 2] = m + t; \ + q[(rb) + 92 + 2 + 128] = m - t; \ + m = q[(rb) + 92 + 3]; \ + n = q[(rb) + 92 + 3 + 128]; \ + t = REDS2(n * alpha_tab[92 + 3 * 1]); \ + q[(rb) + 92 + 3] = m + t; \ + q[(rb) + 92 + 3 + 128] = m - t; \ + m = q[(rb) + 96 + 0]; \ + n = q[(rb) + 96 + 0 + 128]; \ + t = REDS2(n * alpha_tab[96 + 0 * 1]); \ + q[(rb) + 96 + 0] = m + t; \ + q[(rb) + 96 + 0 + 128] = m - t; \ + m = q[(rb) + 96 + 1]; \ + n = q[(rb) + 96 + 1 + 128]; \ + t = REDS2(n * alpha_tab[96 + 1 * 1]); \ + q[(rb) + 96 + 1] = m + t; \ + q[(rb) + 96 + 1 + 128] = m - t; \ + m = q[(rb) + 96 + 2]; \ + n = q[(rb) + 96 + 2 + 128]; \ + t = REDS2(n * alpha_tab[96 + 2 * 1]); \ + q[(rb) + 96 + 2] = m + t; \ + q[(rb) + 96 + 2 + 128] = m - t; \ + m = q[(rb) + 96 + 3]; \ + n = q[(rb) + 96 + 3 + 128]; \ + t = REDS2(n * alpha_tab[96 + 3 * 1]); \ + q[(rb) + 96 + 3] = m + t; \ + q[(rb) + 96 + 3 + 128] = m - t; \ + m = q[(rb) + 100 + 0]; \ + n = q[(rb) + 100 + 0 + 128]; \ + t = REDS2(n * alpha_tab[100 + 0 * 1]); \ + q[(rb) + 100 + 0] = m + t; \ + q[(rb) + 100 + 0 + 128] = m - t; \ + m = q[(rb) + 100 + 1]; \ + n = q[(rb) + 100 + 1 + 128]; \ + t = REDS2(n * alpha_tab[100 + 1 * 1]); \ + q[(rb) + 100 + 1] = m + t; \ + q[(rb) + 100 + 1 + 128] = m - t; \ + m = q[(rb) + 100 + 2]; \ + n = q[(rb) + 100 + 2 + 128]; \ + t = REDS2(n * alpha_tab[100 + 2 * 1]); \ + q[(rb) + 100 + 2] = m + t; \ + q[(rb) + 100 + 2 + 128] = m - t; \ + m = q[(rb) + 100 + 3]; \ + n = q[(rb) + 100 + 3 + 128]; \ + t = REDS2(n * alpha_tab[100 + 3 * 1]); \ + q[(rb) + 100 + 3] = m + t; \ + q[(rb) + 100 + 3 + 128] = m - t; \ + m = q[(rb) + 104 + 0]; \ + n = q[(rb) + 104 + 0 + 128]; \ + t = REDS2(n * alpha_tab[104 + 0 * 1]); \ + q[(rb) + 104 + 0] = m + t; \ + q[(rb) + 104 + 0 + 128] = m - t; \ + m = q[(rb) + 104 + 1]; \ + n = q[(rb) + 104 + 1 + 128]; \ + t = REDS2(n * alpha_tab[104 + 1 * 1]); \ + q[(rb) + 104 + 1] = m + t; \ + q[(rb) + 104 + 1 + 128] = m - t; \ + m = q[(rb) + 104 + 2]; \ + n = q[(rb) + 104 + 2 + 128]; \ + t = REDS2(n * alpha_tab[104 + 2 * 1]); \ + q[(rb) + 104 + 2] = m + t; \ + q[(rb) + 104 + 2 + 128] = m - t; \ + m = q[(rb) + 104 + 3]; \ + n = q[(rb) + 104 + 3 + 128]; \ + t = REDS2(n * alpha_tab[104 + 3 * 1]); \ + q[(rb) + 104 + 3] = m + t; \ + q[(rb) + 104 + 3 + 128] = m - t; \ + m = q[(rb) + 108 + 0]; \ + n = q[(rb) + 108 + 0 + 128]; \ + t = REDS2(n * alpha_tab[108 + 0 * 1]); \ + q[(rb) + 108 + 0] = m + t; \ + q[(rb) + 108 + 0 + 128] = m - t; \ + m = q[(rb) + 108 + 1]; \ + n = q[(rb) + 108 + 1 + 128]; \ + t = REDS2(n * alpha_tab[108 + 1 * 1]); \ + q[(rb) + 108 + 1] = m + t; \ + q[(rb) + 108 + 1 + 128] = m - t; \ + m = q[(rb) + 108 + 2]; \ + n = q[(rb) + 108 + 2 + 128]; \ + t = REDS2(n * alpha_tab[108 + 2 * 1]); \ + q[(rb) + 108 + 2] = m + t; \ + q[(rb) + 108 + 2 + 128] = m - t; \ + m = q[(rb) + 108 + 3]; \ + n = q[(rb) + 108 + 3 + 128]; \ + t = REDS2(n * alpha_tab[108 + 3 * 1]); \ + q[(rb) + 108 + 3] = m + t; \ + q[(rb) + 108 + 3 + 128] = m - t; \ + m = q[(rb) + 112 + 0]; \ + n = q[(rb) + 112 + 0 + 128]; \ + t = REDS2(n * alpha_tab[112 + 0 * 1]); \ + q[(rb) + 112 + 0] = m + t; \ + q[(rb) + 112 + 0 + 128] = m - t; \ + m = q[(rb) + 112 + 1]; \ + n = q[(rb) + 112 + 1 + 128]; \ + t = REDS2(n * alpha_tab[112 + 1 * 1]); \ + q[(rb) + 112 + 1] = m + t; \ + q[(rb) + 112 + 1 + 128] = m - t; \ + m = q[(rb) + 112 + 2]; \ + n = q[(rb) + 112 + 2 + 128]; \ + t = REDS2(n * alpha_tab[112 + 2 * 1]); \ + q[(rb) + 112 + 2] = m + t; \ + q[(rb) + 112 + 2 + 128] = m - 
t; \ + m = q[(rb) + 112 + 3]; \ + n = q[(rb) + 112 + 3 + 128]; \ + t = REDS2(n * alpha_tab[112 + 3 * 1]); \ + q[(rb) + 112 + 3] = m + t; \ + q[(rb) + 112 + 3 + 128] = m - t; \ + m = q[(rb) + 116 + 0]; \ + n = q[(rb) + 116 + 0 + 128]; \ + t = REDS2(n * alpha_tab[116 + 0 * 1]); \ + q[(rb) + 116 + 0] = m + t; \ + q[(rb) + 116 + 0 + 128] = m - t; \ + m = q[(rb) + 116 + 1]; \ + n = q[(rb) + 116 + 1 + 128]; \ + t = REDS2(n * alpha_tab[116 + 1 * 1]); \ + q[(rb) + 116 + 1] = m + t; \ + q[(rb) + 116 + 1 + 128] = m - t; \ + m = q[(rb) + 116 + 2]; \ + n = q[(rb) + 116 + 2 + 128]; \ + t = REDS2(n * alpha_tab[116 + 2 * 1]); \ + q[(rb) + 116 + 2] = m + t; \ + q[(rb) + 116 + 2 + 128] = m - t; \ + m = q[(rb) + 116 + 3]; \ + n = q[(rb) + 116 + 3 + 128]; \ + t = REDS2(n * alpha_tab[116 + 3 * 1]); \ + q[(rb) + 116 + 3] = m + t; \ + q[(rb) + 116 + 3 + 128] = m - t; \ + m = q[(rb) + 120 + 0]; \ + n = q[(rb) + 120 + 0 + 128]; \ + t = REDS2(n * alpha_tab[120 + 0 * 1]); \ + q[(rb) + 120 + 0] = m + t; \ + q[(rb) + 120 + 0 + 128] = m - t; \ + m = q[(rb) + 120 + 1]; \ + n = q[(rb) + 120 + 1 + 128]; \ + t = REDS2(n * alpha_tab[120 + 1 * 1]); \ + q[(rb) + 120 + 1] = m + t; \ + q[(rb) + 120 + 1 + 128] = m - t; \ + m = q[(rb) + 120 + 2]; \ + n = q[(rb) + 120 + 2 + 128]; \ + t = REDS2(n * alpha_tab[120 + 2 * 1]); \ + q[(rb) + 120 + 2] = m + t; \ + q[(rb) + 120 + 2 + 128] = m - t; \ + m = q[(rb) + 120 + 3]; \ + n = q[(rb) + 120 + 3 + 128]; \ + t = REDS2(n * alpha_tab[120 + 3 * 1]); \ + q[(rb) + 120 + 3] = m + t; \ + q[(rb) + 120 + 3 + 128] = m - t; \ + m = q[(rb) + 124 + 0]; \ + n = q[(rb) + 124 + 0 + 128]; \ + t = REDS2(n * alpha_tab[124 + 0 * 1]); \ + q[(rb) + 124 + 0] = m + t; \ + q[(rb) + 124 + 0 + 128] = m - t; \ + m = q[(rb) + 124 + 1]; \ + n = q[(rb) + 124 + 1 + 128]; \ + t = REDS2(n * alpha_tab[124 + 1 * 1]); \ + q[(rb) + 124 + 1] = m + t; \ + q[(rb) + 124 + 1 + 128] = m - t; \ + m = q[(rb) + 124 + 2]; \ + n = q[(rb) + 124 + 2 + 128]; \ + t = REDS2(n * alpha_tab[124 + 2 * 1]); \ + q[(rb) + 124 + 2] = m + t; \ + q[(rb) + 124 + 2 + 128] = m - t; \ + m = q[(rb) + 124 + 3]; \ + n = q[(rb) + 124 + 3 + 128]; \ + t = REDS2(n * alpha_tab[124 + 3 * 1]); \ + q[(rb) + 124 + 3] = m + t; \ + q[(rb) + 124 + 3 + 128] = m - t; \ + } while (0) + +/* + * Output ranges: + * d0: min= 0 max= 1020 + * d1: min= -67 max= 4587 + * d2: min=-4335 max= 4335 + * d3: min=-4147 max= 507 + * d4: min= -510 max= 510 + * d5: min= -252 max= 4402 + * d6: min=-4335 max= 4335 + * d7: min=-4332 max= 322 + */ +#define FFT8(xb, xs, d) do { \ + s32 x0 = x[(xb)]; \ + s32 x1 = x[(xb) + (xs)]; \ + s32 x2 = x[(xb) + 2 * (xs)]; \ + s32 x3 = x[(xb) + 3 * (xs)]; \ + s32 a0 = x0 + x2; \ + s32 a1 = x0 + (x2 << 4); \ + s32 a2 = x0 - x2; \ + s32 a3 = x0 - (x2 << 4); \ + s32 b0 = x1 + x3; \ + s32 b1 = REDS1((x1 << 2) + (x3 << 6)); \ + s32 b2 = (x1 << 4) - (x3 << 4); \ + s32 b3 = REDS1((x1 << 6) + (x3 << 2)); \ + d ## 0 = a0 + b0; \ + d ## 1 = a1 + b1; \ + d ## 2 = a2 + b2; \ + d ## 3 = a3 + b3; \ + d ## 4 = a0 - b0; \ + d ## 5 = a1 - b1; \ + d ## 6 = a2 - b2; \ + d ## 7 = a3 - b3; \ + } while (0) + +/* + * When k=16, we have alpha=2. Multiplication by alpha^i is then reduced + * to some shifting. 
+ * + * Output: within -591471..591723 + */ +#define FFT16(xb, xs, rb) do { \ + s32 d1_0, d1_1, d1_2, d1_3, d1_4, d1_5, d1_6, d1_7; \ + s32 d2_0, d2_1, d2_2, d2_3, d2_4, d2_5, d2_6, d2_7; \ + FFT8(xb, (xs) << 1, d1_); \ + FFT8((xb) + (xs), (xs) << 1, d2_); \ + q[(rb) + 0] = d1_0 + d2_0; \ + q[(rb) + 1] = d1_1 + (d2_1 << 1); \ + q[(rb) + 2] = d1_2 + (d2_2 << 2); \ + q[(rb) + 3] = d1_3 + (d2_3 << 3); \ + q[(rb) + 4] = d1_4 + (d2_4 << 4); \ + q[(rb) + 5] = d1_5 + (d2_5 << 5); \ + q[(rb) + 6] = d1_6 + (d2_6 << 6); \ + q[(rb) + 7] = d1_7 + (d2_7 << 7); \ + q[(rb) + 8] = d1_0 - d2_0; \ + q[(rb) + 9] = d1_1 - (d2_1 << 1); \ + q[(rb) + 10] = d1_2 - (d2_2 << 2); \ + q[(rb) + 11] = d1_3 - (d2_3 << 3); \ + q[(rb) + 12] = d1_4 - (d2_4 << 4); \ + q[(rb) + 13] = d1_5 - (d2_5 << 5); \ + q[(rb) + 14] = d1_6 - (d2_6 << 6); \ + q[(rb) + 15] = d1_7 - (d2_7 << 7); \ + } while (0) + +/* + * Output range: |q| <= 1183446 + */ +#define FFT32(xb, xs, rb, id) do { \ + FFT16(xb, (xs) << 1, rb); \ + FFT16((xb) + (xs), (xs) << 1, (rb) + 16); \ + FFT_LOOP_16_8(rb); \ + } while (0) + +/* + * Output range: |q| <= 2366892 + */ +#define FFT64(xb, xs, rb) do { \ + FFT32(xb, (xs) << 1, (rb), label_a); \ + FFT32((xb) + (xs), (xs) << 1, (rb) + 32, label_b); \ + FFT_LOOP_32_4(rb); \ + } while (0) + +/* + * Output range: |q| <= 9467568 + */ +#define FFT256(xb, xs, rb, id) do { \ + FFT64((xb) + ((xs) * 0), (xs) << 2, (rb + 0)); \ + FFT64((xb) + ((xs) * 2), (xs) << 2, (rb + 64)); \ + FFT_LOOP_64_2(rb); \ + FFT64((xb) + ((xs) * 1), (xs) << 2, (rb + 128)); \ + FFT64((xb) + ((xs) * 3), (xs) << 2, (rb + 192)); \ + FFT_LOOP_64_2((rb) + 128); \ + FFT_LOOP_128_1(rb); \ + } while (0) + +/* + * beta^(255*i) mod 257 + */ +__constant__ static const unsigned short yoff_b_n[] = { + 1, 163, 98, 40, 95, 65, 58, 202, 30, 7, 113, 172, + 23, 151, 198, 149, 129, 210, 49, 20, 176, 161, 29, 101, + 15, 132, 185, 86, 140, 204, 99, 203, 193, 105, 153, 10, + 88, 209, 143, 179, 136, 66, 221, 43, 70, 102, 178, 230, + 225, 181, 205, 5, 44, 233, 200, 218, 68, 33, 239, 150, + 35, 51, 89, 115, 241, 219, 231, 131, 22, 245, 100, 109, + 34, 145, 248, 75, 146, 154, 173, 186, 249, 238, 244, 194, + 11, 251, 50, 183, 17, 201, 124, 166, 73, 77, 215, 93, + 253, 119, 122, 97, 134, 254, 25, 220, 137, 229, 62, 83, + 165, 167, 236, 175, 255, 188, 61, 177, 67, 127, 141, 110, + 197, 243, 31, 170, 211, 212, 118, 216, 256, 94, 159, 217, + 162, 192, 199, 55, 227, 250, 144, 85, 234, 106, 59, 108, + 128, 47, 208, 237, 81, 96, 228, 156, 242, 125, 72, 171, + 117, 53, 158, 54, 64, 152, 104, 247, 169, 48, 114, 78, + 121, 191, 36, 214, 187, 155, 79, 27, 32, 76, 52, 252, + 213, 24, 57, 39, 189, 224, 18, 107, 222, 206, 168, 142, + 16, 38, 26, 126, 235, 12, 157, 148, 223, 112, 9, 182, + 111, 103, 84, 71, 8, 19, 13, 63, 246, 6, 207, 74, + 240, 56, 133, 91, 184, 180, 42, 164, 4, 138, 135, 160, + 123, 3, 232, 37, 120, 28, 195, 174, 92, 90, 21, 82, + 2, 69, 196, 80, 190, 130, 116, 147, 60, 14, 226, 87, + 46, 45, 139, 41 +}; + +#define INNER(l, h, mm) (((u32)((l) * (mm)) & 0xFFFFU) \ + + ((u32)((h) * (mm)) << 16)) + +#define W_BIG(sb, o1, o2, mm) \ + (INNER(q[16 * (sb) + 2 * 0 + o1], q[16 * (sb) + 2 * 0 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 1 + o1], q[16 * (sb) + 2 * 1 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 2 + o1], q[16 * (sb) + 2 * 2 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 3 + o1], q[16 * (sb) + 2 * 3 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 4 + o1], q[16 * (sb) + 2 * 4 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 5 + o1], q[16 * (sb) + 2 * 5 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 6 + o1], 
q[16 * (sb) + 2 * 6 + o2], mm), \ + INNER(q[16 * (sb) + 2 * 7 + o1], q[16 * (sb) + 2 * 7 + o2], mm) + +#define WB_0_0 W_BIG( 4, 0, 1, 185) +#define WB_0_1 W_BIG( 6, 0, 1, 185) +#define WB_0_2 W_BIG( 0, 0, 1, 185) +#define WB_0_3 W_BIG( 2, 0, 1, 185) +#define WB_0_4 W_BIG( 7, 0, 1, 185) +#define WB_0_5 W_BIG( 5, 0, 1, 185) +#define WB_0_6 W_BIG( 3, 0, 1, 185) +#define WB_0_7 W_BIG( 1, 0, 1, 185) +#define WB_1_0 W_BIG(15, 0, 1, 185) +#define WB_1_1 W_BIG(11, 0, 1, 185) +#define WB_1_2 W_BIG(12, 0, 1, 185) +#define WB_1_3 W_BIG( 8, 0, 1, 185) +#define WB_1_4 W_BIG( 9, 0, 1, 185) +#define WB_1_5 W_BIG(13, 0, 1, 185) +#define WB_1_6 W_BIG(10, 0, 1, 185) +#define WB_1_7 W_BIG(14, 0, 1, 185) +#define WB_2_0 W_BIG(17, -256, -128, 233) +#define WB_2_1 W_BIG(18, -256, -128, 233) +#define WB_2_2 W_BIG(23, -256, -128, 233) +#define WB_2_3 W_BIG(20, -256, -128, 233) +#define WB_2_4 W_BIG(22, -256, -128, 233) +#define WB_2_5 W_BIG(21, -256, -128, 233) +#define WB_2_6 W_BIG(16, -256, -128, 233) +#define WB_2_7 W_BIG(19, -256, -128, 233) +#define WB_3_0 W_BIG(30, -383, -255, 233) +#define WB_3_1 W_BIG(24, -383, -255, 233) +#define WB_3_2 W_BIG(25, -383, -255, 233) +#define WB_3_3 W_BIG(31, -383, -255, 233) +#define WB_3_4 W_BIG(27, -383, -255, 233) +#define WB_3_5 W_BIG(29, -383, -255, 233) +#define WB_3_6 W_BIG(28, -383, -255, 233) +#define WB_3_7 W_BIG(26, -383, -255, 233) + +#define IF(x, y, z) ((((y) ^ (z)) & (x)) ^ (z)) +#define MAJ(x, y, z) (((x) & (y)) | (((x) | (y)) & (z))) + +#define PP4_0_0 1 +#define PP4_0_1 0 +#define PP4_0_2 3 +#define PP4_0_3 2 +#define PP4_1_0 2 +#define PP4_1_1 3 +#define PP4_1_2 0 +#define PP4_1_3 1 +#define PP4_2_0 3 +#define PP4_2_1 2 +#define PP4_2_2 1 +#define PP4_2_3 0 + +#define PP8_0_0 1 +#define PP8_0_1 0 +#define PP8_0_2 3 +#define PP8_0_3 2 +#define PP8_0_4 5 +#define PP8_0_5 4 +#define PP8_0_6 7 +#define PP8_0_7 6 + +#define PP8_1_0 6 +#define PP8_1_1 7 +#define PP8_1_2 4 +#define PP8_1_3 5 +#define PP8_1_4 2 +#define PP8_1_5 3 +#define PP8_1_6 0 +#define PP8_1_7 1 + +#define PP8_2_0 2 +#define PP8_2_1 3 +#define PP8_2_2 0 +#define PP8_2_3 1 +#define PP8_2_4 6 +#define PP8_2_5 7 +#define PP8_2_6 4 +#define PP8_2_7 5 + +#define PP8_3_0 3 +#define PP8_3_1 2 +#define PP8_3_2 1 +#define PP8_3_3 0 +#define PP8_3_4 7 +#define PP8_3_5 6 +#define PP8_3_6 5 +#define PP8_3_7 4 + +#define PP8_4_0 5 +#define PP8_4_1 4 +#define PP8_4_2 7 +#define PP8_4_3 6 +#define PP8_4_4 1 +#define PP8_4_5 0 +#define PP8_4_6 3 +#define PP8_4_7 2 + +#define PP8_5_0 7 +#define PP8_5_1 6 +#define PP8_5_2 5 +#define PP8_5_3 4 +#define PP8_5_4 3 +#define PP8_5_5 2 +#define PP8_5_6 1 +#define PP8_5_7 0 + +#define PP8_6_0 4 +#define PP8_6_1 5 +#define PP8_6_2 6 +#define PP8_6_3 7 +#define PP8_6_4 0 +#define PP8_6_5 1 +#define PP8_6_6 2 +#define PP8_6_7 3 + +#define STEP_ELT(n, w, fun, s, ppb) do { \ + u32 tt = T32(D ## n + (w) + fun(A ## n, B ## n, C ## n)); \ + A ## n = T32(ROL32(tt, s) + XCAT(tA, XCAT(ppb, n))); \ + D ## n = C ## n; \ + C ## n = B ## n; \ + B ## n = tA ## n; \ + } while (0) + +#define STEP_BIG(w0, w1, w2, w3, w4, w5, w6, w7, fun, r, s, pp8b) do { \ + u32 tA0 = ROL32(A0, r); \ + u32 tA1 = ROL32(A1, r); \ + u32 tA2 = ROL32(A2, r); \ + u32 tA3 = ROL32(A3, r); \ + u32 tA4 = ROL32(A4, r); \ + u32 tA5 = ROL32(A5, r); \ + u32 tA6 = ROL32(A6, r); \ + u32 tA7 = ROL32(A7, r); \ + STEP_ELT(0, w0, fun, s, pp8b); \ + STEP_ELT(1, w1, fun, s, pp8b); \ + STEP_ELT(2, w2, fun, s, pp8b); \ + STEP_ELT(3, w3, fun, s, pp8b); \ + STEP_ELT(4, w4, fun, s, pp8b); \ + STEP_ELT(5, w5, fun, s, pp8b); \ + 
STEP_ELT(6, w6, fun, s, pp8b); \ + STEP_ELT(7, w7, fun, s, pp8b); \ + } while (0) + +#define SIMD_M3_0_0 0_ +#define SIMD_M3_1_0 1_ +#define SIMD_M3_2_0 2_ +#define SIMD_M3_3_0 0_ +#define SIMD_M3_4_0 1_ +#define SIMD_M3_5_0 2_ +#define SIMD_M3_6_0 0_ +#define SIMD_M3_7_0 1_ + +#define SIMD_M3_0_1 1_ +#define SIMD_M3_1_1 2_ +#define SIMD_M3_2_1 0_ +#define SIMD_M3_3_1 1_ +#define SIMD_M3_4_1 2_ +#define SIMD_M3_5_1 0_ +#define SIMD_M3_6_1 1_ +#define SIMD_M3_7_1 2_ + +#define SIMD_M3_0_2 2_ +#define SIMD_M3_1_2 0_ +#define SIMD_M3_2_2 1_ +#define SIMD_M3_3_2 2_ +#define SIMD_M3_4_2 0_ +#define SIMD_M3_5_2 1_ +#define SIMD_M3_6_2 2_ +#define SIMD_M3_7_2 0_ + +#define M7_0_0 0_ +#define M7_1_0 1_ +#define M7_2_0 2_ +#define M7_3_0 3_ +#define M7_4_0 4_ +#define M7_5_0 5_ +#define M7_6_0 6_ +#define M7_7_0 0_ + +#define M7_0_1 1_ +#define M7_1_1 2_ +#define M7_2_1 3_ +#define M7_3_1 4_ +#define M7_4_1 5_ +#define M7_5_1 6_ +#define M7_6_1 0_ +#define M7_7_1 1_ + +#define M7_0_2 2_ +#define M7_1_2 3_ +#define M7_2_2 4_ +#define M7_3_2 5_ +#define M7_4_2 6_ +#define M7_5_2 0_ +#define M7_6_2 1_ +#define M7_7_2 2_ + +#define M7_0_3 3_ +#define M7_1_3 4_ +#define M7_2_3 5_ +#define M7_3_3 6_ +#define M7_4_3 0_ +#define M7_5_3 1_ +#define M7_6_3 2_ +#define M7_7_3 3_ + +#define STEP_BIG_(w, fun, r, s, pp8b) STEP_BIG w, fun, r, s, pp8b) + +#define ONE_ROUND_BIG(ri, isp, p0, p1, p2, p3) do { \ + STEP_BIG_(WB_ ## ri ## 0, \ + IF, p0, p1, XCAT(PP8_, M7_0_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 1, \ + IF, p1, p2, XCAT(PP8_, M7_1_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 2, \ + IF, p2, p3, XCAT(PP8_, M7_2_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 3, \ + IF, p3, p0, XCAT(PP8_, M7_3_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 4, \ + MAJ, p0, p1, XCAT(PP8_, M7_4_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 5, \ + MAJ, p1, p2, XCAT(PP8_, M7_5_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 6, \ + MAJ, p2, p3, XCAT(PP8_, M7_6_ ## isp)); \ + STEP_BIG_(WB_ ## ri ## 7, \ + MAJ, p3, p0, XCAT(PP8_, M7_7_ ## isp)); \ + } while (0) + +//__constant__ static const s32 SIMD_Q_64[] = { +// 4, 28, -80, -120, -47, -126, 45, -123, -92, -127, -70, 23, -23, -24, 40, -125, 101, 122, 34, -24, -119, 110, -121, -112, 32, 24, 51, 73, -117, -64, -21, 42, -60, 16, 5, 85, 107, 52, -44, -96, 42, 127, -18, -108, -47, 26, 91, 117, 112, 46, 87, 79, 126, -120, 65, -24, 121, 29, 118, -7, -53, 85, -98, -117, 32, 115, -47, -116, 63, 16, -108, 49, -119, 57, -110, 4, -76, -76, -42, -86, 58, 115, 4, 4, -83, -51, -37, 116, 32, 15, 36, -42, 73, -99, 94, 87, 60, -20, 67, 12, -76, 55, 117, -68, -82, -80, 93, -20, 92, -21, -128, -91, -11, 84, -28, 76, 94, -124, 37, 93, 17, -78, -106, -29, 88, -15, -47, 102, -4, -28, 80, 120, 47, 126, -45, 123, 92, 127, 70, -23, 23, 24, -40, 125, -101, -122, -34, 24, 119, -110, 121, 112, -32, -24, -51, -73, 117, 64, 21, -42, 60, -16, -5, -85, -107, -52, 44, 96, -42, -127, 18, 108, 47, -26, -91, -117, -112, -46, -87, -79, -126, 120, -65, 24, -121, -29, -118, 7, 53, -85, 98, 117, -32, -115, 47, 116, -63, -16, 108, -49, 119, -57, 110, -4, 76, 76, 42, 86, -58, -115, -4, -4, 83, 51, 37, -116, -32, -15, -36, 42, -73, 99, -94, -87, -60, 20, -67, -12, 76, -55, -117, 68, 82, 80, -93, 20, -92, 21, 128, 91, 11, -84, 28, -76, -94, 124, -37, -93, -17, 78, 106, 29, -88, 15, 47, -102 +//}; +__constant__ static const s32 SIMD_Q_80[] = { + -125, -101, 48, 8, 81, 2, -84, 5, 36, 1, 58, -106, 105, 104, -89, 3, -28, -7, -95, 104, 9, -19, 7, 16, -97, -105, -78, -56, 11, 64, 107, -87, 68, -113, -124, -44, -22, -77, 84, 32, -87, -2, 110, 20, 81, -103, -38, -12, -17, -83, 
-42, -50, -3, 8, -64, 104, -8, -100, -11, 121, 75, -44, 30, 11, -97, -14, 81, 12, -66, -113, 20, -80, 9, -72, 18, -125, 52, 52, 86, 42, -71, -14, -125, -125, 45, 77, 91, -13, -97, -114, -93, 86, -56, 29, -35, -42, -69, 108, -62, -117, 52, -74, -12, 60, 46, 48, -36, 108, -37, 107, 0, 37, 117, -45, 100, -53, -35, 4, -92, -36, -112, 50, 22, 99, -41, 113, 81, -27, 124, 100, -49, -9, -82, -3, 83, -6, -37, -2, -59, 105, -106, -105, 88, -4, 27, 6, 94, -105, -10, 18, -8, -17, 96, 104, 77, 55, -12, -65, -108, 86, -69, 112, 123, 43, 21, 76, -85, -33, 86, 1, -111, -21, -82, 102, 37, 11, 16, 82, 41, 49, 2, -9, 63, -105, 7, 99, 10, -122, -76, 43, -31, -12, 96, 13, -82, -13, 65, 112, -21, 79, -10, 71, -19, 124, -53, -53, -87, -43, 70, 13, 124, 124, -46, -78, -92, 12, 96, 113, 92, -87, 55, -30, 34, 41, 68, -109, 61, 116, -53, 73, 11, -61, -47, -49, 35, -109, 36, -108, -1, -38, -118, 44, -101, 52, 34, -5, 91, 35, 111, -51, -23, -100, 40, -114, -82, 26 +}; + +__constant__ static uint32_t c_PaddedMessage80[20]; + +__host__ +void x16_simd512_setBlock_80(void *pdata) +{ + cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice); +} + +#define TPB_SIMD 128 +__global__ +__launch_bounds__(TPB_SIMD,1) +static void x16_simd512_gpu_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_outputhash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint32_t A[20]; + #pragma unroll 10 + for (int i=0; i < 20; i += 2) + AS_UINT2(&A[i]) = AS_UINT2(&c_PaddedMessage80[i]); + A[19] = cuda_swab32(startNonce + thread); + + // simd + unsigned char x[128]; + #pragma unroll + for (int i = 0; i < 20; i += 2) + AS_UINT2(&x[i*4]) = AS_UINT2(&A[i]); + #pragma unroll + for(int i = 80; i < 128; i+=4) AS_U32(&x[i]) = 0; + + // SIMD_IV512 + u32 A0 = 0x0BA16B95, A1 = 0x72F999AD, A2 = 0x9FECC2AE, A3 = 0xBA3264FC, A4 = 0x5E894929, A5 = 0x8E9F30E5, A6 = 0x2F1DAA37, A7 = 0xF0F2C558; + u32 B0 = 0xAC506643, B1 = 0xA90635A5, B2 = 0xE25B878B, B3 = 0xAAB7878F, B4 = 0x88817F7A, B5 = 0x0A02892B, B6 = 0x559A7550, B7 = 0x598F657E; + u32 C0 = 0x7EEF60A1, C1 = 0x6B70E3E8, C2 = 0x9C1714D1, C3 = 0xB958E2A8, C4 = 0xAB02675E, C5 = 0xED1C014F, C6 = 0xCD8D65BB, C7 = 0xFDB7A257; + u32 D0 = 0x09254899, D1 = 0xD699C7BC, D2 = 0x9019B6DC, D3 = 0x2B9022E4, D4 = 0x8FA14956, D5 = 0x21BF9BD3, D6 = 0xB94D0943, D7 = 0x6FFDDC22; + + s32 q[256]; + FFT256(0, 1, 0, ll1); + + #pragma unroll + for (int i = 0; i < 256; i ++) { + s32 tq = q[i] + yoff_b_n[i]; + tq = REDS2(tq); + tq = REDS1(tq); + tq = REDS1(tq); + q[i] = (tq <= 128 ? 
tq : tq - 257); + } + + A0 ^= A[ 0]; + A1 ^= A[ 1]; + A2 ^= A[ 2]; + A3 ^= A[ 3]; + A4 ^= A[ 4]; + A5 ^= A[ 5]; + A6 ^= A[ 6]; + A7 ^= A[ 7]; + B0 ^= A[ 8]; + B1 ^= A[ 9]; + B2 ^= A[10]; + B3 ^= A[11]; + B4 ^= A[12]; + B5 ^= A[13]; + B6 ^= A[14]; + B7 ^= A[15]; + C0 ^= A[16]; + C1 ^= A[17]; + C2 ^= A[18]; + C3 ^= A[19]; + + ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27); + ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); + ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5); + ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); + + STEP_BIG( + C32(0x0BA16B95), C32(0x72F999AD), C32(0x9FECC2AE), C32(0xBA3264FC), + C32(0x5E894929), C32(0x8E9F30E5), C32(0x2F1DAA37), C32(0xF0F2C558), + IF, 4, 13, PP8_4_); + + STEP_BIG( + C32(0xAC506643), C32(0xA90635A5), C32(0xE25B878B), C32(0xAAB7878F), + C32(0x88817F7A), C32(0x0A02892B), C32(0x559A7550), C32(0x598F657E), + IF, 13, 10, PP8_5_); + + STEP_BIG( + C32(0x7EEF60A1), C32(0x6B70E3E8), C32(0x9C1714D1), C32(0xB958E2A8), + C32(0xAB02675E), C32(0xED1C014F), C32(0xCD8D65BB), C32(0xFDB7A257), + IF, 10, 25, PP8_6_); + + STEP_BIG( + C32(0x09254899), C32(0xD699C7BC), C32(0x9019B6DC), C32(0x2B9022E4), + C32(0x8FA14956), C32(0x21BF9BD3), C32(0xB94D0943), C32(0x6FFDDC22), + IF, 25, 4, PP8_0_); + + // Second round + + u32 COPY_A0 = A0, COPY_A1 = A1, COPY_A2 = A2, COPY_A3 = A3, COPY_A4 = A4, COPY_A5 = A5, COPY_A6 = A6, COPY_A7 = A7; + u32 COPY_B0 = B0, COPY_B1 = B1, COPY_B2 = B2, COPY_B3 = B3, COPY_B4 = B4, COPY_B5 = B5, COPY_B6 = B6, COPY_B7 = B7; + u32 COPY_C0 = C0, COPY_C1 = C1, COPY_C2 = C2, COPY_C3 = C3, COPY_C4 = C4, COPY_C5 = C5, COPY_C6 = C6, COPY_C7 = C7; + u32 COPY_D0 = D0, COPY_D1 = D1, COPY_D2 = D2, COPY_D3 = D3, COPY_D4 = D4, COPY_D5 = D5, COPY_D6 = D6, COPY_D7 = D7; + + #define q SIMD_Q_80 + + A0 ^= 0x280; // bitlen + + ONE_ROUND_BIG(0_, 0, 3, 23, 17, 27); + ONE_ROUND_BIG(1_, 1, 28, 19, 22, 7); + ONE_ROUND_BIG(2_, 2, 29, 9, 15, 5); + ONE_ROUND_BIG(3_, 3, 4, 13, 10, 25); + + STEP_BIG( + COPY_A0, COPY_A1, COPY_A2, COPY_A3, + COPY_A4, COPY_A5, COPY_A6, COPY_A7, + IF, 4, 13, PP8_4_); + + STEP_BIG( + COPY_B0, COPY_B1, COPY_B2, COPY_B3, + COPY_B4, COPY_B5, COPY_B6, COPY_B7, + IF, 13, 10, PP8_5_); + + STEP_BIG( + COPY_C0, COPY_C1, COPY_C2, COPY_C3, + COPY_C4, COPY_C5, COPY_C6, COPY_C7, + IF, 10, 25, PP8_6_); + + STEP_BIG( + COPY_D0, COPY_D1, COPY_D2, COPY_D3, + COPY_D4, COPY_D5, COPY_D6, COPY_D7, + IF, 25, 4, PP8_0_); + + #undef q + + A[ 0] = A0; + A[ 1] = A1; + A[ 2] = A2; + A[ 3] = A3; + A[ 4] = A4; + A[ 5] = A5; + A[ 6] = A6; + A[ 7] = A7; + A[ 8] = B0; + A[ 9] = B1; + A[10] = B2; + A[11] = B3; + A[12] = B4; + A[13] = B5; + A[14] = B6; + A[15] = B7; + + const uint64_t hashPosition = thread; + uint32_t *Hash = (uint32_t*)(&g_outputhash[(size_t)8 * hashPosition]); + #pragma unroll + for (int i=0; i < 16; i += 2) + *(uint2*)&Hash[i] = *(uint2*)&A[i]; + } +} + +/***************************************************/ + +__host__ +void x16_simd512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash) +{ + const uint32_t tpb = 128; + const dim3 grid((threads + tpb - 1) / tpb); + const dim3 block(tpb); + x16_simd512_gpu_80 <<>> (threads, startNonce, (uint64_t*) d_hash); +} diff --git a/x16r/cuda_x16r.h b/x16r/cuda_x16r.h new file mode 100644 index 0000000..976793c --- /dev/null +++ b/x16r/cuda_x16r.h @@ -0,0 +1,75 @@ +#include "x11/cuda_x11.h" + +extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads); +extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void 
x13_fugue512_cpu_init(int thr_id, uint32_t threads); +extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x13_fugue512_cpu_free(int thr_id); + +extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads); +extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); + +extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int flag); +extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order); +extern void x15_whirlpool_cpu_free(int thr_id); + +extern void x17_sha512_cpu_init(int thr_id, uint32_t threads); +extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash); + +extern void x17_haval256_cpu_init(int thr_id, uint32_t threads); +extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, const int outlen); + +void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order); + +// ---- 80 bytes kernels + +void quark_bmw512_cpu_setBlock_80(void *pdata); +void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order); + +void groestl512_setBlock_80(int thr_id, uint32_t *endiandata); +void groestl512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void skein512_cpu_setBlock_80(void *pdata); +void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int swap); + +void qubit_luffa512_cpu_init(int thr_id, uint32_t threads); +void qubit_luffa512_cpu_setBlock_80(void *pdata); +void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order); + +void jh512_setBlock_80(int thr_id, uint32_t *endiandata); +void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void keccak512_setBlock_80(int thr_id, uint32_t *endiandata); +void keccak512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata); +void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void x11_shavite512_setBlock_80(void *pdata); +void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order); + +void x16_shabal512_setBlock_80(void *pdata); +void x16_shabal512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void x16_simd512_setBlock_80(void *pdata); +void x16_simd512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void x16_echo512_cuda_init(int thr_id, const uint32_t threads); +void x16_echo512_setBlock_80(void *pdata); +void x16_echo512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void x16_hamsi512_setBlock_80(void *pdata); +void x16_hamsi512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void x16_fugue512_cpu_init(int thr_id, uint32_t threads); +void x16_fugue512_cpu_free(int thr_id); +void x16_fugue512_setBlock_80(void 
*pdata); +void x16_fugue512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void x16_whirlpool512_init(int thr_id, uint32_t threads); +void x16_whirlpool512_setBlock_80(void* endiandata); +void x16_whirlpool512_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); + +void x16_sha512_setBlock_80(void *pdata); +void x16_sha512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash); diff --git a/x16r/x16r.cu b/x16r/x16r.cu new file mode 100644 index 0000000..65c2155 --- /dev/null +++ b/x16r/x16r.cu @@ -0,0 +1,625 @@ +/** + * X16R algorithm (X16 with Randomized chain order) + * + * tpruvot 2018 - GPL code + */ + +#include +#include +#include + +extern "C" { +#include "sph/sph_blake.h" +#include "sph/sph_bmw.h" +#include "sph/sph_groestl.h" +#include "sph/sph_skein.h" +#include "sph/sph_jh.h" +#include "sph/sph_keccak.h" + +#include "sph/sph_luffa.h" +#include "sph/sph_cubehash.h" +#include "sph/sph_shavite.h" +#include "sph/sph_simd.h" +#include "sph/sph_echo.h" + +#include "sph/sph_hamsi.h" +#include "sph/sph_fugue.h" +#include "sph/sph_shabal.h" +#include "sph/sph_whirlpool.h" +#include "sph/sph_sha2.h" +} + +#include "miner.h" +#include "cuda_helper.h" +#include "cuda_x16r.h" + +static uint32_t *d_hash[MAX_GPUS]; + +enum Algo { + BLAKE = 0, + BMW, + GROESTL, + JH, + KECCAK, + SKEIN, + LUFFA, + CUBEHASH, + SHAVITE, + SIMD, + ECHO, + HAMSI, + FUGUE, + SHABAL, + WHIRLPOOL, + SHA512, + HASH_FUNC_COUNT +}; + +static const char* algo_strings[] = { + "blake", + "bmw512", + "groestl", + "jh512", + "keccak", + "skein", + "luffa", + "cube", + "shavite", + "simd", + "echo", + "hamsi", + "fugue", + "shabal", + "whirlpool", + "sha512", + NULL +}; + +static __thread uint32_t s_ntime = UINT32_MAX; +static __thread bool s_implemented = false; +static __thread char hashOrder[HASH_FUNC_COUNT + 1] = { 0 }; + +static void getAlgoString(const uint32_t* prevblock, char *output) +{ + char *sptr = output; + uint8_t* data = (uint8_t*)prevblock; + + for (uint8_t j = 0; j < HASH_FUNC_COUNT; j++) { + uint8_t b = (15 - j) >> 1; // 16 ascii hex chars, reversed + uint8_t algoDigit = (j & 1) ? data[b] & 0xF : data[b] >> 4; + if (algoDigit >= 10) + sprintf(sptr, "%c", 'A' + (algoDigit - 10)); + else + sprintf(sptr, "%u", (uint32_t) algoDigit); + sptr++; + } + *sptr = '\0'; +} + +// X16R CPU Hash (Validation) +extern "C" void x16r_hash(void *output, const void *input) +{ + unsigned char _ALIGN(64) hash[128]; + + sph_blake512_context ctx_blake; + sph_bmw512_context ctx_bmw; + sph_groestl512_context ctx_groestl; + sph_jh512_context ctx_jh; + sph_keccak512_context ctx_keccak; + sph_skein512_context ctx_skein; + sph_luffa512_context ctx_luffa; + sph_cubehash512_context ctx_cubehash; + sph_shavite512_context ctx_shavite; + sph_simd512_context ctx_simd; + sph_echo512_context ctx_echo; + sph_hamsi512_context ctx_hamsi; + sph_fugue512_context ctx_fugue; + sph_shabal512_context ctx_shabal; + sph_whirlpool_context ctx_whirlpool; + sph_sha512_context ctx_sha512; + + void *in = (void*) input; + int size = 80; + + uint32_t *in32 = (uint32_t*) input; + getAlgoString(&in32[1], hashOrder); + + for (int i = 0; i < 16; i++) + { + const char elem = hashOrder[i]; + const uint8_t algo = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; + + switch (algo) { + case BLAKE: + sph_blake512_init(&ctx_blake); + sph_blake512(&ctx_blake, in, size); + sph_blake512_close(&ctx_blake, hash); + break; + case BMW: + sph_bmw512_init(&ctx_bmw); + sph_bmw512(&ctx_bmw, in, size); + sph_bmw512_close(&ctx_bmw, hash); + break; + case GROESTL: + sph_groestl512_init(&ctx_groestl); + sph_groestl512(&ctx_groestl, in, size); + sph_groestl512_close(&ctx_groestl, hash); + break; + case SKEIN: + sph_skein512_init(&ctx_skein); + sph_skein512(&ctx_skein, in, size); + sph_skein512_close(&ctx_skein, hash); + break; + case JH: + sph_jh512_init(&ctx_jh); + sph_jh512(&ctx_jh, in, size); + sph_jh512_close(&ctx_jh, hash); + break; + case KECCAK: + sph_keccak512_init(&ctx_keccak); + sph_keccak512(&ctx_keccak, in, size); + sph_keccak512_close(&ctx_keccak, hash); + break; + case LUFFA: + sph_luffa512_init(&ctx_luffa); + sph_luffa512(&ctx_luffa, in, size); + sph_luffa512_close(&ctx_luffa, hash); + break; + case CUBEHASH: + sph_cubehash512_init(&ctx_cubehash); + sph_cubehash512(&ctx_cubehash, in, size); + sph_cubehash512_close(&ctx_cubehash, hash); + break; + case SHAVITE: + sph_shavite512_init(&ctx_shavite); + sph_shavite512(&ctx_shavite, in, size); + sph_shavite512_close(&ctx_shavite, hash); + break; + case SIMD: + sph_simd512_init(&ctx_simd); + sph_simd512(&ctx_simd, in, size); + sph_simd512_close(&ctx_simd, hash); + break; + case ECHO: + sph_echo512_init(&ctx_echo); + sph_echo512(&ctx_echo, in, size); + sph_echo512_close(&ctx_echo, hash); + break; + case HAMSI: + sph_hamsi512_init(&ctx_hamsi); + sph_hamsi512(&ctx_hamsi, in, size); + sph_hamsi512_close(&ctx_hamsi, hash); + break; + case FUGUE: + sph_fugue512_init(&ctx_fugue); + sph_fugue512(&ctx_fugue, in, size); + sph_fugue512_close(&ctx_fugue, hash); + break; + case SHABAL: + sph_shabal512_init(&ctx_shabal); + sph_shabal512(&ctx_shabal, in, size); + sph_shabal512_close(&ctx_shabal, hash); + break; + case WHIRLPOOL: + sph_whirlpool_init(&ctx_whirlpool); + sph_whirlpool(&ctx_whirlpool, in, size); + sph_whirlpool_close(&ctx_whirlpool, hash); + break; + case SHA512: + sph_sha512_init(&ctx_sha512); + sph_sha512(&ctx_sha512,(const void*) in, size); + sph_sha512_close(&ctx_sha512,(void*) hash); + break; + } + in = (void*) hash; + size = 64; + } + memcpy(output, hash, 32); +} + +void whirlpool_midstate(void *state, const void *input) +{ + sph_whirlpool_context ctx; + + sph_whirlpool_init(&ctx); + sph_whirlpool(&ctx, input, 64); + + memcpy(state, ctx.state, 64); +} + +static bool init[MAX_GPUS] = { 0 }; + +//#define _DEBUG +#define _DEBUG_PREFIX "x16r-" +#include "cuda_debug.cuh" + +//static int algo80_tests[HASH_FUNC_COUNT] = { 0 }; +//static int algo64_tests[HASH_FUNC_COUNT] = { 0 }; +static int algo80_fails[HASH_FUNC_COUNT] = { 0 }; + +extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done) +{ + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + const uint32_t first_nonce = pdata[19]; + const int dev_id = device_map[thr_id]; + int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 
20 : 19; + if (strstr(device_name[dev_id], "GTX 1080")) intensity = 20; + uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); + //if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce); + + if (!init[thr_id]) + { + cudaSetDevice(device_map[thr_id]); + if (opt_cudaschedule == -1 && gpu_threads == 1) { + cudaDeviceReset(); + // reduce cpu usage + cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync); + } + gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput); + + quark_blake512_cpu_init(thr_id, throughput); + quark_bmw512_cpu_init(thr_id, throughput); + quark_groestl512_cpu_init(thr_id, throughput); + quark_skein512_cpu_init(thr_id, throughput); + quark_jh512_cpu_init(thr_id, throughput); + quark_keccak512_cpu_init(thr_id, throughput); + qubit_luffa512_cpu_init(thr_id, throughput); + x11_luffa512_cpu_init(thr_id, throughput); // 64 + x11_shavite512_cpu_init(thr_id, throughput); + x11_simd512_cpu_init(thr_id, throughput); // 64 + x11_echo512_cpu_init(thr_id, throughput); + x16_echo512_cuda_init(thr_id, throughput); + x13_hamsi512_cpu_init(thr_id, throughput); + x13_fugue512_cpu_init(thr_id, throughput); + x16_fugue512_cpu_init(thr_id, throughput); + x14_shabal512_cpu_init(thr_id, throughput); + x15_whirlpool_cpu_init(thr_id, throughput, 0); + x16_whirlpool512_init(thr_id, throughput); + x17_sha512_cpu_init(thr_id, throughput); + + CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0); + + cuda_check_cpu_init(thr_id, throughput); + + init[thr_id] = true; + } + + if (opt_benchmark) { + ((uint32_t*)ptarget)[7] = 0x003f; + ((uint8_t*)pdata)[8] = 0x90; // hashOrder[0] = '9'; for simd 80 + blake512 64 + //((uint8_t*)pdata)[8] = 0xA0; // hashOrder[0] = 'A'; for echo 80 + blake512 64 + //((uint8_t*)pdata)[8] = 0xB0; // hashOrder[0] = 'B'; for hamsi 80 + blake512 64 + //((uint8_t*)pdata)[8] = 0xC0; // hashOrder[0] = 'C'; for fugue 80 + blake512 64 + //((uint8_t*)pdata)[8] = 0xE0; // hashOrder[0] = 'E'; for whirlpool 80 + blake512 64 + } + uint32_t _ALIGN(64) endiandata[20]; + + for (int k=0; k < 19; k++) + be32enc(&endiandata[k], pdata[k]); + + uint32_t ntime = swab32(pdata[17]); + if (s_ntime != ntime) { + getAlgoString(&endiandata[1], hashOrder); + s_ntime = ntime; + s_implemented = true; + if (opt_debug && !thr_id) applog(LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime); + } + + if (!s_implemented) { + sleep(1); + return -1; + } + + cuda_check_cpu_setTarget(ptarget); + + char elem = hashOrder[0]; + const uint8_t algo80 = elem >= 'A' ? 
elem - 'A' + 10 : elem - '0'; + + switch (algo80) { + case BLAKE: + quark_blake512_cpu_setBlock_80(thr_id, endiandata); + break; + case BMW: + quark_bmw512_cpu_setBlock_80(endiandata); + break; + case GROESTL: + groestl512_setBlock_80(thr_id, endiandata); + break; + case JH: + jh512_setBlock_80(thr_id, endiandata); + break; + case KECCAK: + keccak512_setBlock_80(thr_id, endiandata); + break; + case SKEIN: + skein512_cpu_setBlock_80((void*)endiandata); + break; + case LUFFA: + qubit_luffa512_cpu_setBlock_80((void*)endiandata); + break; + case CUBEHASH: + cubehash512_setBlock_80(thr_id, endiandata); + break; + case SHAVITE: + x11_shavite512_setBlock_80((void*)endiandata); + break; + case SIMD: + x16_simd512_setBlock_80((void*)endiandata); + break; + case ECHO: + x16_echo512_setBlock_80((void*)endiandata); + break; + case HAMSI: + x16_hamsi512_setBlock_80((void*)endiandata); + break; + case FUGUE: + x16_fugue512_setBlock_80((void*)pdata); + break; + case SHABAL: + x16_shabal512_setBlock_80((void*)endiandata); + break; + case WHIRLPOOL: + x16_whirlpool512_setBlock_80((void*)endiandata); + break; + case SHA512: + x16_sha512_setBlock_80(endiandata); + break; + default: { + if (!thr_id) + applog(LOG_WARNING, "kernel %s %c unimplemented, order %s", algo_strings[algo80], elem, hashOrder); + s_implemented = false; + sleep(5); + return -1; + } + } + + int warn = 0; + + do { + int order = 0; + + // Hash with CUDA + + switch (algo80) { + case BLAKE: + quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("blake80:"); + break; + case BMW: + quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("bmw80 :"); + break; + case GROESTL: + groestl512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("grstl80:"); + break; + case JH: + jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("jh51280:"); + break; + case KECCAK: + keccak512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("kecck80:"); + break; + case SKEIN: + skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++; + TRACE("skein80:"); + break; + case LUFFA: + qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("luffa80:"); + break; + case CUBEHASH: + cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("cube 80:"); + break; + case SHAVITE: + x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++); + TRACE("shavite:"); + break; + case SIMD: + x16_simd512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("simd512:"); + break; + case ECHO: + x16_echo512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("echo :"); + break; + case HAMSI: + x16_hamsi512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("hamsi :"); + break; + case FUGUE: + x16_fugue512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("fugue :"); + break; + case SHABAL: + x16_shabal512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("shabal :"); + break; + case WHIRLPOOL: + x16_whirlpool512_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("whirl :"); + break; + case SHA512: + x16_sha512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("sha512 :"); + break; + } + + for (int i = 1; i < 16; i++) + { + const char elem 
= hashOrder[i]; + const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + + switch (algo64) { + case BLAKE: + quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("blake :"); + break; + case BMW: + quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("bmw :"); + break; + case GROESTL: + quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("groestl:"); + break; + case JH: + quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("jh512 :"); + break; + case KECCAK: + quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("keccak :"); + break; + case SKEIN: + quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("skein :"); + break; + case LUFFA: + x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("luffa :"); + break; + case CUBEHASH: + x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("cube :"); + break; + case SHAVITE: + x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("shavite:"); + break; + case SIMD: + x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("simd :"); + break; + case ECHO: + x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("echo :"); + break; + case HAMSI: + x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("hamsi :"); + break; + case FUGUE: + x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("fugue :"); + break; + case SHABAL: + x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("shabal :"); + break; + case WHIRLPOOL: + x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++); + TRACE("shabal :"); + break; + case SHA512: + x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++; + TRACE("sha512 :"); + break; + } + } + + *hashes_done = pdata[19] - first_nonce + throughput; + + work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]); +#ifdef _DEBUG + uint32_t _ALIGN(64) dhash[8]; + be32enc(&endiandata[19], pdata[19]); + x16r_hash(dhash, endiandata); + applog_hash(dhash); + return -1; +#endif + if (work->nonces[0] != UINT32_MAX) + { + const uint32_t Htarg = ptarget[7]; + uint32_t _ALIGN(64) vhash[8]; + be32enc(&endiandata[19], work->nonces[0]); + x16r_hash(vhash, endiandata); + + if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) { + work->valid_nonces = 1; + work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1); + work_set_target_ratio(work, vhash); + if (work->nonces[1] != 0) { + be32enc(&endiandata[19], work->nonces[1]); + x16r_hash(vhash, endiandata); + bn_set_target_ratio(work, vhash, 1); + work->valid_nonces++; + pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; + } else { + pdata[19] = work->nonces[0] + 1; // cursor + } +#if 0 + gpulog(LOG_INFO, thr_id, "hash found with %s 80!", algo_strings[algo80]); + + algo80_tests[algo80] += work->valid_nonces; + char oks64[128] = { 0 }; + char oks80[128] = { 0 }; + char fails[128] = { 0 }; + for (int a = 0; a < HASH_FUNC_COUNT; a++) { + const char elem = hashOrder[a]; + const 
uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0'; + if (a > 0) algo64_tests[algo64] += work->valid_nonces; + sprintf(&oks64[strlen(oks64)], "|%X:%2d", a, algo64_tests[a] < 100 ? algo64_tests[a] : 99); + sprintf(&oks80[strlen(oks80)], "|%X:%2d", a, algo80_tests[a] < 100 ? algo80_tests[a] : 99); + sprintf(&fails[strlen(fails)], "|%X:%2d", a, algo80_fails[a] < 100 ? algo80_fails[a] : 99); + } + applog(LOG_INFO, "K64: %s", oks64); + applog(LOG_INFO, "K80: %s", oks80); + applog(LOG_ERR, "F80: %s", fails); +#endif + return work->valid_nonces; + } + else if (vhash[7] > Htarg) { + // x11+ coins could do some random error, but not on retry + gpu_increment_reject(thr_id); + algo80_fails[algo80]++; + if (!warn) { + warn++; + pdata[19] = work->nonces[0] + 1; + continue; + } else { + if (!opt_quiet) gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU! %s %s", + work->nonces[0], algo_strings[algo80], hashOrder); + warn = 0; + } + } + } + + if ((uint64_t)throughput + pdata[19] >= max_nonce) { + pdata[19] = max_nonce; + break; + } + + pdata[19] += throughput; + + } while (pdata[19] < max_nonce && !work_restart[thr_id].restart); + + *hashes_done = pdata[19] - first_nonce; + return 0; +} + +// cleanup +extern "C" void free_x16r(int thr_id) +{ + if (!init[thr_id]) + return; + + cudaThreadSynchronize(); + + cudaFree(d_hash[thr_id]); + + quark_blake512_cpu_free(thr_id); + quark_groestl512_cpu_free(thr_id); + x11_simd512_cpu_free(thr_id); + x13_fugue512_cpu_free(thr_id); + x16_fugue512_cpu_free(thr_id); // to merge with x13_fugue512 ? + x15_whirlpool_cpu_free(thr_id); + + cuda_check_cpu_free(thr_id); + + cudaDeviceSynchronize(); + init[thr_id] = false; +} diff --git a/x17/cuda_x17_sha512.cu b/x17/cuda_x17_sha512.cu index bebf17d..a0757d0 100644 --- a/x17/cuda_x17_sha512.cu +++ b/x17/cuda_x17_sha512.cu @@ -169,3 +169,80 @@ void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, x17_sha512_gpu_hash_64 <<>> (threads, (uint64_t*)d_hash); } + +__constant__ +static uint64_t c_PaddedMessage80[10]; + +__global__ +/*__launch_bounds__(256, 4)*/ +void x16_sha512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash) +{ + const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x); + if (thread < threads) + { + uint64_t W[80]; + #pragma unroll + for (int i = 0; i < 9; i ++) { + W[i] = SWAP64(c_PaddedMessage80[i]); + } + const uint32_t nonce = startNonce + thread; + //((uint32_t*)W)[19] = cuda_swab32(nonce); + W[9] = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(nonce)); + W[9] = cuda_swab64(W[9]); + W[10] = 0x8000000000000000; + + #pragma unroll + for (int i = 11; i<15; i++) { + W[i] = 0U; + } + W[15] = 0x0000000000000280; + + #pragma unroll 64 + for (int i = 16; i < 80; i ++) { + W[i] = SSG5_1(W[i-2]) + W[i-7]; + W[i] += SSG5_0(W[i-15]) + W[i-16]; + } + + const uint64_t IV512[8] = { + 0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, + 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1, + 0x510E527FADE682D1, 0x9B05688C2B3E6C1F, + 0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179 + }; + + uint64_t r[8]; + #pragma unroll + for (int i = 0; i < 8; i++) { + r[i] = IV512[i]; + } + + #pragma unroll + for (int i = 0; i < 80; i++) { + SHA3_STEP(c_WB, r, W, i&7, i); + } + + const uint64_t hashPosition = thread; + uint64_t *pHash = &g_hash[hashPosition << 3]; + #pragma unroll + for (int u = 0; u < 8; u ++) { + pHash[u] = SWAP64(r[u] + IV512[u]); + } + } +} + +__host__ +void x16_sha512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash) 
+{
+	const uint32_t threadsperblock = 256;
+
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+
+	x16_sha512_gpu_hash_80 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash);
+}
+
+__host__
+void x16_sha512_setBlock_80(void *pdata)
+{
+	cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
+}
\ No newline at end of file
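
For reference, the chain-order derivation performed by getAlgoString() in x16r/x16r.cu can be illustrated with a small standalone host program. This is only a sketch, not part of the patch: it assumes a little-endian host, and the sample prevhash words are arbitrary values picked so that every hex digit (hence every algorithm) appears exactly once, which real block hashes do not guarantee.

#include <stdio.h>
#include <stdint.h>

/* Same 16 algorithms, in the order of enum Algo in x16r/x16r.cu */
static const char *algo_strings[16] = {
	"blake", "bmw512", "groestl", "jh512", "keccak", "skein",
	"luffa", "cube", "shavite", "simd", "echo", "hamsi",
	"fugue", "shabal", "whirlpool", "sha512"
};

/* Mirror of getAlgoString(): one hex digit per round, read from the
 * first 8 bytes of the previous block hash field, high nibble first,
 * walking from byte 7 down to byte 0. */
static void get_algo_string(const uint32_t *prevblock, char *output)
{
	const uint8_t *data = (const uint8_t*) prevblock;
	for (int j = 0; j < 16; j++) {
		uint8_t b = (15 - j) >> 1;
		uint8_t digit = (j & 1) ? (data[b] & 0xF) : (data[b] >> 4);
		output[j] = (digit >= 10) ? ('A' + digit - 10) : ('0' + digit);
	}
	output[16] = '\0';
}

int main(void)
{
	/* Arbitrary stand-in for pdata[1..8] (previous block hash in the header). */
	uint32_t prevhash[8] = { 0x89ABCDEF, 0x01234567, 0, 0, 0, 0, 0, 0 };
	char order[17];

	get_algo_string(prevhash, order);
	printf("hash order %s\n", order);   /* prints 0123456789ABCDEF on a little-endian host */
	for (int i = 0; i < 16; i++) {
		uint8_t a = (order[i] >= 'A') ? (order[i] - 'A' + 10) : (order[i] - '0');
		printf("  round %2d: %s\n", i, algo_strings[a]);
	}
	return 0;
}

The decoding in main() matches how scanhash_x16r() and x16r_hash() turn each hashOrder[] character back into an algorithm index (elem >= 'A' ? elem - 'A' + 10 : elem - '0').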