x16r algo and new kernels

Was a very long work but finally working, and unlike xevan these new kernels are reusable.. Signed-off-by: Tanguy Pruvot <tanguy.pruvot@gmail.com>
7 years ago · 78dad7dd65
20 changed files with 4047 additions and 446 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -76,6 +76,8 @@ ccminer_SOURCES	= elist.h miner.h compat.h \
 			  x13/hsr.cu x13/cuda_hsr_sm3.cu x13/sm3.c \
 			  x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \
 			  x15/whirlpool.cu x15/cuda_x15_whirlpool_sm3.cu \
 			  x16r/x16r.cu x16r/cuda_x16_echo512.cu x16r/cuda_x16_fugue512.cu \
 			  x16r/cuda_x16_shabal512.cu x16r/cuda_x16_simd512_80.cu \
 			  x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \
 			  x11/phi.cu x11/cuda_streebog_maxwell.cu \
 			  x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu
--- a/README.txt
+++ b/README.txt
@ -1,5 +1,5 @@
-ccminer 2.2.4 (Jan. 2018)     "lyra2v2 and keccak improvements"
+ccminer 2.2.5 (Feb 2018)                            "x16r algo"
 ---------------------------------------------------------------
 ***************************************************************
@ -122,6 +122,7 @@ its command line interface and options.
                          x11         use to mine DarkCoin
                          x14         use to mine X14Coin
                          x15         use to mine Halcyon
                          x16r        use to mine Raven
                          x17         use to mine X17
                          vanilla     use to mine Vanilla (Blake256)
                          veltor      use to mine VeltorCoin
@ -277,6 +278,9 @@ so we can more efficiently implement new algorithms using the latest hardware
 features.
 >>> RELEASE HISTORY <<<
  Feb. 2017       v2.2.5
                  New x16r algo
  Jan. 04th 2017  v2.2.4
                  Improve lyra2v2
                  Higher keccak default intensity
--- a/algos.h
+++ b/algos.h
@ -59,6 +59,7 @@ enum sha_algos {
 	ALGO_X13,
 	ALGO_X14,
 	ALGO_X15,
 	ALGO_X16R,
 	ALGO_X17,
 	ALGO_VANILLA,
 	ALGO_VELTOR,
@ -128,6 +129,7 @@ static const char *algo_names[] = {
 	"x13",
 	"x14",
 	"x15",
 	"x16r",
 	"x17",
 	"vanilla",
 	"veltor",
--- a/bench.cpp
+++ b/bench.cpp
@ -102,6 +102,7 @@ void algo_free_all(int thr_id)
 	free_x13(thr_id);
 	free_x14(thr_id);
 	free_x15(thr_id);
 	free_x16r(thr_id);
 	free_x17(thr_id);
 	free_zr5(thr_id);
 	free_scrypt(thr_id);
--- a/ccminer.cpp
+++ b/ccminer.cpp
@ -293,6 +293,7 @@ Options:\n\
 			x13         X13 (MaruCoin)\n\
 			x14         X14\n\
 			x15         X15\n\
 			x16r        X16R (Raven)\n\
 			x17         X17\n\
 			wildkeccak  Boolberry\n\
 			zr5         ZR5 (ZiftrCoin)\n\
@ -1705,6 +1706,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 		case ALGO_LYRA2Z:
 		case ALGO_TIMETRAVEL:
 		case ALGO_BITCORE:
 		case ALGO_X16R:
 			work_set_target(work, sctx->job.diff / (256.0 * opt_difficulty));
 			break;
 		case ALGO_KECCAK:
@ -2499,6 +2501,9 @@ static void *miner_thread(void *userdata)
 		case ALGO_X15:
 			rc = scanhash_x15(thr_id, &work, max_nonce, &hashes_done);
 			break;
 		case ALGO_X16R:
 			rc = scanhash_x16r(thr_id, &work, max_nonce, &hashes_done);
 			break;
 		case ALGO_X17:
 			rc = scanhash_x17(thr_id, &work, max_nonce, &hashes_done);
 			break;
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -269,6 +269,7 @@
    <ClCompile Include="neoscrypt\neoscrypt-cpu.c" />
    <ClInclude Include="neoscrypt\cuda_vectors.h" />
    <ClInclude Include="x11\cuda_x11_simd512_sm2.cuh" />
    <ClInclude Include="x16r\cuda_x16r.h" />
    <CudaCompile Include="Algo256\bmw.cu" />
    <CudaCompile Include="Algo256\cuda_bmw.cu">
      <MaxRegCount>76</MaxRegCount>
@ -434,6 +435,12 @@
    <CudaCompile Include="sha256\sha256d.cu" />
    <CudaCompile Include="sha256\cuda_sha256t.cu" />
    <CudaCompile Include="sha256\sha256t.cu" />
    <CudaCompile Include="x15\cuda_x15_whirlpool_sm3.cu" />
    <CudaCompile Include="x16r\x16r.cu" />
    <CudaCompile Include="x16r\cuda_x16_echo512.cu" />
    <CudaCompile Include="x16r\cuda_x16_fugue512.cu" />
    <CudaCompile Include="x16r\cuda_x16_shabal512.cu" />
    <CudaCompile Include="x16r\cuda_x16_simd512_80.cu" />
    <CudaCompile Include="zr5.cu" />
    <CudaCompile Include="heavy\cuda_blake512.cu">
    </CudaCompile>
@ -587,8 +594,7 @@
    <CudaCompile Include="x17\hmq17.cu" />
    <CudaCompile Include="x15\x15.cu" />
    <CudaCompile Include="x15\whirlpool.cu" />
-    <CudaCompile Include="x17\x17.cu">
+    <CudaCompile Include="x17\x17.cu" />
    </CudaCompile>
    <CudaCompile Include="x17\cuda_x17_haval256.cu">
    </CudaCompile>
    <CudaCompile Include="x17\cuda_x17_sha512.cu">
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@ -58,6 +58,9 @@
    <Filter Include="Source Files\CUDA\x15">
      <UniqueIdentifier>{a2403c22-6777-46ab-a55a-3fcc7386c974}</UniqueIdentifier>
    </Filter>
    <Filter Include="Source Files\CUDA\x16r">
      <UniqueIdentifier>{55dfae6a-66ba-43e2-8ceb-98ee70cbdf16}</UniqueIdentifier>
    </Filter>
    <Filter Include="Source Files\CUDA\x17">
      <UniqueIdentifier>{85dfae6a-66ca-4332-8cec-98ee70cbdf2f}</UniqueIdentifier>
    </Filter>
@ -596,6 +599,9 @@
    <ClInclude Include="equi\equihash.h">
      <Filter>Source Files\equi</Filter>
    </ClInclude>
    <ClInclude Include="x16r\cuda_x16r.h">
      <Filter>Header Files\CUDA</Filter>
    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <CudaCompile Include="cuda.cpp">
@ -967,6 +973,24 @@
    <CudaCompile Include="equi\cuda_equi.cu">
      <Filter>Source Files\equi</Filter>
    </CudaCompile>
    <CudaCompile Include="x15\cuda_x15_whirlpool_sm3.cu">
      <Filter>Source Files\CUDA\x15</Filter>
    </CudaCompile>
    <CudaCompile Include="x16r\cuda_x16_echo512.cu">
      <Filter>Source Files\CUDA\x16r</Filter>
    </CudaCompile>
    <CudaCompile Include="x16r\cuda_x16_fugue512.cu">
      <Filter>Source Files\CUDA\x16r</Filter>
    </CudaCompile>
    <CudaCompile Include="x16r\cuda_x16_shabal512.cu">
      <Filter>Source Files\CUDA\x16r</Filter>
    </CudaCompile>
    <CudaCompile Include="x16r\cuda_x16_simd512_80.cu">
      <Filter>Source Files\CUDA\x16r</Filter>
    </CudaCompile>
    <CudaCompile Include="x16r\x16r.cu">
      <Filter>Source Files\CUDA\x16r</Filter>
    </CudaCompile>
  </ItemGroup>
  <ItemGroup>
    <Image Include="res\ccminer.ico">
--- a/compat/ccminer-config.h
+++ b/compat/ccminer-config.h
@ -164,7 +164,7 @@
 #define PACKAGE_URL "http://github.com/tpruvot/ccminer"
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "2.2.4"
+#define PACKAGE_VERSION "2.2.5"
 /* If using the C implementation of alloca, define if you know the
   direction of stack growth for your system; otherwise it will be
--- a/miner.h
+++ b/miner.h
@ -325,6 +325,7 @@ extern int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, unsig
 extern int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_x14(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_x15(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_zr5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
@ -389,6 +390,7 @@ extern void free_x11(int thr_id);
 extern void free_x13(int thr_id);
 extern void free_x14(int thr_id);
 extern void free_x15(int thr_id);
 extern void free_x16r(int thr_id);
 extern void free_x17(int thr_id);
 extern void free_zr5(int thr_id);
 //extern void free_sha256d(int thr_id);
@ -933,6 +935,7 @@ void x11hash(void *output, const void *input);
 void x13hash(void *output, const void *input);
 void x14hash(void *output, const void *input);
 void x15hash(void *output, const void *input);
 void x16r_hash(void *output, const void *input);
 void x17hash(void *output, const void *input);
 void wildkeccak_hash(void *output, const void *input, uint64_t* scratchpad, uint64_t ssize);
 void zr5hash(void *output, const void *input);
--- a/res/ccminer.rc
+++ b/res/ccminer.rc
@ -60,8 +60,8 @@ IDI_ICON1               ICON                    "ccminer.ico"
 //
 VS_VERSION_INFO VERSIONINFO
- FILEVERSION 2,2,4,0
+ FILEVERSION 2,2,5,0
- PRODUCTVERSION 2,2,4,0
+ PRODUCTVERSION 2,2,5,0
 FILEFLAGSMASK 0x3fL
 #ifdef _DEBUG
 FILEFLAGS 0x21L
@ -76,10 +76,10 @@ BEGIN
    BEGIN
        BLOCK "040904e4"
        BEGIN
-            VALUE "FileVersion", "2.2.4"
+            VALUE "FileVersion", "2.2.5"
            VALUE "LegalCopyright", "Copyright (C) 2018"
            VALUE "ProductName", "ccminer"
-            VALUE "ProductVersion", "2.2.4"
+            VALUE "ProductVersion", "2.2.5"
        END
    END
    BLOCK "VarFileInfo"
--- a/util.cpp
+++ b/util.cpp
@ -2325,6 +2325,9 @@ void print_hash_tests(void)
 	x15hash(&hash[0], &buf[0]);
 	printpfx("X15", hash);
 	x16r_hash(&hash[0], &buf[0]);
 	printpfx("X16r", hash);
 	x17hash(&hash[0], &buf[0]);
 	printpfx("X17", hash);
--- a/x13/cuda_x13_hamsi512.cu
+++ b/x13/cuda_x13_hamsi512.cu
@ -1,6 +1,6 @@
 /*
- * Quick Hamsi-512 for X13
+ * Quick Hamsi-512 for X13 by tsiv - 2014
- * by tsiv - 2014
+ * + Hamsi-512 80 by tpruvot - 2018
 */
 #include <stdio.h>
@ -16,31 +16,17 @@ static __constant__ uint32_t d_alpha_f[32];
 static __constant__ uint32_t d_T512[64][16];
 static const uint32_t alpha_n[] = {
-	SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
+	0xff00f0f0, 0xccccaaaa, 0xf0f0cccc, 0xff00aaaa, 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, 0xf0f0ff00,
-	SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00),
+	0xf0f0cccc, 0xaaaaff00, 0xccccff00, 0xaaaaf0f0, 0xaaaaf0f0, 0xff00cccc, 0xccccf0f0, 0xff00aaaa,
-	SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc),
+	0xccccaaaa, 0xff00f0f0, 0xff00aaaa, 0xf0f0cccc, 0xf0f0ff00, 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc,
-	SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0),
+	0xaaaaff00, 0xf0f0cccc, 0xaaaaf0f0, 0xccccff00, 0xff00cccc, 0xaaaaf0f0, 0xff00aaaa, 0xccccf0f0
 	SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0),
 	SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0),
 	SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00),
 	SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc),
 	SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0),
 	SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0),
 	SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0)
 };
 static const uint32_t alpha_f[] = {
-	SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0),
+	0xcaf9639c, 0x0ff0f9c0, 0x639c0ff0, 0xcaf9f9c0, 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0, 0x639ccaf9,
-	SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9),
+	0x639c0ff0, 0xf9c0caf9, 0x0ff0caf9, 0xf9c0639c, 0xf9c0639c, 0xcaf90ff0, 0x0ff0639c, 0xcaf9f9c0,
-	SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0),
+	0x0ff0f9c0, 0xcaf9639c, 0xcaf9f9c0, 0x639c0ff0, 0x639ccaf9, 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0,
-	SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c),
+	0xf9c0caf9, 0x639c0ff0, 0xf9c0639c, 0x0ff0caf9, 0xcaf90ff0, 0xf9c0639c, 0xcaf9f9c0, 0x0ff0639c
 	SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c),
 	SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c),
 	SPH_C32(0xcaf9f9c0), SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9),
 	SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0),
 	SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c),
 	SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c),
 	SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c)
 };
 #define hamsi_s00   m0
@ -200,390 +186,134 @@ static const uint32_t alpha_f[] = {
 static const uint32_t T512[64][16] = {
-	{ SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000),
+{ 0xef0b0270, 0x3afd0000, 0x5dae0000, 0x69490000, 0x9b0f3c06, 0x4405b5f9, 0x66140a51, 0x924f5d0a, // 0
-	  SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9),
+  0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000, 0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68 },
-	  SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030),
+{ 0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000, 0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68,
-	  SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000),
+  0x26600240, 0xddd80000, 0x722a0000, 0x4f060000, 0x936667ff, 0x29f944ce, 0x368b63d5, 0x0c26f262 },
-	  SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984),
+{ 0x145a3c00, 0xb9e90000, 0x61270000, 0xf1610000, 0xce613d6c, 0xb0493d78, 0x47a96720, 0xe18e24c5,
-	  SPH_C32(0x9e69af68) },
+  0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000, 0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f },
-	{ SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000),
+{ 0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000, 0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f,
-	  SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137),
+  0x373d2800, 0x71500000, 0x95e00000, 0x0a140000, 0xbdac1909, 0x48ef9831, 0x456d6d1f, 0x3daac2da },
-	  SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240),
+{ 0x54285c00, 0xeaed0000, 0xc5d60000, 0xa1c50000, 0xb3a26770, 0x94a5c4e1, 0x6bb0419d, 0x551b3782,
-	  SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000),
+  0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000, 0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29 },
-	  SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5),
+{ 0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000, 0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29,
-	  SPH_C32(0x0c26f262) },
+  0xc8934400, 0x5a3e0000, 0x57870000, 0x4c560000, 0xea982435, 0x75b11115, 0x28b67247, 0x2dd1f9ab },
-	{ SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000),
+{ 0x29449c00, 0x64e70000, 0xf24b0000, 0xc2f30000, 0x0ede4e8f, 0x56c23745, 0xf3e04259, 0x8d0d9ec4,
-	  SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78),
+  0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000, 0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2 },
-	  SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400),
+{ 0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000, 0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2, // 7
-	  SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000),
+  0x6f299000, 0x6c850000, 0x2f160000, 0x782e0000, 0x644c37cd, 0x12dd1cd6, 0xd26a8c36, 0x32219526 },
-	  SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f),
+{ 0xf6800005, 0x3443c000, 0x24070000, 0x8f3d0000, 0x21373bfb, 0x0ab8d5ae, 0xcdc58b19, 0xd795ba31,
-	  SPH_C32(0xdc24e61f) },
+  0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000, 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88 },
-	{ SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000),
+{ 0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000, 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88,
-	  SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549),
+  0x50ff0004, 0x45744000, 0x3dfb0000, 0x19e60000, 0x1bbc5606, 0xe1727b5d, 0xe1a8cc96, 0x7b1bd6b9 },
-	  SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800),
+{ 0xf7750009, 0xcf3cc000, 0xc3d60000, 0x04920000, 0x029519a9, 0xf8e836ba, 0x7a87f14e, 0x9e16981a,
-	  SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000),
+  0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000, 0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320 },
-	  SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f),
+{ 0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000, 0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320,
-	  SPH_C32(0x3daac2da) },
+  0x231f0009, 0x42f40000, 0x66790000, 0x4ebb0000, 0xfedb5bd3, 0x315cb0d6, 0xe2b1674a, 0x69505b3a },
-	{ SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000),
+{ 0x774400f0, 0xf15a0000, 0xf5b20000, 0x34140000, 0x89377e8c, 0x5a8bec25, 0x0bc3cd1e, 0xcf3775cb,
-	  SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1),
+  0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000, 0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574 },
-	  SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800),
+{ 0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000, 0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574,
-	  SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000),
+  0x832800a0, 0x67420000, 0xe1170000, 0x370b0000, 0xcba30034, 0x3c34923c, 0x9767bdcc, 0x450360bf },
-	  SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da),
+{ 0xe8870170, 0x9d720000, 0x12db0000, 0xd4220000, 0xf2886b27, 0xa921e543, 0x4ef8b518, 0x618813b1, // 14
-	  SPH_C32(0x78cace29) },
+  0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000, 0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758 },
-	{ SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000),
+{ 0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000, 0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758,
-	  SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4),
+  0x5cb00110, 0x913e0000, 0x44190000, 0x888c0000, 0x66dc7418, 0x921f1d66, 0x55ceea25, 0x925c44e9 },
-	  SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400),
+{ 0x0c720000, 0x49e50f00, 0x42790000, 0x5cea0000, 0x33aa301a, 0x15822514, 0x95a34b7b, 0xb44b0090,
-	  SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000),
+  0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000, 0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f },
-	  SPH_C32(0xea982435), SPH_C32(0x75b11115), SPH_C32(0x28b67247),
+{ 0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000, 0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f,
-	  SPH_C32(0x2dd1f9ab) },
+  0xf2500000, 0xeebd0a00, 0x67a80000, 0xab8a0000, 0xba9b48c0, 0x0a56dd74, 0xdb73e86e, 0x1568ff0f },
-	{ SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000),
+{ 0x45180000, 0xa5b51700, 0xf96a0000, 0x3b480000, 0x1ecc142c, 0x231395d6, 0x16bca6b0, 0xdf33f4df,
-	  SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745),
+  0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000, 0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e },
-	  SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00),
+{ 0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000, 0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e,
-	  SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000),
+  0xfd250000, 0xb3c41100, 0xcef00000, 0xcef90000, 0x3c4d7580, 0x8d5b6493, 0x7098b0a6, 0x1af21fe1 },
-	  SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f),
+{ 0x75a40000, 0xc28b2700, 0x94a40000, 0x90f50000, 0xfb7857e0, 0x49ce0bae, 0x1767c483, 0xaedf667e,
-	  SPH_C32(0xbf2c0be2) },
+  0xd1660000, 0x1bbc0300, 0x9eec0000, 0xf6940000, 0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b },
-	{ SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000),
+{ 0xd1660000, 0x1bbc0300, 0x9eec0000, 0xf6940000, 0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b, // 21
-	  SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93),
+  0xa4c20000, 0xd9372400, 0x0a480000, 0x66610000, 0xf87a12c7, 0x86bef75c, 0xa324df94, 0x2ba05a55 },
-	  SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000),
+{ 0x75c90003, 0x0e10c000, 0xd1200000, 0xbaea0000, 0x8bc42f3e, 0x8758b757, 0xbb28761d, 0x00b72e2b,
-	  SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000),
+  0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000, 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254 },
-	  SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36),
+{ 0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000, 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254,
-	  SPH_C32(0x32219526) },
+  0x9b060002, 0x61468000, 0x221e0000, 0x1d740000, 0x36715d27, 0x30495c92, 0xf11336a7, 0xfe1cdc7f },
-	{ SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000),
+{ 0x86790000, 0x3f390002, 0xe19ae000, 0x98560000, 0x9565670e, 0x4e88c8ea, 0xd3dd4944, 0x161ddab9,
-	  SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae),
+  0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000, 0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834 },
-	  SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001),
+{ 0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000, 0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834,
-	  SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000),
+  0xb6ce0000, 0xdae90002, 0x156e8000, 0xda920000, 0xf6dd5a64, 0x36325c8a, 0xf272e8ae, 0xa6b8c28d },
-	  SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f),
+{ 0x14190000, 0x23ca003c, 0x50df0000, 0x44b60000, 0x1b6c67b0, 0x3cf3ac75, 0x61e610b0, 0xdbcadb80,
-	  SPH_C32(0xac8e6c88) },
+  0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000, 0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7 },
-	{ SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000),
+{ 0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000, 0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7,
-	  SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3),
+  0xf75a0000, 0x19840028, 0xa2190000, 0xeef80000, 0xc0722516, 0x19981260, 0x73dba1e6, 0xe1844257 },
-	  SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004),
+{ 0x54500000, 0x0671005c, 0x25ae0000, 0x6a1e0000, 0x2ea54edf, 0x664e8512, 0xbfba18c3, 0x7e715d17, // 28
-	  SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000),
+  0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000, 0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e },
-	  SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96),
+{ 0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000, 0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e,
-	  SPH_C32(0x7b1bd6b9) },
+  0xe8dd0000, 0xfa4a0044, 0x3c2d0000, 0xbb150000, 0x80bd361b, 0x24e81d44, 0xbfa8c2f4, 0x524a0d59 },
-	{ SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000),
+{ 0x69510000, 0xd4e1009c, 0xc3230000, 0xac2f0000, 0xe4950bae, 0xcea415dc, 0x87ec287c, 0xbce1a3ce,
-	  SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba),
+  0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000, 0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173 },
-	  SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000),
+{ 0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000, 0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173,
-	  SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000),
+  0xaf220000, 0x7b6c0090, 0x67e20000, 0x8da20000, 0xc7841e29, 0xb7b744f3, 0x9ac484f4, 0x8b6c72bd },
-	  SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604),
+{ 0xcc140000, 0xa5630000, 0x5ab90780, 0x3b500000, 0x4bd013ff, 0x879b3418, 0x694348c1, 0xca5a87fe,
-	  SPH_C32(0xf746c320) },
+  0x819e0000, 0xec570000, 0x66320280, 0x95f30000, 0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa },
-	{ SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000),
+{ 0x819e0000, 0xec570000, 0x66320280, 0x95f30000, 0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa,
-	  SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c),
+  0x4d8a0000, 0x49340000, 0x3c8b0500, 0xaea30000, 0x16793bfd, 0xcf6f08a4, 0x8f19eaec, 0x443d3004 },
-	  SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009),
+{ 0x78230000, 0x12fc0000, 0xa93a0b80, 0x90a50000, 0x713e2879, 0x7ee98924, 0xf08ca062, 0x636f8bab,
-	  SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000),
+  0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000, 0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b },
-	  SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a),
+{ 0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000, 0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b, // 35
-	  SPH_C32(0x69505b3a) },
+  0x7a8c0000, 0xa5d40000, 0x13260880, 0xc63d0000, 0xcbb36daa, 0xfea14f43, 0x59d0b4f8, 0x979961d0 },
-	{ SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000),
+{ 0xac480000, 0x1ba60000, 0x45fb1380, 0x03430000, 0x5a85316a, 0x1fb250b6, 0xfe72c7fe, 0x91e478f6,
-	  SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25),
+  0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000, 0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e },
-	  SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050),
+{ 0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000, 0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e,
-	  SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000),
+  0xb2060000, 0xc5690000, 0x28031200, 0x74670000, 0xb6c236f4, 0xeb1239f8, 0x33d1dfec, 0x094e3198 },
-	  SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2),
+{ 0xaec30000, 0x9c4f0001, 0x79d1e000, 0x2c150000, 0x45cc75b3, 0x6650b736, 0xab92f78f, 0xa312567b,
-	  SPH_C32(0x8a341574) },
+  0xdb250000, 0x09290000, 0x49aac000, 0x81e10000, 0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e },
-	{ SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000),
+{ 0xdb250000, 0x09290000, 0x49aac000, 0x81e10000, 0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e,
-	  SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19),
+  0x75e60000, 0x95660001, 0x307b2000, 0xadf40000, 0x8f321eea, 0x24298307, 0xe8c49cf9, 0x4b7eec55 },
-	  SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0),
+{ 0x58430000, 0x807e0000, 0x78330001, 0xc66b3800, 0xe7375cdc, 0x79ad3fdd, 0xac73fe6f, 0x3a4479b1,
-	  SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000),
+  0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800, 0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6 },
-	  SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc),
+{ 0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800, 0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6,
-	  SPH_C32(0x450360bf) },
+  0x45190000, 0xab0c0000, 0x30be0001, 0x690a2000, 0xc2fc7219, 0xb1d4800d, 0x2dd1fa46, 0x24314f17 },
-	{ SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000),
+{ 0xa53b0000, 0x14260000, 0x4e30001e, 0x7cae0000, 0x8f9e0dd5, 0x78dfaa3d, 0xf73168d8, 0x0b1b4946, // 42
-	  SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543),
+  0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000, 0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce },
-	  SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060),
+{ 0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000, 0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce,
-	  SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000),
+  0xa2d60000, 0xa6760000, 0xc9440014, 0xeba30000, 0xccec2e7b, 0x3018c499, 0x03490afa, 0x9b6ef888 },
-	  SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d),
+{ 0x88980000, 0x1f940000, 0x7fcf002e, 0xfb4e0000, 0xf158079a, 0x61ae9167, 0xa895706c, 0xe6107494,
-	  SPH_C32(0xf3d45758) },
+  0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000, 0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463 },
-	{ SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000),
+{ 0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000, 0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463,
-	  SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825),
+  0x835a0000, 0xc4f70000, 0x01470022, 0xeec80000, 0x60a54f69, 0x142f2a24, 0x5cf534f2, 0x3ea660f7 },
-	  SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110),
+{ 0x52500000, 0x29540000, 0x6a61004e, 0xf0ff0000, 0x9a317eec, 0x452341ce, 0xcf568fe5, 0x5303130f,
-	  SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000),
+  0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000, 0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691 },
-	  SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25),
+{ 0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000, 0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691,
-	  SPH_C32(0x925c44e9) },
+  0x01dd0000, 0x80a80000, 0xf4960048, 0xa6000000, 0x90d57ea2, 0xd7e68c37, 0x6612cffd, 0x2c94459e },
-	{ SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000),
+{ 0xe6280000, 0x4c4b0000, 0xa8550000, 0xd3d002e0, 0xd86130b8, 0x98a7b0da, 0x289506b4, 0xd75a4897,
-	  SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514),
+  0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0, 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f },
-	  SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000),
+{ 0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0, 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f, // 49
-	  SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000),
+  0x16ed0000, 0x15680000, 0xedd70000, 0x325d0220, 0xe30c3689, 0x5a4ae643, 0xe375f8a8, 0x81fdf908 },
-	  SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315),
+{ 0xb4310000, 0x77330000, 0xb15d0000, 0x7fd004e0, 0x78a26138, 0xd116c35d, 0xd256d489, 0x4e6f74de,
-	  SPH_C32(0xa123ff9f) },
+  0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060, 0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539 },
-	{ SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000),
+{ 0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060, 0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539,
-	  SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860),
+  0x57370000, 0xcaf20000, 0x364e0000, 0xc0220480, 0x56186b22, 0x5ca3f40c, 0xa1937f8f, 0x15b961e7 },
-	  SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000),
+{ 0x02f20000, 0xa2810000, 0x873f0000, 0xe36c7800, 0x1e1d74ef, 0x073d2bd6, 0xc4c23237, 0x7f32259e,
-	  SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000),
+  0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800, 0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0 },
-	  SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e),
+{ 0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800, 0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0,
-	  SPH_C32(0x1568ff0f) },
+  0xb82f0000, 0xb12c0000, 0x30d80000, 0x14445000, 0xc15860a2, 0x3127e8ec, 0x2e98bf23, 0x551e3d6e },
-	{ SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000),
+{ 0x1e6c0000, 0xc4420000, 0x8a2e0000, 0xbcb6b800, 0x2c4413b6, 0x8bfdd3da, 0x6a0c1bc8, 0xb99dc2eb,
-	  SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6),
+  0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000, 0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f },
-	  SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000),
+{ 0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000, 0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f,
-	  SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000),
+  0x8c3a0000, 0xda980000, 0x607f0000, 0x54078800, 0x85714513, 0x6006b243, 0xdb50399c, 0x8a58e6a4 },
-	  SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616),
+{ 0x033d0000, 0x08b30000, 0xf33a0000, 0x3ac20007, 0x51298a50, 0x6b6e661f, 0x0ea5cfe3, 0xe6da7ffe, // 56
-	  SPH_C32(0xc5c1eb3e) },
+  0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002, 0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000 },
-	{ SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000),
+{ 0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002, 0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000,
-	  SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145),
+  0xabe70000, 0x9e0d0000, 0xaf270000, 0x3d180005, 0x2c4f1fd3, 0x74f61695, 0xb5c347eb, 0x3c5dfffe },
-	  SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000),
+{ 0x01930000, 0xe7820000, 0xedfb0000, 0xcf0c000b, 0x8dd08d58, 0xbca3b42e, 0x063661e1, 0x536f9e7b,
-	  SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000),
+  0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003, 0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7 },
-	  SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6),
+{ 0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003, 0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7,
-	  SPH_C32(0x1af21fe1) },
+  0x93bb0000, 0x3b070000, 0xba010000, 0x99d00008, 0x3739ae4e, 0xe64c1722, 0x96f896b3, 0x2879ebac },
-	{ SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000),
+{ 0x5fa80000, 0x56030000, 0x43ae0000, 0x64f30013, 0x257e86bf, 0x1311944e, 0x541e95bf, 0x8ea4db69,
-	  SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae),
+  0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001, 0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e },
-	  SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000),
+{ 0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001, 0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e,
-	  SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000),
+  0x5fec0000, 0x294b0000, 0x99d20000, 0x4ed00012, 0x1ed34f73, 0xbaa708c9, 0x57140bdf, 0x30aebcf7 },
-	  SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17),
+{ 0xee930000, 0xd6070000, 0x92c10000, 0x2b9801e0, 0x9451287c, 0x3b6cfb57, 0x45312374, 0x201f6a64,
-	  SPH_C32(0x857f3c2b) },
+  0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0, 0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0 },
-	{ SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000),
+{ 0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0, 0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0,
-	  SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2),
+  0x95bb0000, 0x81450000, 0x3b240000, 0x48db0140, 0x0a8a6c53, 0x56f56eec, 0x62c91877, 0xe7e00a94 }
 	  SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000),
 	  SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000),
 	  SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94),
 	  SPH_C32(0x2ba05a55) },
 	{ SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000),
 	  SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757),
 	  SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001),
 	  SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000),
 	  SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba),
 	  SPH_C32(0xfeabf254) },
 	{ SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000),
 	  SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5),
 	  SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002),
 	  SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000),
 	  SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7),
 	  SPH_C32(0xfe1cdc7f) },
 	{ SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000),
 	  SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea),
 	  SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000),
 	  SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000),
 	  SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea),
 	  SPH_C32(0xb0a51834) },
 	{ SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000),
 	  SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460),
 	  SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000),
 	  SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000),
 	  SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae),
 	  SPH_C32(0xa6b8c28d) },
 	{ SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000),
 	  SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75),
 	  SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000),
 	  SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000),
 	  SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156),
 	  SPH_C32(0x3a4e99d7) },
 	{ SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000),
 	  SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15),
 	  SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000),
 	  SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000),
 	  SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6),
 	  SPH_C32(0xe1844257) },
 	{ SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000),
 	  SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512),
 	  SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000),
 	  SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000),
 	  SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37),
 	  SPH_C32(0x2c3b504e) },
 	{ SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000),
 	  SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856),
 	  SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000),
 	  SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000),
 	  SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4),
 	  SPH_C32(0x524a0d59) },
 	{ SPH_C32(0x69510000), SPH_C32(0xd4e1009c), SPH_C32(0xc3230000),
 	  SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc),
 	  SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000),
 	  SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000),
 	  SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88),
 	  SPH_C32(0x378dd173) },
 	{ SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000),
 	  SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f),
 	  SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000),
 	  SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000),
 	  SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4),
 	  SPH_C32(0x8b6c72bd) },
 	{ SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780),
 	  SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418),
 	  SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000),
 	  SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000),
 	  SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d),
 	  SPH_C32(0x8e67b7fa) },
 	{ SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280),
 	  SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc),
 	  SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000),
 	  SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000),
 	  SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec),
 	  SPH_C32(0x443d3004) },
 	{ SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80),
 	  SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924),
 	  SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000),
 	  SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000),
 	  SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a),
 	  SPH_C32(0xf4f6ea7b) },
 	{ SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300),
 	  SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667),
 	  SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000),
 	  SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000),
 	  SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8),
 	  SPH_C32(0x979961d0) },
 	{ SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380),
 	  SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6),
 	  SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000),
 	  SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000),
 	  SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812),
 	  SPH_C32(0x98aa496e) },
 	{ SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180),
 	  SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e),
 	  SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000),
 	  SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000),
 	  SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec),
 	  SPH_C32(0x094e3198) },
 	{ SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000),
 	  SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736),
 	  SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000),
 	  SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000),
 	  SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76),
 	  SPH_C32(0xe86cba2e) },
 	{ SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000),
 	  SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431),
 	  SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000),
 	  SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000),
 	  SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9),
 	  SPH_C32(0x4b7eec55) },
 	{ SPH_C32(0x58430000), SPH_C32(0x807e0000), SPH_C32(0x78330001),
 	  SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd),
 	  SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000),
 	  SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800),
 	  SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429),
 	  SPH_C32(0x1e7536a6) },
 	{ SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000),
 	  SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0),
 	  SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000),
 	  SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000),
 	  SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46),
 	  SPH_C32(0x24314f17) },
 	{ SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e),
 	  SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d),
 	  SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000),
 	  SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000),
 	  SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222),
 	  SPH_C32(0x9075b1ce) },
 	{ SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a),
 	  SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4),
 	  SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000),
 	  SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000),
 	  SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa),
 	  SPH_C32(0x9b6ef888) },
 	{ SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e),
 	  SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167),
 	  SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000),
 	  SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000),
 	  SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e),
 	  SPH_C32(0xd8b61463) },
 	{ SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c),
 	  SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43),
 	  SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000),
 	  SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000),
 	  SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2),
 	  SPH_C32(0x3ea660f7) },
 	{ SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e),
 	  SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce),
 	  SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000),
 	  SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000),
 	  SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018),
 	  SPH_C32(0x7f975691) },
 	{ SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006),
 	  SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9),
 	  SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000),
 	  SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000),
 	  SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd),
 	  SPH_C32(0x2c94459e) },
 	{ SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000),
 	  SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da),
 	  SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000),
 	  SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0),
 	  SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c),
 	  SPH_C32(0x56a7b19f) },
 	{ SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000),
 	  SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699),
 	  SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000),
 	  SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220),
 	  SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8),
 	  SPH_C32(0x81fdf908) },
 	{ SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000),
 	  SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d),
 	  SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000),
 	  SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060),
 	  SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06),
 	  SPH_C32(0x5bd61539) },
 	{ SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000),
 	  SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751),
 	  SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000),
 	  SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480),
 	  SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f),
 	  SPH_C32(0x15b961e7) },
 	{ SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000),
 	  SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6),
 	  SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000),
 	  SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800),
 	  SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14),
 	  SPH_C32(0x2a2c18f0) },
 	{ SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000),
 	  SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a),
 	  SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000),
 	  SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000),
 	  SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23),
 	  SPH_C32(0x551e3d6e) },
 	{ SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000),
 	  SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da),
 	  SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000),
 	  SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000),
 	  SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254),
 	  SPH_C32(0x33c5244f) },
 	{ SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000),
 	  SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199),
 	  SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000),
 	  SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800),
 	  SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c),
 	  SPH_C32(0x8a58e6a4) },
 	{ SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000),
 	  SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f),
 	  SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000),
 	  SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002),
 	  SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808),
 	  SPH_C32(0xda878000) },
 	{ SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000),
 	  SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a),
 	  SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000),
 	  SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005),
 	  SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb),
 	  SPH_C32(0x3c5dfffe) },
 	{ SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000),
 	  SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e),
 	  SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000),
 	  SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003),
 	  SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752),
 	  SPH_C32(0x7b1675d7) },
 	{ SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000),
 	  SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c),
 	  SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000),
 	  SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008),
 	  SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3),
 	  SPH_C32(0x2879ebac) },
 	{ SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000),
 	  SPH_C32(0x64f30013), SPH_C32(0x257e86bf), SPH_C32(0x1311944e),
 	  SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000),
 	  SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001),
 	  SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60),
 	  SPH_C32(0xbe0a679e) },
 	{ SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000),
 	  SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87),
 	  SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000),
 	  SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012),
 	  SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf),
 	  SPH_C32(0x30aebcf7) },
 	{ SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000),
 	  SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57),
 	  SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000),
 	  SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0),
 	  SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03),
 	  SPH_C32(0xc7ff60f0) },
 	{ SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000),
 	  SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb),
 	  SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000),
 	  SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140),
 	  SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877),
 	  SPH_C32(0xe7e00a94) }
 };
 __global__
@ -598,12 +328,12 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *
 		uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
 		unsigned char *h1 = (unsigned char *)Hash;
-		uint32_t c0 = SPH_C32(0x73746565), c1 = SPH_C32(0x6c706172), c2 = SPH_C32(0x6b204172), c3 = SPH_C32(0x656e6265);
+		uint32_t c0 = 0x73746565, c1 = 0x6c706172, c2 = 0x6b204172, c3 = 0x656e6265;
-		uint32_t c4 = SPH_C32(0x72672031), c5 = SPH_C32(0x302c2062), c6 = SPH_C32(0x75732032), c7 = SPH_C32(0x3434362c);
+		uint32_t c4 = 0x72672031, c5 = 0x302c2062, c6 = 0x75732032, c7 = 0x3434362c;
-		uint32_t c8 = SPH_C32(0x20422d33), c9 = SPH_C32(0x30303120), cA = SPH_C32(0x4c657576), cB = SPH_C32(0x656e2d48);
+		uint32_t c8 = 0x20422d33, c9 = 0x30303120, cA = 0x4c657576, cB = 0x656e2d48;
-		uint32_t cC = SPH_C32(0x65766572), cD = SPH_C32(0x6c65652c), cE = SPH_C32(0x2042656c), cF = SPH_C32(0x6769756d);
+		uint32_t cC = 0x65766572, cD = 0x6c65652c, cE = 0x2042656c, cF = 0x6769756d;
 		uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, mD, mE, mF;
 		uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
 		uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, mD, mE, mF;
 		uint32_t *tp, db, dm;
 		for(int i = 0; i < 64; i += 8) {
@ -637,16 +367,16 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *
 			T_BIG;
 		}
 		// precomputed for 64 bytes blocks ?
 		tp = &d_T512[0][0] + 112;
-
+		m0 = tp[ 0]; m1 = tp[ 1];
-		m0 = *(tp+ 0); m1 = *(tp+ 1);
+		m2 = tp[ 2]; m3 = tp[ 3];
-		m2 = *(tp+ 2); m3 = *(tp+ 3);
+		m4 = tp[ 4]; m5 = tp[ 5];
-		m4 = *(tp+ 4); m5 = *(tp+ 5);
+		m6 = tp[ 6]; m7 = tp[ 7];
-		m6 = *(tp+ 6); m7 = *(tp+ 7);
+		m8 = tp[ 8]; m9 = tp[ 9];
-		m8 = *(tp+ 8); m9 = *(tp+ 9);
+		mA = tp[10]; mB = tp[11];
-		mA = *(tp+10); mB = *(tp+11);
+		mC = tp[12]; mD = tp[13];
-		mC = *(tp+12); mD = *(tp+13);
+		mE = tp[14]; mF = tp[15];
 		mE = *(tp+14); mF = *(tp+15);
 		for( int r = 0; r < 6; r += 2 ) {
 			ROUND_BIG(r, d_alpha_n);
@ -655,15 +385,14 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *
 		T_BIG;
 		tp = &d_T512[0][0] + 784;
-
+		m0 = tp[ 0]; m1 = tp[ 1];
-		m0 = *(tp+ 0); m1 = *(tp+ 1);
+		m2 = tp[ 2]; m3 = tp[ 3];
-		m2 = *(tp+ 2); m3 = *(tp+ 3);
+		m4 = tp[ 4]; m5 = tp[ 5];
-		m4 = *(tp+ 4); m5 = *(tp+ 5);
+		m6 = tp[ 6]; m7 = tp[ 7];
-		m6 = *(tp+ 6); m7 = *(tp+ 7);
+		m8 = tp[ 8]; m9 = tp[ 9];
-		m8 = *(tp+ 8); m9 = *(tp+ 9);
+		mA = tp[10]; mB = tp[11];
-		mA = *(tp+10); mB = *(tp+11);
+		mC = tp[12]; mD = tp[13];
-		mC = *(tp+12); mD = *(tp+13);
+		mE = tp[14]; mF = tp[15];
 		mE = *(tp+14); mF = *(tp+15);
 		for( int r = 0; r < 12; r += 2 ) {
 			ROUND_BIG(r, d_alpha_f);
@ -696,3 +425,127 @@ void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce
 	x13_hamsi512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
 	//MyStreamSynchronize(NULL, order, thr_id);
 }
 __constant__ static uint64_t c_PaddedMessage80[10];
 __host__
 void x16_hamsi512_setBlock_80(void *pdata)
 {
 	cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
 }
 __global__
 void x16_hamsi512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		unsigned char h1[80];
 		#pragma unroll
 		for (int i = 0; i < 10; i++)
 			((uint2*)h1)[i] = ((uint2*)c_PaddedMessage80)[i];
 		//((uint64_t*)h1)[9] = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(startNonce + thread));
 		((uint32_t*)h1)[19] = cuda_swab32(startNonce + thread);
 		uint32_t c0 = 0x73746565, c1 = 0x6c706172, c2 = 0x6b204172, c3 = 0x656e6265;
 		uint32_t c4 = 0x72672031, c5 = 0x302c2062, c6 = 0x75732032, c7 = 0x3434362c;
 		uint32_t c8 = 0x20422d33, c9 = 0x30303120, cA = 0x4c657576, cB = 0x656e2d48;
 		uint32_t cC = 0x65766572, cD = 0x6c65652c, cE = 0x2042656c, cF = 0x6769756d;
 		uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
 		uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, mD, mE, mF;
 		uint32_t *tp, db, dm;
 		for(int i = 0; i < 80; i += 8)
 		{
 			m0 = 0; m1 = 0; m2 = 0; m3 = 0; m4 = 0; m5 = 0; m6 = 0; m7 = 0;
 			m8 = 0; m9 = 0; mA = 0; mB = 0; mC = 0; mD = 0; mE = 0; mF = 0;
 			tp = &d_T512[0][0];
 			#pragma unroll
 			for (int u = 0; u < 8; u++) {
 				db = h1[i + u];
 				#pragma unroll 2
 				for (int v = 0; v < 8; v++, db >>= 1) {
 					dm = -(uint32_t)(db & 1);
 					m0 ^= dm & tp[ 0]; m1 ^= dm & tp[ 1];
 					m2 ^= dm & tp[ 2]; m3 ^= dm & tp[ 3];
 					m4 ^= dm & tp[ 4]; m5 ^= dm & tp[ 5];
 					m6 ^= dm & tp[ 6]; m7 ^= dm & tp[ 7];
 					m8 ^= dm & tp[ 8]; m9 ^= dm & tp[ 9];
 					mA ^= dm & tp[10]; mB ^= dm & tp[11];
 					mC ^= dm & tp[12]; mD ^= dm & tp[13];
 					mE ^= dm & tp[14]; mF ^= dm & tp[15];
 					tp += 16;
 				}
 			}
 			#pragma unroll
 			for (int r = 0; r < 6; r++) {
 				ROUND_BIG(r, d_alpha_n);
 			}
 			T_BIG;
 		}
 		#define INPUT_BIG { \
 			m0 = 0; m1 = 0; m2 = 0; m3 = 0; m4 = 0; m5 = 0; m6 = 0; m7 = 0; \
 			m8 = 0; m9 = 0; mA = 0; mB = 0; mC = 0; mD = 0; mE = 0; mF = 0; \
 			tp = &d_T512[0][0]; \
 			for (int u = 0; u < 8; u++) { \
 				db = endtag[u]; \
 				for (int v = 0; v < 8; v++, db >>= 1) { \
 					dm = -(uint32_t)(db & 1); \
 					m0 ^= dm & tp[ 0]; m1 ^= dm & tp[ 1]; \
 					m2 ^= dm & tp[ 2]; m3 ^= dm & tp[ 3]; \
 					m4 ^= dm & tp[ 4]; m5 ^= dm & tp[ 5]; \
 					m6 ^= dm & tp[ 6]; m7 ^= dm & tp[ 7]; \
 					m8 ^= dm & tp[ 8]; m9 ^= dm & tp[ 9]; \
 					mA ^= dm & tp[10]; mB ^= dm & tp[11]; \
 					mC ^= dm & tp[12]; mD ^= dm & tp[13]; \
 					mE ^= dm & tp[14]; mF ^= dm & tp[15]; \
 					tp += 16; \
 				} \
 			} \
 		}
 		// close
 		uint8_t endtag[8] = { 0x80, 0x00, 0x00, 0x00,  0x00, 0x00, 0x00, 0x00 };
 		INPUT_BIG;
 		#pragma unroll
 		for (int r = 0; r < 6; r++) {
 			ROUND_BIG(r, d_alpha_n);
 		}
 		T_BIG;
 		endtag[0] = endtag[1] = 0x00;
 		endtag[6] = 0x02;
 		endtag[7] = 0x80;
 		INPUT_BIG;
 		// PF_BIG
 		#pragma unroll
 		for(int r = 0; r < 12; r++) {
 			ROUND_BIG(r, d_alpha_f);
 		}
 		T_BIG;
 		uint64_t hashPosition = thread;
 		uint32_t *Hash = (uint32_t*)&g_hash[hashPosition << 3];
 		#pragma unroll 16
 		for(int i = 0; i < 16; i++)
 			Hash[i] = cuda_swab32(h[i]);
 		#undef INPUT_BIG
 	}
 }
 __host__
 void x16_hamsi512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash)
 {
 	const uint32_t threadsperblock = 128;
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 	x16_hamsi512_gpu_hash_80 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash);
 }
--- a/x15/cuda_x15_whirlpool_sm3.cu
+++ b/x15/cuda_x15_whirlpool_sm3.cu
@ -1998,7 +1998,7 @@ const int i0, const int i1, const int i2, const int i3, const int i4, const int
 __global__
-void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash, int swab)
+void oldwhirlpool_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, void *outputHash, int swab)
 {
 	__shared__ uint64_t sharedMemory[2048];
@ -2014,7 +2014,8 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
 			sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x];
 		#endif
 	}
-	__threadfence_block(); // ensure shared mem is ready
+	//__threadfence_block(); // ensure shared mem is ready
 	__syncthreads();
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@ -2028,7 +2029,8 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
 		uint64_t state[8];
 		#pragma unroll 8
 		for (int i=0; i < 8; i++) {
-			state[i] = c_PaddedMessage80[i];
+			//state[i] = c_PaddedMessage80[i];
 			AS_UINT2(&state[i]) = AS_UINT2(&c_PaddedMessage80[i]);
 		}
 #else
 		#pragma unroll 8
@ -2050,6 +2052,7 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
 			state[i] = xor1(n[i],c_PaddedMessage80[i]);
 		}
 #endif
 		/// round 2 ///////
 		//////////////////////////////////
 		n[0] = c_PaddedMessage80[8];    //read data
@ -2331,7 +2334,7 @@ extern uint32_t whirlpool512_finalhash_64(int thr_id, uint32_t threads, uint32_t
 }
 __host__
-void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash)
+void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash)
 {
 	dim3 grid((threads + threadsperblock-1) / threadsperblock);
 	dim3 block(threadsperblock);
@ -2339,7 +2342,7 @@ void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNounce
 	if (threads < 256)
 		applog(LOG_WARNING, "whirlpool requires a minimum of 256 threads to fetch constant tables!");
-	oldwhirlpool_gpu_hash_80<<<grid, block>>>(threads, startNounce, d_outputHash, 1);
+	oldwhirlpool_gpu_hash_80<<<grid, block>>>(threads, startNonce, d_outputHash, 1);
 }
 extern void whirl_midstate(void *state, const void *input);
@ -2363,3 +2366,54 @@ void whirlpool512_setBlock_80_sm3(void *pdata, const void *ptarget)
 	cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 128, 0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(pTarget, ptarget, 32, 0, cudaMemcpyHostToDevice);
 }
 // ------------------------------------------------------------------------------------------------
 __host__
 void x16_whirlpool512_init(int thr_id, uint32_t threads)
 {
 	cudaMemcpyToSymbol(InitVector_RC, plain_RC, sizeof(plain_RC), 0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(mixTob0Tox, plain_T0, sizeof(plain_T0), 0, cudaMemcpyHostToDevice);
 #if USE_ALL_TABLES
 	cudaMemcpyToSymbol(mixTob1Tox, plain_T1, (256 * 8), 0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(mixTob2Tox, plain_T2, (256 * 8), 0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(mixTob3Tox, plain_T3, (256 * 8), 0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(mixTob4Tox, plain_T4, (256 * 8), 0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(mixTob5Tox, plain_T5, (256 * 8), 0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(mixTob6Tox, plain_T6, (256 * 8), 0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(mixTob7Tox, plain_T7, (256 * 8), 0, cudaMemcpyHostToDevice);
 #endif
 }
 extern void whirlpool_midstate(void *state, const void *input);
 __host__
 void x16_whirlpool512_setBlock_80(void *pdata)
 {
 	unsigned char PaddedMessage[128];
 	memcpy(PaddedMessage, pdata, 80);
 	memset(PaddedMessage + 80, 0, 48);
 	PaddedMessage[80] = 0x80; /* ending */
 #if HOST_MIDSTATE
 	// compute constant first block
 	unsigned char midstate[64] = { 0 };
 	whirlpool_midstate(midstate, pdata);
 	memcpy(PaddedMessage, midstate, 64);
 #endif
 	cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 128, 0, cudaMemcpyHostToDevice);
 }
 __host__
 void x16_whirlpool512_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_outputHash)
 {
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 	if (threads < 256)
 		applog(LOG_WARNING, "whirlpool requires a minimum of 256 threads to fetch constant tables!");
 	oldwhirlpool_gpu_hash_80 <<<grid, block>>> (threads, startNonce, d_outputHash, 1);
 }
--- a/x16r/cuda_x16_echo512.cu
+++ b/x16r/cuda_x16_echo512.cu
@ -0,0 +1,214 @@
 /**
 * echo512-80 cuda kernel for X16R algorithm
 *
 * tpruvot 2018 - GPL code
 */
 #include <stdio.h>
 #include <memory.h>
 #include "cuda_helper.h"
 extern __device__ __device_builtin__ void __threadfence_block(void);
 #include "../x11/cuda_x11_aes.cuh"
 __device__ __forceinline__ void AES_2ROUND(const uint32_t* __restrict__ sharedMemory,
 	uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3,
 	uint32_t &k0)
 {
 	uint32_t y0, y1, y2, y3;
 	aes_round(sharedMemory,
 		x0, x1, x2, x3,
 		k0,
 		y0, y1, y2, y3);
 	aes_round(sharedMemory,
 		y0, y1, y2, y3,
 		x0, x1, x2, x3);
 	k0++;
 }
 __device__
 static void echo_round(uint32_t* const sharedMemory, uint32_t *W, uint32_t &k0)
 {
 	// Big Sub Words
 	#pragma unroll 16
 	for (int idx = 0; idx < 16; idx++) {
 		AES_2ROUND(sharedMemory, W[(idx << 2) + 0], W[(idx << 2) + 1], W[(idx << 2) + 2], W[(idx << 2) + 3], k0);
 	}
 	// Shift Rows
 	#pragma unroll 4
 	for (int i = 0; i < 4; i++)
 	{
 		uint32_t t[4];
 		/// 1, 5, 9, 13
 		t[0] = W[i +  4];
 		t[1] = W[i +  8];
 		t[2] = W[i + 24];
 		t[3] = W[i + 60];
 		W[i +  4] = W[i + 20];
 		W[i +  8] = W[i + 40];
 		W[i + 24] = W[i + 56];
 		W[i + 60] = W[i + 44];
 		W[i + 20] = W[i + 36];
 		W[i + 40] = t[1];
 		W[i + 56] = t[2];
 		W[i + 44] = W[i + 28];
 		W[i + 28] = W[i + 12];
 		W[i + 12] = t[3];
 		W[i + 36] = W[i + 52];
 		W[i + 52] = t[0];
 	}
 	// Mix Columns
 	#pragma unroll 4
 	for (int i = 0; i < 4; i++)
 	{
 		#pragma unroll 4
 		for (int idx = 0; idx < 64; idx += 16)
 		{
 			uint32_t a[4];
 			a[0] = W[idx + i];
 			a[1] = W[idx + i + 4];
 			a[2] = W[idx + i + 8];
 			a[3] = W[idx + i + 12];
 			uint32_t ab = a[0] ^ a[1];
 			uint32_t bc = a[1] ^ a[2];
 			uint32_t cd = a[2] ^ a[3];
 			uint32_t t, t2, t3;
 			t  = (ab & 0x80808080);
 			t2 = (bc & 0x80808080);
 			t3 = (cd & 0x80808080);
 			uint32_t abx = (t  >> 7) * 27U ^ ((ab^t)  << 1);
 			uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
 			uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
 			W[idx + i] = bc ^ a[3] ^ abx;
 			W[idx + i +  4] = a[0] ^ cd ^ bcx;
 			W[idx + i +  8] = ab ^ a[3] ^ cdx;
 			W[idx + i + 12] = ab ^ a[2] ^ (abx ^ bcx ^ cdx);
 		}
 	}
 }
 __device__ __forceinline__
 void cuda_echo_round_80(uint32_t *const __restrict__ sharedMemory, uint32_t *const __restrict__ data, const uint32_t nonce, uint32_t *hash)
 {
 	uint32_t h[29]; // <= 127 bytes input
 	#pragma unroll 8
 	for (int i = 0; i < 18; i += 2)
 		AS_UINT2(&h[i]) = AS_UINT2(&data[i]);
 	h[18] = data[18];
 	h[19] = cuda_swab32(nonce);
 	h[20] = 0x80;
 	h[21] = h[22] = h[23] = h[24] = h[25] = h[26] = 0;
 	//((uint8_t*)h)[80] = 0x80;
 	//((uint8_t*)h)[128-17] = 0x02;
 	//((uint8_t*)h)[128-16] = 0x80;
 	//((uint8_t*)h)[128-15] = 0x02;
 	h[27] = 0x2000000;
 	h[28] = 0x280;
 	//h[29] = h[30] = h[31] = 0;
 	uint32_t k0 = 640; // bitlen
 	uint32_t W[64];
 	#pragma unroll 8
 	for (int i = 0; i < 32; i+=4) {
 		W[i] = 512; // L
 		W[i+1] = 0; // H
 		W[i+2] = 0; // X
 		W[i+3] = 0;
 	}
 	uint32_t Z[16];
 	#pragma unroll
 	for (int i = 0;  i<16; i++) Z[i] = W[i];
 	#pragma unroll
 	for (int i = 32; i<61; i++) W[i] = h[i - 32];
 	#pragma unroll
 	for (int i = 61; i<64; i++) W[i] = 0;
 	for (int i = 0; i < 10; i++)
 		echo_round(sharedMemory, W, k0);
 	#pragma unroll 16
 	for (int i = 0; i < 16; i++) {
 		Z[i] ^= h[i] ^ W[i] ^ W[i + 32];
 	}
 	#pragma unroll 8
 	for (int i = 0; i < 16; i += 2)
 		AS_UINT2(&hash[i]) = AS_UINT2(&Z[i]);
 }
 __device__ __forceinline__
 void echo_gpu_init(uint32_t *const __restrict__ sharedMemory)
 {
 	/* each thread startup will fill a uint32 */
 	if (threadIdx.x < 128) {
 		sharedMemory[threadIdx.x] = d_AES0[threadIdx.x];
 		sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x];
 		sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x];
 		sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x];
 		sharedMemory[threadIdx.x + 64 * 2] = d_AES0[threadIdx.x + 64 * 2];
 		sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2];
 		sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2];
 		sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2];
 	}
 }
 __host__
 void x16_echo512_cuda_init(int thr_id, const uint32_t threads)
 {
 	aes_cpu_init(thr_id);
 }
 __constant__ static uint32_t c_PaddedMessage80[20];
 __host__
 void x16_echo512_setBlock_80(void *endiandata)
 {
 	cudaMemcpyToSymbol(c_PaddedMessage80, endiandata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
 }
 __global__ __launch_bounds__(128, 7) /* will force 72 registers */
 void x16_echo512_gpu_hash_80(uint32_t threads, uint32_t startNonce, uint64_t *g_hash)
 {
 	__shared__ uint32_t sharedMemory[1024];
 	echo_gpu_init(sharedMemory);
 	__threadfence_block();
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint64_t hashPosition = thread;
 		uint32_t *pHash = (uint32_t*)&g_hash[hashPosition<<3];
 		cuda_echo_round_80(sharedMemory, c_PaddedMessage80, startNonce + thread, pHash);
 	}
 }
 __host__
 void x16_echo512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash)
 {
 	const uint32_t threadsperblock = 128;
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 	x16_echo512_gpu_hash_80<<<grid, block>>>(threads, startNonce, (uint64_t*)d_hash);
 }
--- a/x16r/cuda_x16_fugue512.cu
+++ b/x16r/cuda_x16_fugue512.cu
@ -0,0 +1,467 @@
 #include <stdio.h>
 #include <cuda_helper.h>
 #define TPB 256
 /*
 * fugue512-80 x16r kernel implementation.
 *
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2018 tpruvot
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * ===========================(LICENSE END)=============================
 */
 #ifdef __INTELLISENSE__
 #define __byte_perm(x, y, m) (x|y)
 #define tex1Dfetch(t, n) (n)
 #define __CUDACC__
 #include <cuda_texture_types.h>
 #endif
 // store allocated textures device addresses
 static unsigned int* d_textures[MAX_GPUS][1];
 #define mixtab0(x) mixtabs[(x)]
 #define mixtab1(x) mixtabs[(x)+256]
 #define mixtab2(x) mixtabs[(x)+512]
 #define mixtab3(x) mixtabs[(x)+768]
 static texture<unsigned int, 1, cudaReadModeElementType> mixTab0Tex;
 static const uint32_t mixtab0[] = {
 	0x63633297, 0x7c7c6feb, 0x77775ec7, 0x7b7b7af7, 0xf2f2e8e5, 0x6b6b0ab7, 0x6f6f16a7, 0xc5c56d39,
 	0x303090c0, 0x01010704, 0x67672e87, 0x2b2bd1ac, 0xfefeccd5, 0xd7d71371, 0xabab7c9a, 0x767659c3,
 	0xcaca4005, 0x8282a33e, 0xc9c94909, 0x7d7d68ef, 0xfafad0c5, 0x5959947f, 0x4747ce07, 0xf0f0e6ed,
 	0xadad6e82, 0xd4d41a7d, 0xa2a243be, 0xafaf608a, 0x9c9cf946, 0xa4a451a6, 0x727245d3, 0xc0c0762d,
 	0xb7b728ea, 0xfdfdc5d9, 0x9393d47a, 0x2626f298, 0x363682d8, 0x3f3fbdfc, 0xf7f7f3f1, 0xcccc521d,
 	0x34348cd0, 0xa5a556a2, 0xe5e58db9, 0xf1f1e1e9, 0x71714cdf, 0xd8d83e4d, 0x313197c4, 0x15156b54,
 	0x04041c10, 0xc7c76331, 0x2323e98c, 0xc3c37f21, 0x18184860, 0x9696cf6e, 0x05051b14, 0x9a9aeb5e,
 	0x0707151c, 0x12127e48, 0x8080ad36, 0xe2e298a5, 0xebeba781, 0x2727f59c, 0xb2b233fe, 0x757550cf,
 	0x09093f24, 0x8383a43a, 0x2c2cc4b0, 0x1a1a4668, 0x1b1b416c, 0x6e6e11a3, 0x5a5a9d73, 0xa0a04db6,
 	0x5252a553, 0x3b3ba1ec, 0xd6d61475, 0xb3b334fa, 0x2929dfa4, 0xe3e39fa1, 0x2f2fcdbc, 0x8484b126,
 	0x5353a257, 0xd1d10169, 0x00000000, 0xededb599, 0x2020e080, 0xfcfcc2dd, 0xb1b13af2, 0x5b5b9a77,
 	0x6a6a0db3, 0xcbcb4701, 0xbebe17ce, 0x3939afe4, 0x4a4aed33, 0x4c4cff2b, 0x5858937b, 0xcfcf5b11,
 	0xd0d0066d, 0xefefbb91, 0xaaaa7b9e, 0xfbfbd7c1, 0x4343d217, 0x4d4df82f, 0x333399cc, 0x8585b622,
 	0x4545c00f, 0xf9f9d9c9, 0x02020e08, 0x7f7f66e7, 0x5050ab5b, 0x3c3cb4f0, 0x9f9ff04a, 0xa8a87596,
 	0x5151ac5f, 0xa3a344ba, 0x4040db1b, 0x8f8f800a, 0x9292d37e, 0x9d9dfe42, 0x3838a8e0, 0xf5f5fdf9,
 	0xbcbc19c6, 0xb6b62fee, 0xdada3045, 0x2121e784, 0x10107040, 0xffffcbd1, 0xf3f3efe1, 0xd2d20865,
 	0xcdcd5519, 0x0c0c2430, 0x1313794c, 0xececb29d, 0x5f5f8667, 0x9797c86a, 0x4444c70b, 0x1717655c,
 	0xc4c46a3d, 0xa7a758aa, 0x7e7e61e3, 0x3d3db3f4, 0x6464278b, 0x5d5d886f, 0x19194f64, 0x737342d7,
 	0x60603b9b, 0x8181aa32, 0x4f4ff627, 0xdcdc225d, 0x2222ee88, 0x2a2ad6a8, 0x9090dd76, 0x88889516,
 	0x4646c903, 0xeeeebc95, 0xb8b805d6, 0x14146c50, 0xdede2c55, 0x5e5e8163, 0x0b0b312c, 0xdbdb3741,
 	0xe0e096ad, 0x32329ec8, 0x3a3aa6e8, 0x0a0a3628, 0x4949e43f, 0x06061218, 0x2424fc90, 0x5c5c8f6b,
 	0xc2c27825, 0xd3d30f61, 0xacac6986, 0x62623593, 0x9191da72, 0x9595c662, 0xe4e48abd, 0x797974ff,
 	0xe7e783b1, 0xc8c84e0d, 0x373785dc, 0x6d6d18af, 0x8d8d8e02, 0xd5d51d79, 0x4e4ef123, 0xa9a97292,
 	0x6c6c1fab, 0x5656b943, 0xf4f4fafd, 0xeaeaa085, 0x6565208f, 0x7a7a7df3, 0xaeae678e, 0x08083820,
 	0xbaba0bde, 0x787873fb, 0x2525fb94, 0x2e2ecab8, 0x1c1c5470, 0xa6a65fae, 0xb4b421e6, 0xc6c66435,
 	0xe8e8ae8d, 0xdddd2559, 0x747457cb, 0x1f1f5d7c, 0x4b4bea37, 0xbdbd1ec2, 0x8b8b9c1a, 0x8a8a9b1e,
 	0x70704bdb, 0x3e3ebaf8, 0xb5b526e2, 0x66662983, 0x4848e33b, 0x0303090c, 0xf6f6f4f5, 0x0e0e2a38,
 	0x61613c9f, 0x35358bd4, 0x5757be47, 0xb9b902d2, 0x8686bf2e, 0xc1c17129, 0x1d1d5374, 0x9e9ef74e,
 	0xe1e191a9, 0xf8f8decd, 0x9898e556, 0x11117744, 0x696904bf, 0xd9d93949, 0x8e8e870e, 0x9494c166,
 	0x9b9bec5a, 0x1e1e5a78, 0x8787b82a, 0xe9e9a989, 0xcece5c15, 0x5555b04f, 0x2828d8a0, 0xdfdf2b51,
 	0x8c8c8906, 0xa1a14ab2, 0x89899212, 0x0d0d2334, 0xbfbf10ca, 0xe6e684b5, 0x4242d513, 0x686803bb,
 	0x4141dc1f, 0x9999e252, 0x2d2dc3b4, 0x0f0f2d3c, 0xb0b03df6, 0x5454b74b, 0xbbbb0cda, 0x16166258
 };
 #define TIX4(q, x00, x01, x04, x07, x08, x22, x24, x27, x30) { \
 	x22 ^= x00; \
 	x00 = (q); \
 	x08 ^= x00; \
 	x01 ^= x24; \
 	x04 ^= x27; \
 	x07 ^= x30; \
 }
 #define CMIX36(x00, x01, x02, x04, x05, x06, x18, x19, x20) { \
 	x00 ^= x04; \
 	x01 ^= x05; \
 	x02 ^= x06; \
 	x18 ^= x04; \
 	x19 ^= x05; \
 	x20 ^= x06; \
 }
 #define SMIX(x0, x1, x2, x3) { \
 	uint32_t tmp; \
 	uint32_t r0 = 0; \
 	uint32_t r1 = 0; \
 	uint32_t r2 = 0; \
 	uint32_t r3 = 0; \
 	uint32_t c0 = mixtab0(x0 >> 24); \
 	tmp = mixtab1((x0 >> 16) & 0xFF); \
 	c0 ^= tmp; \
 	r1 ^= tmp; \
 	tmp = mixtab2((x0 >>  8) & 0xFF); \
 	c0 ^= tmp; \
 	r2 ^= tmp; \
 	tmp = mixtab3(x0 & 0xFF); \
 	c0 ^= tmp; \
 	r3 ^= tmp; \
 	tmp = mixtab0(x1 >> 24); \
 	uint32_t c1 = tmp; \
 	r0 ^= tmp; \
 	tmp = mixtab1((x1 >> 16) & 0xFF); \
 	c1 ^= tmp; \
 	tmp = mixtab2((x1 >>  8) & 0xFF); \
 	c1 ^= tmp; \
 	r2 ^= tmp; \
 	tmp = mixtab3(x1 & 0xFF); \
 	c1 ^= tmp; \
 	r3 ^= tmp; \
 	tmp = mixtab0(x2 >> 24); \
 	uint32_t c2 = tmp; \
 	r0 ^= tmp; \
 	tmp = mixtab1((x2 >> 16) & 0xFF); \
 	c2 ^= tmp; \
 	r1 ^= tmp; \
 	tmp = mixtab2((x2 >>  8) & 0xFF); \
 	c2 ^= tmp; \
 	tmp = mixtab3(x2 & 0xFF); \
 	c2 ^= tmp; \
 	r3 ^= tmp; \
 	tmp = mixtab0(x3 >> 24); \
 	uint32_t c3 = tmp; \
 	r0 ^= tmp; \
 	tmp = mixtab1((x3 >> 16) & 0xFF); \
 	c3 ^= tmp; \
 	r1 ^= tmp; \
 	tmp = mixtab2((x3 >>  8) & 0xFF); \
 	c3 ^= tmp; \
 	r2 ^= tmp; \
 	tmp = mixtab3(x3 & 0xFF); \
 	c3 ^= tmp; \
 	x0 = ((c0 ^ r0) & 0xFF000000) | ((c1 ^ r1) & 0x00FF0000) \
 		| ((c2 ^ r2) & 0x0000FF00) | ((c3 ^ r3) & 0x000000FF); \
 	x1 = ((c1 ^ (r0 <<  8)) & 0xFF000000) | ((c2 ^ (r1 <<  8)) & 0x00FF0000) \
 		| ((c3 ^ (r2 <<  8)) & 0x0000FF00) | ((c0 ^ (r3 >> 24)) & 0x000000FF); \
 	x2 = ((c2 ^ (r0 << 16)) & 0xFF000000) | ((c3 ^ (r1 << 16)) & 0x00FF0000) \
 		| ((c0 ^ (r2 >> 16)) & 0x0000FF00) | ((c1 ^ (r3 >> 16)) & 0x000000FF); \
 	x3 = ((c3 ^ (r0 << 24)) & 0xFF000000) | ((c0 ^ (r1 >>  8)) & 0x00FF0000) \
 		| ((c1 ^ (r2 >>  8)) & 0x0000FF00) | ((c2 ^ (r3 >>  8)) & 0x000000FF); \
 }
 #define SUB_ROR3 { \
 	B33 = S33, B34 = S34, B35 = S35; \
 	S35 = S32; S34 = S31; S33 = S30; S32 = S29; S31 = S28; S30 = S27; S29 = S26; S28 = S25; S27 = S24; \
 	S26 = S23; S25 = S22; S24 = S21; S23 = S20; S22 = S19; S21 = S18; S20 = S17; S19 = S16; S18 = S15; \
 	S17 = S14; S16 = S13; S15 = S12; S14 = S11; S13 = S10; S12 = S09; S11 = S08; S10 = S07; S09 = S06; \
 	S08 = S05; S07 = S04; S06 = S03; S05 = S02; S04 = S01; S03 = S00; S02 = B35; S01 = B34; S00 = B33; \
 }
 #define SUB_ROR8 { \
 	B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \
 	S35 = S27; S34 = S26; S33 = S25; S32 = S24; S31 = S23; S30 = S22; S29 = S21; S28 = S20; S27 = S19; \
 	S26 = S18; S25 = S17; S24 = S16; S23 = S15; S22 = S14; S21 = S13; S20 = S12; S19 = S11; S18 = S10; \
 	S17 = S09; S16 = S08; S15 = S07; S14 = S06; S13 = S05; S12 = S04; S11 = S03; S10 = S02; S09 = S01; \
 	S08 = S00; S07 = B35; S06 = B34; S05 = B33; S04 = B32; S03 = B31; S02 = B30; S01 = B29; S00 = B28; \
 }
 #define SUB_ROR9 { \
 	B27 = S27, B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \
 	S35 = S26; S34 = S25; S33 = S24; S32 = S23; S31 = S22; S30 = S21; S29 = S20; S28 = S19; S27 = S18; \
 	S26 = S17; S25 = S16; S24 = S15; S23 = S14; S22 = S13; S21 = S12; S20 = S11; S19 = S10; S18 = S09; \
 	S17 = S08; S16 = S07; S15 = S06; S14 = S05; S13 = S04; S12 = S03; S11 = S02; S10 = S01; S09 = S00; \
 	S08 = B35; S07 = B34; S06 = B33; S05 = B32; S04 = B31; S03 = B30; S02 = B29; S01 = B28; S00 = B27; \
 }
 #define SUB_ROR9_3 { \
 	SUB_ROR3; SUB_ROR3; SUB_ROR3; \
 }
 #define SUB_ROR12 { /* to fix */ \
 	B24 = S00; B25 = S01; B26 = S02; B27 = S03; B28 = S04; B29 = S05; B30 = S06; B31 = S07; B32 = S08; B33 = S09; B34 = S10; B35 = S11; \
 	S00 = S12; S01 = S13; S02 = S14; S03 = S15; S04 = S16; S05 = S17; S06 = S18; S07 = S19; S08 = S20; S09 = S21; S10 = S22; S11 = S23; \
 	S12 = S24; S13 = S25; S14 = S26; S15 = S27; S16 = S28; S17 = S29; S18 = S30; S19 = S31; S20 = S32; S21 = S33; S22 = S34; S23 = S35; \
 	S24 = B24; S25 = B25; S26 = B26; S27 = B27; S28 = B28; S29 = B29; S30 = B30; S31 = B31; S32 = B32; S33 = B33; S34 = B34; S35 = B35; \
 }
 #define FUGUE512_3(x, y, z) { \
 	TIX4(x, S00, S01, S04, S07, S08, S22, S24, S27, S30); \
 	CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \
 	SMIX(S33, S34, S35, S00); \
 	CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \
 	SMIX(S30, S31, S32, S33); \
 	CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \
 	SMIX(S27, S28, S29, S30); \
 	CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \
 	SMIX(S24, S25, S26, S27); \
 	\
 	TIX4(y, S24, S25, S28, S31, S32, S10, S12, S15, S18); \
 	CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); \
 	SMIX(S21, S22, S23, S24); \
 	CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); \
 	SMIX(S18, S19, S20, S21); \
 	CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); \
 	SMIX(S15, S16, S17, S18); \
 	CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); \
 	SMIX(S12, S13, S14, S15); \
 	\
 	TIX4(z, S12, S13, S16, S19, S20, S34, S00, S03, S06); \
 	CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); \
 	SMIX(S09, S10, S11, S12); \
 	CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); \
 	SMIX(S06, S07, S08, S09); \
 	CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); \
 	SMIX(S03, S04, S05, S06); \
 	CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); \
 	SMIX(S00, S01, S02, S03); \
 }
 #define FUGUE512_F(w, x, y, z) { \
 	TIX4(w, S00, S01, S04, S07, S08, S22, S24, S27, S30); \
 	CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \
 	SMIX(S33, S34, S35, S00); \
 	CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \
 	SMIX(S30, S31, S32, S33); \
 	CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \
 	SMIX(S27, S28, S29, S30); \
 	CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \
 	SMIX(S24, S25, S26, S27); \
 	\
 	TIX4(x, S24, S25, S28, S31, S32, S10, S12, S15, S18); \
 	CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); \
 	SMIX(S21, S22, S23, S24); \
 	CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); \
 	SMIX(S18, S19, S20, S21); \
 	CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); \
 	SMIX(S15, S16, S17, S18); \
 	CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); \
 	SMIX(S12, S13, S14, S15); \
 	\
 	TIX4(y, S12, S13, S16, S19, S20, S34, S00, S03, S06); \
 	CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); \
 	SMIX(S09, S10, S11, S12); \
 	CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); \
 	SMIX(S06, S07, S08, S09); \
 	CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); \
 	SMIX(S03, S04, S05, S06); \
 	CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); \
 	SMIX(S00, S01, S02, S03); \
 	\
 	TIX4(z, S00, S01, S04, S07, S08, S22, S24, S27, S30); \
 	CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \
 	SMIX(S33, S34, S35, S00); \
 	CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \
 	SMIX(S30, S31, S32, S33); \
 	CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \
 	SMIX(S27, S28, S29, S30); \
 	CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \
 	SMIX(S24, S25, S26, S27); \
 }
 #undef ROL8
 #ifdef __CUDA_ARCH__
 __device__ __forceinline__
 uint32_t ROL8(const uint32_t a) {
 	return __byte_perm(a, 0, 0x2103);
 }
 __device__ __forceinline__
 uint32_t ROR8(const uint32_t a) {
 	return __byte_perm(a, 0, 0x0321);
 }
 __device__ __forceinline__
 uint32_t ROL16(const uint32_t a) {
 	return __byte_perm(a, 0, 0x1032);
 }
 #else
 #define ROL8(u)  ROTL32(u, 8)
 #define ROR8(u)  ROTR32(u, 8)
 #define ROL16(u) ROTL32(u,16)
 #endif
 //#define AS_UINT4(addr) *((uint4*)(addr))
 __constant__ static uint64_t c_PaddedMessage80[10];
 __host__
 void x16_fugue512_setBlock_80(void *pdata)
 {
 	cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
 }
 /***************************************************/
 __global__
 __launch_bounds__(TPB)
 void x16_fugue512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash)
 {
 	__shared__ uint32_t mixtabs[1024];
 	// load shared mem (with 256 threads)
 	const uint32_t thr = threadIdx.x & 0xFF;
 	const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr);
 	mixtabs[thr] = tmp;
 	mixtabs[thr+256] = ROR8(tmp);
 	mixtabs[thr+512] = ROL16(tmp);
 	mixtabs[thr+768] = ROL8(tmp);
 #if TPB <= 256
 	if (blockDim.x < 256) {
 		const uint32_t thr = (threadIdx.x + 0x80) & 0xFF;
 		const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr);
 		mixtabs[thr] = tmp;
 		mixtabs[thr + 256] = ROR8(tmp);
 		mixtabs[thr + 512] = ROL16(tmp);
 		mixtabs[thr + 768] = ROL8(tmp);
 	}
 #endif
 	__syncthreads();
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint32_t Data[20];
 		#pragma unroll
 		for(int i = 0; i < 10; i++)
 			AS_UINT2(&Data[i * 2]) = AS_UINT2(&c_PaddedMessage80[i]);
 		Data[19] = (startNonce + thread);
 		uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09, S10, S11;
 		uint32_t S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22, S23;
 		uint32_t S24, S25, S26, S27, S28, S29, S30, S31, S32, S33, S34, S35;
 		//uint32_t B24, B25, B26,
 		uint32_t B27, B28, B29, B30, B31, B32, B33, B34, B35;
 		//const uint64_t bc = 640 bits to hash
 		//const uint32_t bclo = (uint32_t)(bc);
 		//const uint32_t bchi = (uint32_t)(bc >> 32);
 		S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = 0;
 		S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0;
 		S20 = 0x8807a57e; S21 = 0xe616af75; S22 = 0xc5d3e4db; S23 = 0xac9ab027;
 		S24 = 0xd915f117; S25 = 0xb6eecc54; S26 = 0x06e8020b; S27 = 0x4a92efd1;
 		S28 = 0xaac6e2c9; S29 = 0xddb21398; S30 = 0xcae65838; S31 = 0x437f203f;
 		S32 = 0x25ea78e7; S33 = 0x951fddd6; S34 = 0xda6ed11d; S35 = 0xe13e3567;
 		FUGUE512_3((Data[ 0]), (Data[ 1]), (Data[ 2]));
 		FUGUE512_3((Data[ 3]), (Data[ 4]), (Data[ 5]));
 		FUGUE512_3((Data[ 6]), (Data[ 7]), (Data[ 8]));
 		FUGUE512_3((Data[ 9]), (Data[10]), (Data[11]));
 		FUGUE512_3((Data[12]), (Data[13]), (Data[14]));
 		FUGUE512_3((Data[15]), (Data[16]), (Data[17]));
 		FUGUE512_F((Data[18]), (Data[19]), 0/*bchi*/, (80*8)/*bclo*/);
 		// rotate right state by 3 dwords (S00 = S33, S03 = S00)
 		SUB_ROR3;
 		SUB_ROR9;
 		#pragma unroll 32
 		for (int i = 0; i < 32; i++) {
 			SUB_ROR3;
 			CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20);
 			SMIX(S00, S01, S02, S03);
 		}
 		#pragma unroll 13
 		for (int i = 0; i < 13; i++) {
 			S04 ^= S00;
 			S09 ^= S00;
 			S18 ^= S00;
 			S27 ^= S00;
 			SUB_ROR9;
 			SMIX(S00, S01, S02, S03);
 			S04 ^= S00;
 			S10 ^= S00;
 			S18 ^= S00;
 			S27 ^= S00;
 			SUB_ROR9;
 			SMIX(S00, S01, S02, S03);
 			S04 ^= S00;
 			S10 ^= S00;
 			S19 ^= S00;
 			S27 ^= S00;
 			SUB_ROR9;
 			SMIX(S00, S01, S02, S03);
 			S04 ^= S00;
 			S10 ^= S00;
 			S19 ^= S00;
 			S28 ^= S00;
 			SUB_ROR8;
 			SMIX(S00, S01, S02, S03);
 		}
 		S04 ^= S00;
 		S09 ^= S00;
 		S18 ^= S00;
 		S27 ^= S00;
 		Data[ 0] = cuda_swab32(S01);
 		Data[ 1] = cuda_swab32(S02);
 		Data[ 2] = cuda_swab32(S03);
 		Data[ 3] = cuda_swab32(S04);
 		Data[ 4] = cuda_swab32(S09);
 		Data[ 5] = cuda_swab32(S10);
 		Data[ 6] = cuda_swab32(S11);
 		Data[ 7] = cuda_swab32(S12);
 		Data[ 8] = cuda_swab32(S18);
 		Data[ 9] = cuda_swab32(S19);
 		Data[10] = cuda_swab32(S20);
 		Data[11] = cuda_swab32(S21);
 		Data[12] = cuda_swab32(S27);
 		Data[13] = cuda_swab32(S28);
 		Data[14] = cuda_swab32(S29);
 		Data[15] = cuda_swab32(S30);
 		const size_t hashPosition = thread;
 		uint64_t* pHash = &g_hash[hashPosition << 3];
 		#pragma unroll 4
 		for(int i = 0; i < 4; i++)
 			AS_UINT4(&pHash[i * 2]) = AS_UINT4(&Data[i * 4]);
 	}
 }
 #define texDef(id, texname, texmem, texsource, texsize) { \
 	unsigned int *texmem; \
 	cudaMalloc(&texmem, texsize); \
 	d_textures[thr_id][id] = texmem; \
 	cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
 	texname.normalized = 0; \
 	texname.filterMode = cudaFilterModePoint; \
 	texname.addressMode[0] = cudaAddressModeClamp; \
 	{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
 	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \
 	} \
 }
 __host__
 void x16_fugue512_cpu_init(int thr_id, uint32_t threads)
 {
 	texDef(0, mixTab0Tex, mixTab0m, mixtab0, sizeof(uint32_t)*256);
 }
 __host__
 void x16_fugue512_cpu_free(int thr_id)
 {
 	cudaFree(d_textures[thr_id][0]);
 }
 __host__
 void x16_fugue512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash)
 {
 	const uint32_t threadsperblock = TPB;
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 	x16_fugue512_gpu_hash_80 <<<grid, block>>> (threads, startNonce, (uint64_t*)d_hash);
 }
--- a/x16r/cuda_x16_shabal512.cu
+++ b/x16r/cuda_x16_shabal512.cu
@ -0,0 +1,350 @@
 /*
 * Shabal-512 for X16R
 * tpruvot 2018, based on alexis x14 and xevan kernlx code
 */
 #include <cuda_helper.h>
 #include <cuda_vectors.h>
 #include <cuda_vector_uint2x4.h>
 typedef uint32_t sph_u32;
 #define C32(x) (x)
 #define T32(x) (x)
 #define INPUT_BLOCK_ADD do { \
 		B0 = T32(B0 + M0); \
 		B1 = T32(B1 + M1); \
 		B2 = T32(B2 + M2); \
 		B3 = T32(B3 + M3); \
 		B4 = T32(B4 + M4); \
 		B5 = T32(B5 + M5); \
 		B6 = T32(B6 + M6); \
 		B7 = T32(B7 + M7); \
 		B8 = T32(B8 + M8); \
 		B9 = T32(B9 + M9); \
 		BA = T32(BA + MA); \
 		BB = T32(BB + MB); \
 		BC = T32(BC + MC); \
 		BD = T32(BD + MD); \
 		BE = T32(BE + ME); \
 		BF = T32(BF + MF); \
 		} while (0)
 #define INPUT_BLOCK_SUB do { \
 		C0 = T32(C0 - M0); \
 		C1 = T32(C1 - M1); \
 		C2 = T32(C2 - M2); \
 		C3 = T32(C3 - M3); \
 		C4 = T32(C4 - M4); \
 		C5 = T32(C5 - M5); \
 		C6 = T32(C6 - M6); \
 		C7 = T32(C7 - M7); \
 		C8 = T32(C8 - M8); \
 		C9 = T32(C9 - M9); \
 		CA = T32(CA - MA); \
 		CB = T32(CB - MB); \
 		CC = T32(CC - MC); \
 		CD = T32(CD - MD); \
 		CE = T32(CE - ME); \
 		CF = T32(CF - MF); \
 		} while (0)
 #define XOR_W   do { \
 		A00 ^= Wlow; \
 		A01 ^= Whigh; \
 		} while (0)
 #define SWAP(v1, v2) do { \
 		sph_u32 tmp = (v1); \
 		(v1) = (v2); \
 		(v2) = tmp; \
 		} while (0)
 #define SWAP_BC do { \
 		SWAP(B0, C0); \
 		SWAP(B1, C1); \
 		SWAP(B2, C2); \
 		SWAP(B3, C3); \
 		SWAP(B4, C4); \
 		SWAP(B5, C5); \
 		SWAP(B6, C6); \
 		SWAP(B7, C7); \
 		SWAP(B8, C8); \
 		SWAP(B9, C9); \
 		SWAP(BA, CA); \
 		SWAP(BB, CB); \
 		SWAP(BC, CC); \
 		SWAP(BD, CD); \
 		SWAP(BE, CE); \
 		SWAP(BF, CF); \
 		} while (0)
 #define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) do { \
 		xa0 = T32((xa0 \
 			^ (((xa1 << 15) | (xa1 >> 17)) * 5U) \
 			^ xc) * 3U) \
 			^ xb1 ^ (xb2 & ~xb3) ^ xm; \
 		xb0 = T32(~(((xb0 << 1) | (xb0 >> 31)) ^ xa0)); \
 		} while (0)
 #define PERM_STEP_0 do { \
 		PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \
 		PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \
 		PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \
 		PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \
 		PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \
 		PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \
 		PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \
 		PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \
 		PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \
 		PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \
 		PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \
 		PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \
 		PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \
 		PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \
 		PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \
 		PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \
 		} while (0)
 #define PERM_STEP_1 do { \
 		PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \
 		PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \
 		PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \
 		PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \
 		PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \
 		PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \
 		PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \
 		PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \
 		PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \
 		PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \
 		PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \
 		PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \
 		PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \
 		PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \
 		PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \
 		PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \
 		} while (0)
 #define PERM_STEP_2 do { \
 		PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \
 		PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \
 		PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \
 		PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \
 		PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \
 		PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \
 		PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \
 		PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \
 		PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \
 		PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \
 		PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \
 		PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \
 		PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \
 		PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \
 		PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \
 		PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \
 		} while (0)
 #define APPLY_P do { \
 		B0 = T32(B0 << 17) | (B0 >> 15); \
 		B1 = T32(B1 << 17) | (B1 >> 15); \
 		B2 = T32(B2 << 17) | (B2 >> 15); \
 		B3 = T32(B3 << 17) | (B3 >> 15); \
 		B4 = T32(B4 << 17) | (B4 >> 15); \
 		B5 = T32(B5 << 17) | (B5 >> 15); \
 		B6 = T32(B6 << 17) | (B6 >> 15); \
 		B7 = T32(B7 << 17) | (B7 >> 15); \
 		B8 = T32(B8 << 17) | (B8 >> 15); \
 		B9 = T32(B9 << 17) | (B9 >> 15); \
 		BA = T32(BA << 17) | (BA >> 15); \
 		BB = T32(BB << 17) | (BB >> 15); \
 		BC = T32(BC << 17) | (BC >> 15); \
 		BD = T32(BD << 17) | (BD >> 15); \
 		BE = T32(BE << 17) | (BE >> 15); \
 		BF = T32(BF << 17) | (BF >> 15); \
 		PERM_STEP_0; \
 		PERM_STEP_1; \
 		PERM_STEP_2; \
 		A0B = T32(A0B + C6); \
 		A0A = T32(A0A + C5); \
 		A09 = T32(A09 + C4); \
 		A08 = T32(A08 + C3); \
 		A07 = T32(A07 + C2); \
 		A06 = T32(A06 + C1); \
 		A05 = T32(A05 + C0); \
 		A04 = T32(A04 + CF); \
 		A03 = T32(A03 + CE); \
 		A02 = T32(A02 + CD); \
 		A01 = T32(A01 + CC); \
 		A00 = T32(A00 + CB); \
 		A0B = T32(A0B + CA); \
 		A0A = T32(A0A + C9); \
 		A09 = T32(A09 + C8); \
 		A08 = T32(A08 + C7); \
 		A07 = T32(A07 + C6); \
 		A06 = T32(A06 + C5); \
 		A05 = T32(A05 + C4); \
 		A04 = T32(A04 + C3); \
 		A03 = T32(A03 + C2); \
 		A02 = T32(A02 + C1); \
 		A01 = T32(A01 + C0); \
 		A00 = T32(A00 + CF); \
 		A0B = T32(A0B + CE); \
 		A0A = T32(A0A + CD); \
 		A09 = T32(A09 + CC); \
 		A08 = T32(A08 + CB); \
 		A07 = T32(A07 + CA); \
 		A06 = T32(A06 + C9); \
 		A05 = T32(A05 + C8); \
 		A04 = T32(A04 + C7); \
 		A03 = T32(A03 + C6); \
 		A02 = T32(A02 + C5); \
 		A01 = T32(A01 + C4); \
 		A00 = T32(A00 + C3); \
 	} while (0)
 #define INCR_W do { \
 	if ((Wlow = T32(Wlow + 1)) == 0) \
 		Whigh = T32(Whigh + 1); \
 	} while (0)
 __constant__ static const sph_u32 A_init_512[] = {
 	C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632),
 	C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B),
 	C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F)
 };
 __constant__ static const sph_u32 B_init_512[] = {
 	C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640),
 	C32(0xEB6F56C7), C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08),
 	C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E),
 	C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B)
 };
 __constant__ static const sph_u32 C_init_512[] = {
 	C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359),
 	C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780),
 	C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A),
 	C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969)
 };
 __constant__ static uint32_t c_PaddedMessage80[20];
 __host__
 void x16_shabal512_setBlock_80(void *pdata)
 {
 	cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
 }
 #define TPB_SHABAL 256
 __global__ __launch_bounds__(TPB_SHABAL, 2)
 void x16_shabal512_gpu_hash_80(uint32_t threads, const uint32_t startNonce, uint32_t *g_hash)
 {
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	uint32_t B[] = {
 		0xC1099CB7, 0x07B385F3, 0xE7442C26, 0xCC8AD640, 0xEB6F56C7, 0x1EA81AA9, 0x73B9D314, 0x1DE85D08,
 		0x48910A5A, 0x893B22DB, 0xC5A0DF44, 0xBBC4324E, 0x72D2F240, 0x75941D99, 0x6D8BDE82, 0xA1A7502B
 	};
 	uint32_t M[16];
 	if (thread < threads)
 	{
 		// todo: try __ldc
 		*(uint2x4*)&M[0] = *(uint2x4*)&c_PaddedMessage80[0];
 		*(uint2x4*)&M[8] = *(uint2x4*)&c_PaddedMessage80[8];
 		sph_u32 A00 = A_init_512[0], A01 = A_init_512[1], A02 = A_init_512[ 2], A03 = A_init_512[ 3];
 		sph_u32 A04 = A_init_512[4], A05 = A_init_512[5], A06 = A_init_512[ 6], A07 = A_init_512[ 7];
 		sph_u32 A08 = A_init_512[8], A09 = A_init_512[9], A0A = A_init_512[10], A0B = A_init_512[11];
 		sph_u32 B0 = B_init_512[ 0], B1 = B_init_512[ 1], B2 = B_init_512[ 2], B3 = B_init_512 [3];
 		sph_u32 B4 = B_init_512[ 4], B5 = B_init_512[ 5], B6 = B_init_512[ 6], B7 = B_init_512[ 7];
 		sph_u32 B8 = B_init_512[ 8], B9 = B_init_512[ 9], BA = B_init_512[10], BB = B_init_512[11];
 		sph_u32 BC = B_init_512[12], BD = B_init_512[13], BE = B_init_512[14], BF = B_init_512[15];
 		sph_u32 C0 = C_init_512[ 0], C1 = C_init_512[ 1], C2 = C_init_512[ 2], C3 = C_init_512[ 3];
 		sph_u32 C4 = C_init_512[ 4], C5 = C_init_512[ 5], C6 = C_init_512[ 6], C7 = C_init_512[ 7];
 		sph_u32 C8 = C_init_512[ 8], C9 = C_init_512[ 9], CA = C_init_512[10], CB = C_init_512[11];
 		sph_u32 CC = C_init_512[12], CD = C_init_512[13], CE = C_init_512[14], CF = C_init_512[15];
 		sph_u32 M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
 		sph_u32 Wlow = 1, Whigh = 0;
 		M0 = M[ 0];
 		M1 = M[ 1];
 		M2 = M[ 2];
 		M3 = M[ 3];
 		M4 = M[ 4];
 		M5 = M[ 5];
 		M6 = M[ 6];
 		M7 = M[ 7];
 		M8 = M[ 8];
 		M9 = M[ 9];
 		MA = M[10];
 		MB = M[11];
 		MC = M[12];
 		MD = M[13];
 		ME = M[14];
 		MF = M[15];
 		INPUT_BLOCK_ADD;
 		XOR_W;
 		APPLY_P;
 		INPUT_BLOCK_SUB;
 		SWAP_BC;
 		INCR_W;
 		M0 = c_PaddedMessage80[16];
 		M1 = c_PaddedMessage80[17];
 		M2 = c_PaddedMessage80[18];
 		M3 = cuda_swab32(startNonce + thread);
 		M4 = 0x80;
 		M5 = M6 = M7 = M8 = M9 = MA = MB = MC = MD = ME = MF = 0;
 		INPUT_BLOCK_ADD;
 		XOR_W;
 		APPLY_P;
 		for (unsigned i = 0; i < 3; i++) {
 			SWAP_BC;
 			XOR_W;
 			APPLY_P;
 		}
 		B[ 0] = B0;
 		B[ 1] = B1;
 		B[ 2] = B2;
 		B[ 3] = B3;
 		B[ 4] = B4;
 		B[ 5] = B5;
 		B[ 6] = B6;
 		B[ 7] = B7;
 		B[ 8] = B8;
 		B[ 9] = B9;
 		B[10] = BA;
 		B[11] = BB;
 		B[12] = BC;
 		B[13] = BD;
 		B[14] = BE;
 		B[15] = BF;
 		// output
 		uint64_t hashPosition = thread;
 		uint32_t *Hash = &g_hash[hashPosition << 4];
 		*(uint2x4*)&Hash[0] = *(uint2x4*)&B[0];
 		*(uint2x4*)&Hash[8] = *(uint2x4*)&B[8];
 	}
 }
 __host__
 void x16_shabal512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash)
 {
 	const uint32_t threadsperblock = TPB_SHABAL;
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	dim3 block(threadsperblock);
 	x16_shabal512_gpu_hash_80 <<<grid, block >>>(threads, startNonce, d_hash);
 }
--- a/x16r/cuda_x16_simd512_80.cu
+++ b/x16r/cuda_x16_simd512_80.cu
--- a/x16r/cuda_x16r.h
+++ b/x16r/cuda_x16r.h
@ -0,0 +1,75 @@
 #include "x11/cuda_x11.h"
 extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads);
 extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
 extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void x13_fugue512_cpu_free(int thr_id);
 extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads);
 extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int flag);
 extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void x15_whirlpool_cpu_free(int thr_id);
 extern void x17_sha512_cpu_init(int thr_id, uint32_t threads);
 extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash);
 extern void x17_haval256_cpu_init(int thr_id, uint32_t threads);
 extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, const int outlen);
 void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order);
 // ---- 80 bytes kernels
 void quark_bmw512_cpu_setBlock_80(void *pdata);
 void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order);
 void groestl512_setBlock_80(int thr_id, uint32_t *endiandata);
 void groestl512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
 void skein512_cpu_setBlock_80(void *pdata);
 void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int swap);
 void qubit_luffa512_cpu_init(int thr_id, uint32_t threads);
 void qubit_luffa512_cpu_setBlock_80(void *pdata);
 void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order);
 void jh512_setBlock_80(int thr_id, uint32_t *endiandata);
 void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
 void keccak512_setBlock_80(int thr_id, uint32_t *endiandata);
 void keccak512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
 void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata);
 void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
 void x11_shavite512_setBlock_80(void *pdata);
 void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order);
 void x16_shabal512_setBlock_80(void *pdata);
 void x16_shabal512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
 void x16_simd512_setBlock_80(void *pdata);
 void x16_simd512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
 void x16_echo512_cuda_init(int thr_id, const uint32_t threads);
 void x16_echo512_setBlock_80(void *pdata);
 void x16_echo512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
 void x16_hamsi512_setBlock_80(void *pdata);
 void x16_hamsi512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
 void x16_fugue512_cpu_init(int thr_id, uint32_t threads);
 void x16_fugue512_cpu_free(int thr_id);
 void x16_fugue512_setBlock_80(void *pdata);
 void x16_fugue512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
 void x16_whirlpool512_init(int thr_id, uint32_t threads);
 void x16_whirlpool512_setBlock_80(void* endiandata);
 void x16_whirlpool512_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
 void x16_sha512_setBlock_80(void *pdata);
 void x16_sha512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
--- a/x16r/x16r.cu
+++ b/x16r/x16r.cu
@ -0,0 +1,625 @@
 /**
 * X16R algorithm (X16 with Randomized chain order)
 *
 * tpruvot 2018 - GPL code
 */
 #include <stdio.h>
 #include <memory.h>
 #include <unistd.h>
 extern "C" {
 #include "sph/sph_blake.h"
 #include "sph/sph_bmw.h"
 #include "sph/sph_groestl.h"
 #include "sph/sph_skein.h"
 #include "sph/sph_jh.h"
 #include "sph/sph_keccak.h"
 #include "sph/sph_luffa.h"
 #include "sph/sph_cubehash.h"
 #include "sph/sph_shavite.h"
 #include "sph/sph_simd.h"
 #include "sph/sph_echo.h"
 #include "sph/sph_hamsi.h"
 #include "sph/sph_fugue.h"
 #include "sph/sph_shabal.h"
 #include "sph/sph_whirlpool.h"
 #include "sph/sph_sha2.h"
 }
 #include "miner.h"
 #include "cuda_helper.h"
 #include "cuda_x16r.h"
 static uint32_t *d_hash[MAX_GPUS];
 enum Algo {
 	BLAKE = 0,
 	BMW,
 	GROESTL,
 	JH,
 	KECCAK,
 	SKEIN,
 	LUFFA,
 	CUBEHASH,
 	SHAVITE,
 	SIMD,
 	ECHO,
 	HAMSI,
 	FUGUE,
 	SHABAL,
 	WHIRLPOOL,
 	SHA512,
 	HASH_FUNC_COUNT
 };
 static const char* algo_strings[] = {
 	"blake",
 	"bmw512",
 	"groestl",
 	"jh512",
 	"keccak",
 	"skein",
 	"luffa",
 	"cube",
 	"shavite",
 	"simd",
 	"echo",
 	"hamsi",
 	"fugue",
 	"shabal",
 	"whirlpool",
 	"sha512",
 	NULL
 };
 static __thread uint32_t s_ntime = UINT32_MAX;
 static __thread bool s_implemented = false;
 static __thread char hashOrder[HASH_FUNC_COUNT + 1] = { 0 };
 static void getAlgoString(const uint32_t* prevblock, char *output)
 {
 	char *sptr = output;
 	uint8_t* data = (uint8_t*)prevblock;
 	for (uint8_t j = 0; j < HASH_FUNC_COUNT; j++) {
 		uint8_t b = (15 - j) >> 1; // 16 ascii hex chars, reversed
 		uint8_t algoDigit = (j & 1) ? data[b] & 0xF : data[b] >> 4;
 		if (algoDigit >= 10)
 			sprintf(sptr, "%c", 'A' + (algoDigit - 10));
 		else
 			sprintf(sptr, "%u", (uint32_t) algoDigit);
 		sptr++;
 	}
 	*sptr = '\0';
 }
 // X16R CPU Hash (Validation)
 extern "C" void x16r_hash(void *output, const void *input)
 {
 	unsigned char _ALIGN(64) hash[128];
 	sph_blake512_context ctx_blake;
 	sph_bmw512_context ctx_bmw;
 	sph_groestl512_context ctx_groestl;
 	sph_jh512_context ctx_jh;
 	sph_keccak512_context ctx_keccak;
 	sph_skein512_context ctx_skein;
 	sph_luffa512_context ctx_luffa;
 	sph_cubehash512_context ctx_cubehash;
 	sph_shavite512_context ctx_shavite;
 	sph_simd512_context ctx_simd;
 	sph_echo512_context ctx_echo;
 	sph_hamsi512_context ctx_hamsi;
 	sph_fugue512_context ctx_fugue;
 	sph_shabal512_context ctx_shabal;
 	sph_whirlpool_context ctx_whirlpool;
 	sph_sha512_context ctx_sha512;
 	void *in = (void*) input;
 	int size = 80;
 	uint32_t *in32 = (uint32_t*) input;
 	getAlgoString(&in32[1], hashOrder);
 	for (int i = 0; i < 16; i++)
 	{
 		const char elem = hashOrder[i];
 		const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
 		switch (algo) {
 		case BLAKE:
 			sph_blake512_init(&ctx_blake);
 			sph_blake512(&ctx_blake, in, size);
 			sph_blake512_close(&ctx_blake, hash);
 			break;
 		case BMW:
 			sph_bmw512_init(&ctx_bmw);
 			sph_bmw512(&ctx_bmw, in, size);
 			sph_bmw512_close(&ctx_bmw, hash);
 			break;
 		case GROESTL:
 			sph_groestl512_init(&ctx_groestl);
 			sph_groestl512(&ctx_groestl, in, size);
 			sph_groestl512_close(&ctx_groestl, hash);
 			break;
 		case SKEIN:
 			sph_skein512_init(&ctx_skein);
 			sph_skein512(&ctx_skein, in, size);
 			sph_skein512_close(&ctx_skein, hash);
 			break;
 		case JH:
 			sph_jh512_init(&ctx_jh);
 			sph_jh512(&ctx_jh, in, size);
 			sph_jh512_close(&ctx_jh, hash);
 			break;
 		case KECCAK:
 			sph_keccak512_init(&ctx_keccak);
 			sph_keccak512(&ctx_keccak, in, size);
 			sph_keccak512_close(&ctx_keccak, hash);
 			break;
 		case LUFFA:
 			sph_luffa512_init(&ctx_luffa);
 			sph_luffa512(&ctx_luffa, in, size);
 			sph_luffa512_close(&ctx_luffa, hash);
 			break;
 		case CUBEHASH:
 			sph_cubehash512_init(&ctx_cubehash);
 			sph_cubehash512(&ctx_cubehash, in, size);
 			sph_cubehash512_close(&ctx_cubehash, hash);
 			break;
 		case SHAVITE:
 			sph_shavite512_init(&ctx_shavite);
 			sph_shavite512(&ctx_shavite, in, size);
 			sph_shavite512_close(&ctx_shavite, hash);
 			break;
 		case SIMD:
 			sph_simd512_init(&ctx_simd);
 			sph_simd512(&ctx_simd, in, size);
 			sph_simd512_close(&ctx_simd, hash);
 			break;
 		case ECHO:
 			sph_echo512_init(&ctx_echo);
 			sph_echo512(&ctx_echo, in, size);
 			sph_echo512_close(&ctx_echo, hash);
 			break;
 		case HAMSI:
 			sph_hamsi512_init(&ctx_hamsi);
 			sph_hamsi512(&ctx_hamsi, in, size);
 			sph_hamsi512_close(&ctx_hamsi, hash);
 			break;
 		case FUGUE:
 			sph_fugue512_init(&ctx_fugue);
 			sph_fugue512(&ctx_fugue, in, size);
 			sph_fugue512_close(&ctx_fugue, hash);
 			break;
 		case SHABAL:
 			sph_shabal512_init(&ctx_shabal);
 			sph_shabal512(&ctx_shabal, in, size);
 			sph_shabal512_close(&ctx_shabal, hash);
 			break;
 		case WHIRLPOOL:
 			sph_whirlpool_init(&ctx_whirlpool);
 			sph_whirlpool(&ctx_whirlpool, in, size);
 			sph_whirlpool_close(&ctx_whirlpool, hash);
 			break;
 		case SHA512:
 			sph_sha512_init(&ctx_sha512);
 			sph_sha512(&ctx_sha512,(const void*) in, size);
 			sph_sha512_close(&ctx_sha512,(void*) hash);
 			break;
 		}
 		in = (void*) hash;
 		size = 64;
 	}
 	memcpy(output, hash, 32);
 }
 void whirlpool_midstate(void *state, const void *input)
 {
 	sph_whirlpool_context ctx;
 	sph_whirlpool_init(&ctx);
 	sph_whirlpool(&ctx, input, 64);
 	memcpy(state, ctx.state, 64);
 }
 static bool init[MAX_GPUS] = { 0 };
 //#define _DEBUG
 #define _DEBUG_PREFIX "x16r-"
 #include "cuda_debug.cuh"
 //static int algo80_tests[HASH_FUNC_COUNT] = { 0 };
 //static int algo64_tests[HASH_FUNC_COUNT] = { 0 };
 static int algo80_fails[HASH_FUNC_COUNT] = { 0 };
 extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 {
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
 	const int dev_id = device_map[thr_id];
 	int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 19;
 	if (strstr(device_name[dev_id], "GTX 1080")) intensity = 20;
 	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
 	//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
 		if (opt_cudaschedule == -1 && gpu_threads == 1) {
 			cudaDeviceReset();
 			// reduce cpu usage
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 		}
 		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_bmw512_cpu_init(thr_id, throughput);
 		quark_groestl512_cpu_init(thr_id, throughput);
 		quark_skein512_cpu_init(thr_id, throughput);
 		quark_jh512_cpu_init(thr_id, throughput);
 		quark_keccak512_cpu_init(thr_id, throughput);
 		qubit_luffa512_cpu_init(thr_id, throughput);
 		x11_luffa512_cpu_init(thr_id, throughput); // 64
 		x11_shavite512_cpu_init(thr_id, throughput);
 		x11_simd512_cpu_init(thr_id, throughput); // 64
 		x11_echo512_cpu_init(thr_id, throughput);
 		x16_echo512_cuda_init(thr_id, throughput);
 		x13_hamsi512_cpu_init(thr_id, throughput);
 		x13_fugue512_cpu_init(thr_id, throughput);
 		x16_fugue512_cpu_init(thr_id, throughput);
 		x14_shabal512_cpu_init(thr_id, throughput);
 		x15_whirlpool_cpu_init(thr_id, throughput, 0);
 		x16_whirlpool512_init(thr_id, throughput);
 		x17_sha512_cpu_init(thr_id, throughput);
 		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
 		cuda_check_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
 	}
 	if (opt_benchmark) {
 		((uint32_t*)ptarget)[7] = 0x003f;
 		((uint8_t*)pdata)[8] = 0x90; // hashOrder[0] = '9'; for simd 80 + blake512 64
 		//((uint8_t*)pdata)[8] = 0xA0; // hashOrder[0] = 'A'; for echo 80 + blake512 64
 		//((uint8_t*)pdata)[8] = 0xB0; // hashOrder[0] = 'B'; for hamsi 80 + blake512 64
 		//((uint8_t*)pdata)[8] = 0xC0; // hashOrder[0] = 'C'; for fugue 80 + blake512 64
 		//((uint8_t*)pdata)[8] = 0xE0; // hashOrder[0] = 'E'; for whirlpool 80 + blake512 64
 	}
 	uint32_t _ALIGN(64) endiandata[20];
 	for (int k=0; k < 19; k++)
 		be32enc(&endiandata[k], pdata[k]);
 	uint32_t ntime = swab32(pdata[17]);
 	if (s_ntime != ntime) {
 		getAlgoString(&endiandata[1], hashOrder);
 		s_ntime = ntime;
 		s_implemented = true;
 		if (opt_debug && !thr_id) applog(LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime);
 	}
 	if (!s_implemented) {
 		sleep(1);
 		return -1;
 	}
 	cuda_check_cpu_setTarget(ptarget);
 	char elem = hashOrder[0];
 	const uint8_t algo80 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
 	switch (algo80) {
 		case BLAKE:
 			quark_blake512_cpu_setBlock_80(thr_id, endiandata);
 			break;
 		case BMW:
 			quark_bmw512_cpu_setBlock_80(endiandata);
 			break;
 		case GROESTL:
 			groestl512_setBlock_80(thr_id, endiandata);
 			break;
 		case JH:
 			jh512_setBlock_80(thr_id, endiandata);
 			break;
 		case KECCAK:
 			keccak512_setBlock_80(thr_id, endiandata);
 			break;
 		case SKEIN:
 			skein512_cpu_setBlock_80((void*)endiandata);
 			break;
 		case LUFFA:
 			qubit_luffa512_cpu_setBlock_80((void*)endiandata);
 			break;
 		case CUBEHASH:
 			cubehash512_setBlock_80(thr_id, endiandata);
 			break;
 		case SHAVITE:
 			x11_shavite512_setBlock_80((void*)endiandata);
 			break;
 		case SIMD:
 			x16_simd512_setBlock_80((void*)endiandata);
 			break;
 		case ECHO:
 			x16_echo512_setBlock_80((void*)endiandata);
 			break;
 		case HAMSI:
 			x16_hamsi512_setBlock_80((void*)endiandata);
 			break;
 		case FUGUE:
 			x16_fugue512_setBlock_80((void*)pdata);
 			break;
 		case SHABAL:
 			x16_shabal512_setBlock_80((void*)endiandata);
 			break;
 		case WHIRLPOOL:
 			x16_whirlpool512_setBlock_80((void*)endiandata);
 			break;
 		case SHA512:
 			x16_sha512_setBlock_80(endiandata);
 			break;
 		default: {
 			if (!thr_id)
 				applog(LOG_WARNING, "kernel %s %c unimplemented, order %s", algo_strings[algo80], elem, hashOrder);
 			s_implemented = false;
 			sleep(5);
 			return -1;
 		}
 	}
 	int warn = 0;
 	do {
 		int order = 0;
 		// Hash with CUDA
 		switch (algo80) {
 			case BLAKE:
 				quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("blake80:");
 				break;
 			case BMW:
 				quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 				TRACE("bmw80  :");
 				break;
 			case GROESTL:
 				groestl512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("grstl80:");
 				break;
 			case JH:
 				jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("jh51280:");
 				break;
 			case KECCAK:
 				keccak512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("kecck80:");
 				break;
 			case SKEIN:
 				skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++;
 				TRACE("skein80:");
 				break;
 			case LUFFA:
 				qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 				TRACE("luffa80:");
 				break;
 			case CUBEHASH:
 				cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("cube 80:");
 				break;
 			case SHAVITE:
 				x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 				TRACE("shavite:");
 				break;
 			case SIMD:
 				x16_simd512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("simd512:");
 				break;
 			case ECHO:
 				x16_echo512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("echo   :");
 				break;
 			case HAMSI:
 				x16_hamsi512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("hamsi  :");
 				break;
 			case FUGUE:
 				x16_fugue512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("fugue  :");
 				break;
 			case SHABAL:
 				x16_shabal512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("shabal :");
 				break;
 			case WHIRLPOOL:
 				x16_whirlpool512_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("whirl  :");
 				break;
 			case SHA512:
 				x16_sha512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("sha512 :");
 				break;
 		}
 		for (int i = 1; i < 16; i++)
 		{
 			const char elem = hashOrder[i];
 			const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
 			switch (algo64) {
 			case BLAKE:
 				quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("blake  :");
 				break;
 			case BMW:
 				quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("bmw    :");
 				break;
 			case GROESTL:
 				quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("groestl:");
 				break;
 			case JH:
 				quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("jh512  :");
 				break;
 			case KECCAK:
 				quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("keccak :");
 				break;
 			case SKEIN:
 				quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("skein  :");
 				break;
 			case LUFFA:
 				x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("luffa  :");
 				break;
 			case CUBEHASH:
 				x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("cube   :");
 				break;
 			case SHAVITE:
 				x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("shavite:");
 				break;
 			case SIMD:
 				x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("simd   :");
 				break;
 			case ECHO:
 				x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("echo   :");
 				break;
 			case HAMSI:
 				x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("hamsi  :");
 				break;
 			case FUGUE:
 				x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("fugue  :");
 				break;
 			case SHABAL:
 				x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("shabal :");
 				break;
 			case WHIRLPOOL:
 				x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("shabal :");
 				break;
 			case SHA512:
 				x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("sha512 :");
 				break;
 			}
 		}
 		*hashes_done = pdata[19] - first_nonce + throughput;
 		work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 #ifdef _DEBUG
 		uint32_t _ALIGN(64) dhash[8];
 		be32enc(&endiandata[19], pdata[19]);
 		x16r_hash(dhash, endiandata);
 		applog_hash(dhash);
 		return -1;
 #endif
 		if (work->nonces[0] != UINT32_MAX)
 		{
 			const uint32_t Htarg = ptarget[7];
 			uint32_t _ALIGN(64) vhash[8];
 			be32enc(&endiandata[19], work->nonces[0]);
 			x16r_hash(vhash, endiandata);
 			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
 				work->valid_nonces = 1;
 				work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
 				work_set_target_ratio(work, vhash);
 				if (work->nonces[1] != 0) {
 					be32enc(&endiandata[19], work->nonces[1]);
 					x16r_hash(vhash, endiandata);
 					bn_set_target_ratio(work, vhash, 1);
 					work->valid_nonces++;
 					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
 				} else {
 					pdata[19] = work->nonces[0] + 1; // cursor
 				}
 #if 0
 				gpulog(LOG_INFO, thr_id, "hash found with %s 80!", algo_strings[algo80]);
 				algo80_tests[algo80] += work->valid_nonces;
 				char oks64[128] = { 0 };
 				char oks80[128] = { 0 };
 				char fails[128] = { 0 };
 				for (int a = 0; a < HASH_FUNC_COUNT; a++) {
 					const char elem = hashOrder[a];
 					const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
 					if (a > 0) algo64_tests[algo64] += work->valid_nonces;
 					sprintf(&oks64[strlen(oks64)], "|%X:%2d", a, algo64_tests[a] < 100 ? algo64_tests[a] : 99);
 					sprintf(&oks80[strlen(oks80)], "|%X:%2d", a, algo80_tests[a] < 100 ? algo80_tests[a] : 99);
 					sprintf(&fails[strlen(fails)], "|%X:%2d", a, algo80_fails[a] < 100 ? algo80_fails[a] : 99);
 				}
 				applog(LOG_INFO, "K64: %s", oks64);
 				applog(LOG_INFO, "K80: %s", oks80);
 				applog(LOG_ERR,  "F80: %s", fails);
 #endif
 				return work->valid_nonces;
 			}
 			else if (vhash[7] > Htarg) {
 				// x11+ coins could do some random error, but not on retry
 				gpu_increment_reject(thr_id);
 				algo80_fails[algo80]++;
 				if (!warn) {
 					warn++;
 					pdata[19] = work->nonces[0] + 1;
 					continue;
 				} else {
 					if (!opt_quiet)	gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU! %s %s",
 						work->nonces[0], algo_strings[algo80], hashOrder);
 					warn = 0;
 				}
 			}
 		}
 		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
 			pdata[19] = max_nonce;
 			break;
 		}
 		pdata[19] += throughput;
 	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
 	*hashes_done = pdata[19] - first_nonce;
 	return 0;
 }
 // cleanup
 extern "C" void free_x16r(int thr_id)
 {
 	if (!init[thr_id])
 		return;
 	cudaThreadSynchronize();
 	cudaFree(d_hash[thr_id]);
 	quark_blake512_cpu_free(thr_id);
 	quark_groestl512_cpu_free(thr_id);
 	x11_simd512_cpu_free(thr_id);
 	x13_fugue512_cpu_free(thr_id);
 	x16_fugue512_cpu_free(thr_id); // to merge with x13_fugue512 ?
 	x15_whirlpool_cpu_free(thr_id);
 	cuda_check_cpu_free(thr_id);
 	cudaDeviceSynchronize();
 	init[thr_id] = false;
 }
--- a/x17/cuda_x17_sha512.cu
+++ b/x17/cuda_x17_sha512.cu
@ -169,3 +169,80 @@ void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce,
 	x17_sha512_gpu_hash_64 <<<grid, block>>> (threads, (uint64_t*)d_hash);
 }
 __constant__
 static uint64_t c_PaddedMessage80[10];
 __global__
 /*__launch_bounds__(256, 4)*/
 void x16_sha512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash)
 {
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint64_t W[80];
 		#pragma unroll
 		for (int i = 0; i < 9; i ++) {
 			W[i] = SWAP64(c_PaddedMessage80[i]);
 		}
 		const uint32_t nonce = startNonce + thread;
 		//((uint32_t*)W)[19] = cuda_swab32(nonce);
 		W[9] = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(nonce));
 		W[9] = cuda_swab64(W[9]);
 		W[10] = 0x8000000000000000;
 		#pragma unroll
 		for (int i = 11; i<15; i++) {
 			W[i] = 0U;
 		}
 		W[15] = 0x0000000000000280;
 		#pragma unroll 64
 		for (int i = 16; i < 80; i ++) {
 			W[i] = SSG5_1(W[i-2]) + W[i-7];
 			W[i] += SSG5_0(W[i-15]) + W[i-16];
 		}
 		const uint64_t IV512[8] = {
 			0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
 			0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
 			0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
 			0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
 		};
 		uint64_t r[8];
 		#pragma unroll
 		for (int i = 0; i < 8; i++) {
 			r[i] = IV512[i];
 		}
 		#pragma unroll
 		for (int i = 0; i < 80; i++) {
 			SHA3_STEP(c_WB, r, W, i&7, i);
 		}
 		const uint64_t hashPosition = thread;
 		uint64_t *pHash = &g_hash[hashPosition << 3];
 		#pragma unroll
 		for (int u = 0; u < 8; u ++) {
 			pHash[u] = SWAP64(r[u] + IV512[u]);
 		}
 	}
 }
 __host__
 void x16_sha512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash)
 {
 	const uint32_t threadsperblock = 256;
 	dim3 grid((threads + threadsperblock-1)/threadsperblock);
 	dim3 block(threadsperblock);
 	x16_sha512_gpu_hash_80 <<<grid, block >>> (threads, startNounce, (uint64_t*)d_hash);
 }
 __host__
 void x16_sha512_setBlock_80(void *pdata)
 {
 	cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
 }