Merge branch 'windows' of https://github.com/tpruvot/ccminer

7 years ago · ded6a391b2
28 changed files with 5229 additions and 486 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -69,13 +69,16 @@ ccminer_SOURCES	= elist.h miner.h compat.h \
				@@ -69,13 +69,16 @@ ccminer_SOURCES	= elist.h miner.h compat.h \
 			  lbry/lbry.cu lbry/cuda_sha256_lbry.cu lbry/cuda_sha512_lbry.cu lbry/cuda_lbry_merged.cu \
 			  qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \
 			  tribus/tribus.cu tribus/cuda_echo512_final.cu \
-			  x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
+			  x11/x11.cu x12/x12.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
 			  x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \
 			  x11/cuda_x11_luffa512_Cubehash.cu x11/x11evo.cu x11/timetravel.cu x11/bitcore.cu \
 			  x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \
 			  x13/hsr.cu x13/cuda_hsr_sm3.cu x13/sm3.c \
 			  x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \
 			  x15/whirlpool.cu x15/cuda_x15_whirlpool_sm3.cu \
+			  x16/x16r.cu x16/x16s.cu x16/cuda_x16_echo512.cu x16/cuda_x16_fugue512.cu \
+			  x16/cuda_x16_shabal512.cu x16/cuda_x16_simd512_80.cu \
+			  x16/cuda_x16_echo512_64.cu \
 			  x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \
 			  x11/phi.cu x11/cuda_streebog_maxwell.cu \
 			  x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu \
--- a/README.txt
+++ b/README.txt
@ -1,5 +1,5 @@
				@@ -1,5 +1,5 @@

-ccminer 2.2.4 (Jan. 2018)     "lyra2v2 and keccak improvements"
+ccminer 2.2.5 (Apr 2018)             "x12, x16r and x16s algos"
 ---------------------------------------------------------------

 ***************************************************************
@ -120,8 +120,12 @@ its command line interface and options.
				@@ -120,8 +120,12 @@ its command line interface and options.
                          tribus      use to mine Denarius
                          x11evo      use to mine Revolver
                          x11         use to mine DarkCoin
-                          x14         use to mine X14Coin
+                          x12         use to mine GalaxyCash
+                          x13         use to mine X13
+                          x14         use to mine X14
                          x15         use to mine Halcyon
+                          x16r        use to mine Raven
+                          x16s        use to mine Pigeon and Eden
                          x17         use to mine X17
                          vanilla     use to mine Vanilla (Blake256)
                          veltor      use to mine VeltorCoin
@ -277,7 +281,13 @@ so we can more efficiently implement new algorithms using the latest hardware
				@@ -277,7 +281,13 @@ so we can more efficiently implement new algorithms using the latest hardware
 features.

 >>> RELEASE HISTORY <<<
-  Jan. 04th 2017  v2.2.4
+  Apr. 02nd 2018  v2.2.5
+                  New x16r algo for Raven
+                  New x16s algo for Pigeon and Eden
+                  New x12 algo for Galaxycash
+                  Equihash (SIMT) sync issues for the Volta generation
+
+  Jan. 04th 2018  v2.2.4
                  Improve lyra2v2
                  Higher keccak default intensity
                  Drop SM 2.x support by default, for CUDA 9 and more recent
--- a/algos.h
+++ b/algos.h
@ -57,9 +57,12 @@ enum sha_algos {
				@@ -57,9 +57,12 @@ enum sha_algos {
 	ALGO_BITCORE,
 	ALGO_X11EVO,
 	ALGO_X11,
+	ALGO_X12,
 	ALGO_X13,
 	ALGO_X14,
 	ALGO_X15,
+	ALGO_X16R,
+	ALGO_X16S,
 	ALGO_X17,
 	ALGO_VANILLA,
 	ALGO_VELTOR,
@ -127,9 +130,12 @@ static const char *algo_names[] = {
				@@ -127,9 +130,12 @@ static const char *algo_names[] = {
 	"bitcore",
 	"x11evo",
 	"x11",
+	"x12",
 	"x13",
 	"x14",
 	"x15",
+	"x16r",
+	"x16s",
 	"x17",
 	"vanilla",
 	"veltor",
--- a/api.cpp
+++ b/api.cpp
@ -1252,7 +1252,7 @@ static void api()
				@@ -1252,7 +1252,7 @@ static void api()
 			char *wskey = NULL;
 			n = recv(c, &buf[0], SOCK_REC_BUFSZ, 0);

-			fail = SOCKETFAIL(n);
+			fail = SOCKETFAIL(n) || n < 0;
 			if (fail)
 				buf[0] = '\0';
 			else if (n > 0 && buf[n-1] == '\n') {
@ -1261,7 +1261,7 @@ static void api()
				@@ -1261,7 +1261,7 @@ static void api()
 				if (n > 0 && buf[n-1] == '\r')
 					buf[n-1] = '\0';
 			}
-			buf[n] = '\0';
+			else buf[n] = '\0';

 			//if (opt_debug && opt_protocol && n > 0)
 			//	applog(LOG_DEBUG, "API: recv command: (%d) '%s'+char(%x)", n, buf, buf[n-1]);
--- a/bench.cpp
+++ b/bench.cpp
@ -100,9 +100,12 @@ void algo_free_all(int thr_id)
				@@ -100,9 +100,12 @@ void algo_free_all(int thr_id)
 	free_wildkeccak(thr_id);
 	free_x11evo(thr_id);
 	free_x11(thr_id);
+	free_x12(thr_id);
 	free_x13(thr_id);
 	free_x14(thr_id);
 	free_x15(thr_id);
+	free_x16r(thr_id);
+	free_x16s(thr_id);
 	free_x17(thr_id);
 	free_zr5(thr_id);
 	free_scrypt(thr_id);
--- a/ccminer.cpp
+++ b/ccminer.cpp
@ -291,9 +291,12 @@ Options:\n\
				@@ -291,9 +291,12 @@ Options:\n\
 			whirlpool   Whirlpool algo\n\
 			x11evo      Permuted x11 (Revolver)\n\
 			x11         X11 (DarkCoin)\n\
+			x12         X12 (GalaxyCash)\n\
 			x13         X13 (MaruCoin)\n\
 			x14         X14\n\
 			x15         X15\n\
+			x16r        X16R (Raven)\n\
+			x16s        X16S\n\
 			x17         X17\n\
 			wildkeccak  Boolberry\n\
 			zr5         ZR5 (ZiftrCoin)\n\
@ -1714,6 +1717,8 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
				@@ -1714,6 +1717,8 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 		case ALGO_LYRA2Z:
 		case ALGO_TIMETRAVEL:
 		case ALGO_BITCORE:
+		case ALGO_X16R:
+		case ALGO_X16S:
 			work_set_target(work, sctx->job.diff / (256.0 * opt_difficulty));
 			break;
 		case ALGO_KECCAK:
@ -2253,6 +2258,7 @@ static void *miner_thread(void *userdata)
				@@ -2253,6 +2258,7 @@ static void *miner_thread(void *userdata)
 			case ALGO_BITCORE:
 			case ALGO_X11EVO:
 			case ALGO_X11:
+			case ALGO_X12:
 			case ALGO_X13:
 			case ALGO_WHIRLCOIN:
 			case ALGO_WHIRLPOOL:
@ -2503,6 +2509,9 @@ static void *miner_thread(void *userdata)
				@@ -2503,6 +2509,9 @@ static void *miner_thread(void *userdata)
 		case ALGO_X11:
 			rc = scanhash_x11(thr_id, &work, max_nonce, &hashes_done);
 			break;
+		case ALGO_X12:
+			rc = scanhash_x12(thr_id, &work, max_nonce, &hashes_done);
+			break;
 		case ALGO_X13:
 			rc = scanhash_x13(thr_id, &work, max_nonce, &hashes_done);
 			break;
@ -2512,6 +2521,12 @@ static void *miner_thread(void *userdata)
				@@ -2512,6 +2521,12 @@ static void *miner_thread(void *userdata)
 		case ALGO_X15:
 			rc = scanhash_x15(thr_id, &work, max_nonce, &hashes_done);
 			break;
+		case ALGO_X16R:
+			rc = scanhash_x16r(thr_id, &work, max_nonce, &hashes_done);
+			break;
+		case ALGO_X16S:
+			rc = scanhash_x16s(thr_id, &work, max_nonce, &hashes_done);
+			break;
 		case ALGO_X17:
 			rc = scanhash_x17(thr_id, &work, max_nonce, &hashes_done);
 			break;
@ -3159,6 +3174,7 @@ void parse_arg(int key, char *arg)
				@@ -3159,6 +3174,7 @@ void parse_arg(int key, char *arg)
 		if (v < 1 || v > 65535) // sanity check
 			show_usage_and_exit(1);
 		opt_api_mcast_port = v;
+		break;
 	case 'B':
 		opt_background = true;
 		break;
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -269,6 +269,7 @@
				@@ -269,6 +269,7 @@
    <ClCompile Include="neoscrypt\neoscrypt-cpu.c" />
    <ClInclude Include="neoscrypt\cuda_vectors.h" />
    <ClInclude Include="x11\cuda_x11_simd512_sm2.cuh" />
+    <ClInclude Include="x16\cuda_x16.h" />
    <CudaCompile Include="Algo256\bmw.cu" />
    <CudaCompile Include="Algo256\cuda_bmw.cu">
      <MaxRegCount>76</MaxRegCount>
@ -573,6 +574,7 @@
				@@ -573,6 +574,7 @@
    <CudaCompile Include="x11\veltor.cu" />
    <CudaCompile Include="x11\x11.cu" />
    <CudaCompile Include="x11\x11evo.cu" />
+    <CudaCompile Include="x12\x12.cu" />
    <CudaCompile Include="x13\cuda_x13_hamsi512.cu">
      <MaxRegCount>72</MaxRegCount>
    </CudaCompile>
@ -587,8 +589,17 @@
				@@ -587,8 +589,17 @@
    <CudaCompile Include="x17\hmq17.cu" />
    <CudaCompile Include="x15\x15.cu" />
    <CudaCompile Include="x15\whirlpool.cu" />
-    <CudaCompile Include="x17\x17.cu">
+    <CudaCompile Include="x15\cuda_x15_whirlpool_sm3.cu" />
+    <CudaCompile Include="x16\x16r.cu" />
+    <CudaCompile Include="x16\x16s.cu" />
+    <CudaCompile Include="x16\cuda_x16_echo512.cu" />
+    <CudaCompile Include="x16\cuda_x16_fugue512.cu" />
+    <CudaCompile Include="x16\cuda_x16_shabal512.cu" />
+    <CudaCompile Include="x16\cuda_x16_simd512_80.cu" />
+    <CudaCompile Include="x16\cuda_x16_echo512_64.cu">
+      <CodeGeneration>compute_50,sm_50;compute_52,sm_52</CodeGeneration>
    </CudaCompile>
+    <CudaCompile Include="x17\x17.cu" />
    <CudaCompile Include="x17\cuda_x17_haval256.cu">
    </CudaCompile>
    <CudaCompile Include="x17\cuda_x17_sha512.cu">
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@ -58,6 +58,9 @@
				@@ -58,6 +58,9 @@
    <Filter Include="Source Files\CUDA\x15">
      <UniqueIdentifier>{a2403c22-6777-46ab-a55a-3fcc7386c974}</UniqueIdentifier>
    </Filter>
+    <Filter Include="Source Files\CUDA\x16">
+      <UniqueIdentifier>{55dfae6a-66ba-43e2-8ceb-98ee70cbdf16}</UniqueIdentifier>
+    </Filter>
    <Filter Include="Source Files\CUDA\x17">
      <UniqueIdentifier>{85dfae6a-66ca-4332-8cec-98ee70cbdf2f}</UniqueIdentifier>
    </Filter>
@ -112,6 +115,9 @@
				@@ -112,6 +115,9 @@
    <Filter Include="Source Files\CUDA\tribus">
      <UniqueIdentifier>{1e548d79-c217-4203-989a-a592fe2b2de3}</UniqueIdentifier>
    </Filter>
+    <Filter Include="Source Files\CUDA\x12">
+      <UniqueIdentifier>{xde48d89-fx12-1323-129a-b592fe2b2de3}</UniqueIdentifier>
+    </Filter>
    <Filter Include="Source Files\CUDA\gost">
      <UniqueIdentifier>{6a99bc95-f402-465e-9e64-b042bd241bb7}</UniqueIdentifier>
 	</Filter>
@ -599,6 +605,9 @@
				@@ -599,6 +605,9 @@
    <ClInclude Include="equi\equihash.h">
      <Filter>Source Files\equi</Filter>
    </ClInclude>
+    <ClInclude Include="x16\cuda_x16.h">
+      <Filter>Header Files\CUDA</Filter>
+    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <CudaCompile Include="cuda.cpp">
@ -808,6 +817,9 @@
				@@ -808,6 +817,9 @@
    <CudaCompile Include="x11\s3.cu">
      <Filter>Source Files\CUDA\x11</Filter>
    </CudaCompile>
+    <CudaCompile Include="x12\x12.cu">
+      <Filter>Source Files\CUDA\x12</Filter>
+    </CudaCompile>
    <CudaCompile Include="x11\timetravel.cu">
      <Filter>Source Files\CUDA\x11</Filter>
    </CudaCompile>
@ -970,6 +982,30 @@
				@@ -970,6 +982,30 @@
    <CudaCompile Include="equi\cuda_equi.cu">
      <Filter>Source Files\equi</Filter>
    </CudaCompile>
+    <CudaCompile Include="x15\cuda_x15_whirlpool_sm3.cu">
+      <Filter>Source Files\CUDA\x15</Filter>
+    </CudaCompile>
+    <CudaCompile Include="x16\cuda_x16_echo512.cu">
+      <Filter>Source Files\CUDA\x16</Filter>
+    </CudaCompile>
+    <CudaCompile Include="x16\cuda_x16_echo512_64.cu">
+      <Filter>Source Files\CUDA\x16</Filter>
+    </CudaCompile>
+    <CudaCompile Include="x16\cuda_x16_fugue512.cu">
+      <Filter>Source Files\CUDA\x16</Filter>
+    </CudaCompile>
+    <CudaCompile Include="x16\cuda_x16_shabal512.cu">
+      <Filter>Source Files\CUDA\x16</Filter>
+    </CudaCompile>
+    <CudaCompile Include="x16\cuda_x16_simd512_80.cu">
+      <Filter>Source Files\CUDA\x16</Filter>
+    </CudaCompile>
+    <CudaCompile Include="x16\x16r.cu">
+      <Filter>Source Files\CUDA\x16</Filter>
+    </CudaCompile>
+    <CudaCompile Include="x16\x16s.cu">
+      <Filter>Source Files\CUDA\x16</Filter>
+    </CudaCompile>
    <CudaCompile Include="gost\cuda_gosthash.cu">
      <Filter>Source Files\CUDA\gost</Filter>
    </CudaCompile>
--- a/compat/ccminer-config.h
+++ b/compat/ccminer-config.h
@ -164,7 +164,7 @@
				@@ -164,7 +164,7 @@
 #define PACKAGE_URL "http://github.com/tpruvot/ccminer"

 /* Define to the version of this package. */
-#define PACKAGE_VERSION "2.2.4"
+#define PACKAGE_VERSION "2.2.5"

 /* If using the C implementation of alloca, define if you know the
   direction of stack growth for your system; otherwise it will be
--- a/equi/cuda_equi.cu
+++ b/equi/cuda_equi.cu
@ -80,6 +80,8 @@ u32 umin(const u32, const u32);
				@@ -80,6 +80,8 @@ u32 umin(const u32, const u32);
 u32 umax(const u32, const u32);
 #endif

+#define OPT_SYNC_ALL
+
 #if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
 #define __shfl2(var, srcLane)  __shfl_sync(0xFFFFFFFFu, var, srcLane)
 #undef __any
@ -514,10 +516,11 @@ __global__ void digit_1(equi<RB, SM>* eq)
				@@ -514,10 +516,11 @@ __global__ void digit_1(equi<RB, SM>* eq)

 	u32 si[2];

+#ifdef OPT_SYNC_ALL
 	// enable this to make fully safe shared mem operations;
 	// disabled gains some speed, but can rarely cause a crash
-	//__syncthreads();
-
+	__syncthreads();
+#endif
 	#pragma unroll
 	for (u32 i = 0; i != 2; ++i)
 	{
@ -654,11 +657,9 @@ __global__ void digit_2(equi<RB, SM>* eq)
				@@ -654,11 +657,9 @@ __global__ void digit_2(equi<RB, SM>* eq)
 	uint4 tt[2];

 	u32 si[2];
-
-	// enable this to make fully safe shared mem operations;
-	// disabled gains some speed, but can rarely cause a crash
-	//__syncthreads();
-
+#ifdef OPT_SYNC_ALL
+	__syncthreads();
+#endif
 	#pragma unroll 2
 	for (u32 i = 0; i < 2; i++)
 	{
@ -785,9 +786,9 @@ __global__ void digit_3(equi<RB, SM>* eq)
				@@ -785,9 +786,9 @@ __global__ void digit_3(equi<RB, SM>* eq)
 	uint4 tt[2];
 	u32 ta[2];

-	// enable this to make fully safe shared mem operations;
-	// disabled gains some speed, but can rarely cause a crash
-	//__syncthreads();
+#ifdef OPT_SYNC_ALL
+	__syncthreads();
+#endif

 	#pragma unroll 2
 	for (u32 i = 0; i < 2; i++)
@ -919,11 +920,9 @@ __global__ void digit_4(equi<RB, SM>* eq)
				@@ -919,11 +920,9 @@ __global__ void digit_4(equi<RB, SM>* eq)

 	u32 si[2];
 	uint4 tt[2];
-
-	// enable this to make fully safe shared mem operations;
-	// disabled gains some speed, but can rarely cause a crash
-	//__syncthreads();
-
+#ifdef OPT_SYNC_ALL
+	__syncthreads();
+#endif
 	#pragma unroll 2
 	for (u32 i = 0; i < 2; i++)
 	{
@ -1035,11 +1034,9 @@ __global__ void digit_5(equi<RB, SM>* eq)
				@@ -1035,11 +1034,9 @@ __global__ void digit_5(equi<RB, SM>* eq)

 	u32 si[2];
 	uint4 tt[2];
-
-	// enable this to make fully safe shared mem operations;
-	// disabled gains some speed, but can rarely cause a crash
-	//__syncthreads();
-
+#ifdef OPT_SYNC_ALL
+	__syncthreads();
+#endif
 	#pragma unroll 2
 	for (u32 i = 0; i < 2; i++)
 	{
--- a/miner.h
+++ b/miner.h
@ -324,9 +324,12 @@ extern int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, uns
				@@ -324,9 +324,12 @@ extern int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, uns
 extern int scanhash_wildkeccak(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_x11evo(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
+extern int scanhash_x12(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_x14(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_x15(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
+extern int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
+extern int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_zr5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);

@ -389,9 +392,12 @@ extern void free_whirl(int thr_id);
				@@ -389,9 +392,12 @@ extern void free_whirl(int thr_id);
 extern void free_wildkeccak(int thr_id);
 extern void free_x11evo(int thr_id);
 extern void free_x11(int thr_id);
+extern void free_x12(int thr_id);
 extern void free_x13(int thr_id);
 extern void free_x14(int thr_id);
 extern void free_x15(int thr_id);
+extern void free_x16r(int thr_id);
+extern void free_x16s(int thr_id);
 extern void free_x17(int thr_id);
 extern void free_zr5(int thr_id);
 //extern void free_sha256d(int thr_id);
@ -934,9 +940,12 @@ void wcoinhash(void *state, const void *input);
				@@ -934,9 +940,12 @@ void wcoinhash(void *state, const void *input);
 void whirlxHash(void *state, const void *input);
 void x11evo_hash(void *output, const void *input);
 void x11hash(void *output, const void *input);
+void x12hash(void *output, const void *input);
 void x13hash(void *output, const void *input);
 void x14hash(void *output, const void *input);
 void x15hash(void *output, const void *input);
+void x16r_hash(void *output, const void *input);
+void x16s_hash(void *output, const void *input);
 void x17hash(void *output, const void *input);
 void wildkeccak_hash(void *output, const void *input, uint64_t* scratchpad, uint64_t ssize);
 void zr5hash(void *output, const void *input);
--- a/neoscrypt/cuda_neoscrypt.cu
+++ b/neoscrypt/cuda_neoscrypt.cu
@ -1319,7 +1319,6 @@ static void Blake2Shost(uint32_t * inout, const uint32_t * inkey)
				@@ -1319,7 +1319,6 @@ static void Blake2Shost(uint32_t * inout, const uint32_t * inkey)
 }


-#define SHIFT 128U
 #define TPB 32
 #define TPB2 64

@ -1346,7 +1345,7 @@ __launch_bounds__(TPB, 1)
				@@ -1346,7 +1345,7 @@ __launch_bounds__(TPB, 1)
 void neoscrypt_gpu_hash_chacha1()
 {
 	const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y);
-	const uint32_t shift = SHIFT * 8U * (thread & 8191);
+	const uint32_t threads = (gridDim.x * blockDim.y);
 	const uint32_t shiftTr = 8U * thread;

 	uint4 X[4];
@ -1361,7 +1360,7 @@ void neoscrypt_gpu_hash_chacha1()
				@@ -1361,7 +1360,7 @@ void neoscrypt_gpu_hash_chacha1()
 	#pragma nounroll
 	for (int i = 0; i < 128; i++)
 	{
-		uint32_t offset = shift + i * 8U;
+		uint32_t offset = 8U * (thread + threads * i);
 		for (int j = 0; j < 4; j++)
 			((uint4*)(W + offset))[j * 4 + threadIdx.x] = X[j];
 		neoscrypt_chacha(X);
@ -1370,7 +1369,7 @@ void neoscrypt_gpu_hash_chacha1()
				@@ -1370,7 +1369,7 @@ void neoscrypt_gpu_hash_chacha1()
 	#pragma nounroll
 	for (int t = 0; t < 128; t++)
 	{
-		uint32_t offset = shift + (WarpShuffle(X[3].x, 0, 4) & 0x7F) * 8U;
+		uint32_t offset = 8U * (thread + threads * (WarpShuffle(X[3].x, 0, 4) & 0x7F));
 		for (int j = 0; j < 4; j++)
 			X[j] ^= ((uint4*)(W + offset))[j * 4 + threadIdx.x];
 		neoscrypt_chacha(X);
@ -1391,7 +1390,7 @@ __launch_bounds__(TPB, 1)
				@@ -1391,7 +1390,7 @@ __launch_bounds__(TPB, 1)
 void neoscrypt_gpu_hash_salsa1()
 {
 	const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y);
-	const uint32_t shift = SHIFT * 8U * (thread & 8191);
+	const uint32_t threads = (gridDim.x * blockDim.y);
 	const uint32_t shiftTr = 8U * thread;

 	uint4 Z[4];
@ -1406,7 +1405,7 @@ void neoscrypt_gpu_hash_salsa1()
				@@ -1406,7 +1405,7 @@ void neoscrypt_gpu_hash_salsa1()
 	#pragma nounroll
 	for (int i = 0; i < 128; i++)
 	{
-		uint32_t offset = shift + i * 8U;
+		uint32_t offset = 8U * (thread + threads * i);
 		for (int j = 0; j < 4; j++)
 			((uint4*)(W + offset))[j * 4 + threadIdx.x] = Z[j];
 		neoscrypt_salsa(Z);
@ -1415,7 +1414,7 @@ void neoscrypt_gpu_hash_salsa1()
				@@ -1415,7 +1414,7 @@ void neoscrypt_gpu_hash_salsa1()
 	#pragma nounroll
 	for (int t = 0; t < 128; t++)
 	{
-		uint32_t offset = shift + (WarpShuffle(Z[3].x, 0, 4) & 0x7F) * 8U;
+		uint32_t offset = 8U * (thread + threads * (WarpShuffle(Z[3].x, 0, 4) & 0x7F));
 		for (int j = 0; j < 4; j++)
 			Z[j] ^= ((uint4*)(W + offset))[j * 4 + threadIdx.x];
 		neoscrypt_salsa(Z);
@ -1474,7 +1473,7 @@ void neoscrypt_init(int thr_id, uint32_t threads)
				@@ -1474,7 +1473,7 @@ void neoscrypt_init(int thr_id, uint32_t threads)
 	cuda_get_arch(thr_id);

 	CUDA_SAFE_CALL(cudaMalloc(&d_NNonce[thr_id], 2 * sizeof(uint32_t)));
-	CUDA_SAFE_CALL(cudaMalloc(&hash1, 32 * 128 * sizeof(uint64_t) * min(8192, threads)));
+	CUDA_SAFE_CALL(cudaMalloc(&hash1, 32 * 128 * sizeof(uint64_t) * threads));
 	CUDA_SAFE_CALL(cudaMalloc(&Trans1, 32 * sizeof(uint64_t) * threads));
 	CUDA_SAFE_CALL(cudaMalloc(&Trans2, 32 * sizeof(uint64_t) * threads));
 	CUDA_SAFE_CALL(cudaMalloc(&Trans3, 32 * sizeof(uint64_t) * threads));
--- a/neoscrypt/neoscrypt.cpp
+++ b/neoscrypt/neoscrypt.cpp
@ -22,6 +22,7 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign
				@@ -22,6 +22,7 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign
 	int dev_id = device_map[thr_id];
 	int intensity = is_windows() ? 18 : 19;
 	if (strstr(device_name[dev_id], "GTX 10")) intensity = 21; // >= 20 need more than 2GB
+	if (strstr(device_name[dev_id], "TITAN")) intensity = 21;

 	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
 	throughput = throughput / 32; /* set for max intensity ~= 20 */
--- a/res/ccminer.rc
+++ b/res/ccminer.rc
@ -13,7 +13,7 @@
				@@ -13,7 +13,7 @@
 #undef APSTUDIO_READONLY_SYMBOLS

 /////////////////////////////////////////////////////////////////////////////
-// Anglais (États-Unis) resources
+// English (United States) resources

 #if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_ENU)
 LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US
@ -60,8 +60,8 @@ IDI_ICON1               ICON                    "ccminer.ico"
				@@ -60,8 +60,8 @@ IDI_ICON1               ICON                    "ccminer.ico"
 //

 VS_VERSION_INFO VERSIONINFO
- FILEVERSION 2,2,4,0
- PRODUCTVERSION 2,2,4,0
+ FILEVERSION 2,2,5,0
+ PRODUCTVERSION 2,2,5,0
 FILEFLAGSMASK 0x3fL
 #ifdef _DEBUG
 FILEFLAGS 0x21L
@ -76,10 +76,10 @@ BEGIN
				@@ -76,10 +76,10 @@ BEGIN
    BEGIN
        BLOCK "040904e4"
        BEGIN
-            VALUE "FileVersion", "2.2.4"
+            VALUE "FileVersion", "2.2.5"
            VALUE "LegalCopyright", "Copyright (C) 2018"
            VALUE "ProductName", "ccminer"
-            VALUE "ProductVersion", "2.2.4"
+            VALUE "ProductVersion", "2.2.5"
        END
    END
    BLOCK "VarFileInfo"
@ -88,7 +88,7 @@ BEGIN
				@@ -88,7 +88,7 @@ BEGIN
    END
 END

-#endif    // Anglais (États-Unis) resources
+#endif    // English (United States) resources
 /////////////////////////////////////////////////////////////////////////////


--- a/scrypt/salsa_kernel.cu
+++ b/scrypt/salsa_kernel.cu
@ -240,8 +240,9 @@ inline int _ConvertSMVer2Cores(int major, int minor)
				@@ -240,8 +240,9 @@ inline int _ConvertSMVer2Cores(int major, int minor)
 		{ 0x21, 48  }, // Fermi Generation (SM 2.1) GF10x class
 		{ 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class - GK104 = 1536 cores / 8 SMs
 		{ 0x35, 192 }, // Kepler Generation (SM 3.5) GK11x class
-		{ 0x50, 128 }, // Maxwell Generation (SM 5.0) GTX750/750Ti
+		{ 0x50, 128 }, // Maxwell First Generation (SM 5.0) GTX750/750Ti
 		{ 0x52, 128 }, // Maxwell Second Generation (SM 5.2) GTX980 = 2048 cores / 16 SMs - GTX970 1664 cores / 13 SMs
+		{ 0x61, 128 }, // Pascal GeForce (SM 6.1)
 		{ -1, -1 },
 	};

--- a/util.cpp
+++ b/util.cpp
@ -1354,7 +1354,11 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p
				@@ -1354,7 +1354,11 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p

 	if (!res_val || json_is_false(res_val) ||
 	    (err_val && !json_is_null(err_val)))  {
-		applog(LOG_ERR, "Stratum authentication failed");
+		if (err_val && json_is_array(err_val)) {
+			const char* reason = json_string_value(json_array_get(err_val, 1));
+			applog(LOG_ERR, "Stratum authentication failed (%s)", reason);
+		}
+		else applog(LOG_ERR, "Stratum authentication failed");
 		goto out;
 	}

@ -2313,19 +2317,28 @@ void print_hash_tests(void)
				@@ -2313,19 +2317,28 @@ void print_hash_tests(void)
 	printpfx("x11evo", hash);

 	x11hash(&hash[0], &buf[0]);
-	printpfx("X11", hash);
+	printpfx("x11", hash);
+
+	x12hash(&hash[0], &buf[0]);
+	printpfx("x12", hash);

 	x13hash(&hash[0], &buf[0]);
-	printpfx("X13", hash);
+	printpfx("x13", hash);

 	x14hash(&hash[0], &buf[0]);
-	printpfx("X14", hash);
+	printpfx("x14", hash);

 	x15hash(&hash[0], &buf[0]);
-	printpfx("X15", hash);
+	printpfx("x15", hash);
+
+	x16r_hash(&hash[0], &buf[0]);
+	printpfx("x16r", hash);
+
+	x16s_hash(&hash[0], &buf[0]);
+	printpfx("x16s", hash);

 	x17hash(&hash[0], &buf[0]);
-	printpfx("X17", hash);
+	printpfx("x17", hash);

 	//memcpy(buf, zrtest, 80);
 	zr5hash(&hash[0], &buf[0]);
--- a/x12/x12.cu
+++ b/x12/x12.cu
@ -0,0 +1,236 @@
				@@ -0,0 +1,236 @@
+/*
+ * X12 algorithm
+ */
+extern "C" {
+#include "sph/sph_blake.h"
+#include "sph/sph_bmw.h"
+#include "sph/sph_luffa.h"
+#include "sph/sph_cubehash.h"
+#include "sph/sph_shavite.h"
+#include "sph/sph_simd.h"
+#include "sph/sph_echo.h"
+#include "sph/sph_groestl.h"
+#include "sph/sph_skein.h"
+#include "sph/sph_jh.h"
+#include "sph/sph_keccak.h"
+#include "sph/sph_hamsi.h"
+}
+#include "miner.h"
+
+#include "cuda_helper.h"
+#include "x11/cuda_x11.h"
+
+static uint32_t *d_hash[MAX_GPUS];
+
+extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads);
+extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+// X12 CPU Hash
+extern "C" void x12hash(void *output, const void *input)
+{
+	sph_blake512_context    ctx_blake;
+	sph_bmw512_context      ctx_bmw;
+	sph_luffa512_context    ctx_luffa;
+	sph_cubehash512_context ctx_cubehash;
+	sph_shavite512_context  ctx_shavite;
+	sph_simd512_context     ctx_simd;
+	sph_echo512_context     ctx_echo;
+	sph_groestl512_context  ctx_groestl;
+	sph_skein512_context    ctx_skein;
+	sph_jh512_context       ctx_jh;
+	sph_keccak512_context   ctx_keccak;
+	sph_hamsi512_context    ctx_hamsi;
+
+	uint32_t hash[32];
+	memset(hash, 0, sizeof hash);
+
+	sph_blake512_init(&ctx_blake);
+	sph_blake512 (&ctx_blake, input, 80);
+	sph_blake512_close(&ctx_blake, (void*) hash);
+
+	sph_bmw512_init(&ctx_bmw);
+	sph_bmw512(&ctx_bmw, (const void*) hash, 64);
+	sph_bmw512_close(&ctx_bmw, (void*) hash);
+
+	sph_luffa512_init(&ctx_luffa);
+	sph_luffa512(&ctx_luffa, (const void*)hash, 64);
+	sph_luffa512_close(&ctx_luffa, (void*)hash);
+
+	sph_cubehash512_init(&ctx_cubehash);
+	sph_cubehash512(&ctx_cubehash, (const void*)hash, 64);
+	sph_cubehash512_close(&ctx_cubehash, (void*)hash);
+
+	sph_shavite512_init(&ctx_shavite);
+	sph_shavite512(&ctx_shavite, (const void*)hash, 64);
+	sph_shavite512_close(&ctx_shavite, (void*)hash);
+
+	sph_simd512_init(&ctx_simd);
+	sph_simd512(&ctx_simd, (const void*)hash, 64);
+	sph_simd512_close(&ctx_simd, (void*)hash);
+
+	sph_echo512_init(&ctx_echo);
+	sph_echo512(&ctx_echo, (const void*)hash, 64);
+	sph_echo512_close(&ctx_echo, (void*)hash);
+
+	sph_groestl512_init(&ctx_groestl);
+	sph_groestl512(&ctx_groestl, (const void*) hash, 64);
+	sph_groestl512_close(&ctx_groestl, (void*) hash);
+
+	sph_skein512_init(&ctx_skein);
+	sph_skein512(&ctx_skein, (const void*) hash, 64);
+	sph_skein512_close(&ctx_skein, (void*) hash);
+
+	sph_jh512_init(&ctx_jh);
+	sph_jh512(&ctx_jh, (const void*) hash, 64);
+	sph_jh512_close(&ctx_jh, (void*) hash);
+
+	sph_keccak512_init(&ctx_keccak);
+	sph_keccak512(&ctx_keccak, (const void*) hash, 64);
+	sph_keccak512_close(&ctx_keccak, (void*) hash);
+
+	sph_hamsi512_init(&ctx_hamsi);
+	sph_hamsi512(&ctx_hamsi, (const void*) hash, 64);
+	sph_hamsi512_close(&ctx_hamsi, (void*) hash);
+
+	memcpy(output, hash, 32);
+}
+
+static bool init[MAX_GPUS] = { 0 };
+
+extern "C" int scanhash_x12(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+{
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	const uint32_t first_nonce = pdata[19];
+	int intensity = (device_sm[device_map[thr_id]] > 500 && !is_windows()) ? 20 : 19;
+	uint32_t throughput =  cuda_default_throughput(thr_id, 1 << intensity); // 19=256*256*8;
+	//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
+
+	if (opt_benchmark)
+		((uint32_t*)ptarget)[7] = 0x000f;
+
+	if (!init[thr_id])
+	{
+		cudaSetDevice(device_map[thr_id]);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+			CUDA_LOG_ERROR();
+		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
+
+		quark_blake512_cpu_init(thr_id, throughput);
+		x11_luffaCubehash512_cpu_init(thr_id, throughput);
+		x11_shavite512_cpu_init(thr_id, throughput);
+		if (x11_simd512_cpu_init(thr_id, throughput) != 0) {
+			return 0;
+		}
+		x11_echo512_cpu_init(thr_id, throughput);
+		quark_groestl512_cpu_init(thr_id, throughput);
+		quark_skein512_cpu_init(thr_id, throughput);
+		quark_bmw512_cpu_init(thr_id, throughput);
+		quark_keccak512_cpu_init(thr_id, throughput);
+		quark_jh512_cpu_init(thr_id, throughput);
+		x13_hamsi512_cpu_init(thr_id, throughput);
+
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0);
+
+		cuda_check_cpu_init(thr_id, throughput);
+
+		init[thr_id] = true;
+	}
+
+	uint32_t endiandata[20];
+	for (int k=0; k < 20; k++)
+		be32enc(&endiandata[k], pdata[k]);
+
+	quark_blake512_cpu_setBlock_80(thr_id, endiandata);
+	cuda_check_cpu_setTarget(ptarget);
+
+	do {
+		int order = 0;
+
+		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+		quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
+		x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+		x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+
+		*hashes_done = pdata[19] - first_nonce + throughput;
+
+		CUDA_LOG_ERROR();
+
+		work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+		if (work->nonces[0] != UINT32_MAX)
+		{
+			const uint32_t Htarg = ptarget[7];
+			uint32_t _ALIGN(64) vhash[8];
+			be32enc(&endiandata[19], work->nonces[0]);
+			x12hash(vhash, endiandata);
+
+			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				work_set_target_ratio(work, vhash);
+				if (work->nonces[1] != 0) {
+					be32enc(&endiandata[19], work->nonces[1]);
+					x12hash(vhash, endiandata);
+					bn_set_target_ratio(work, vhash, 1);
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
+				}
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > Htarg) {
+				gpu_increment_reject(thr_id);
+				if (!opt_quiet)
+					gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
+				pdata[19] = work->nonces[0] + 1;
+				continue;
+			}
+		}
+
+		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
+		pdata[19] += throughput;
+
+	} while (!work_restart[thr_id].restart);
+
+	*hashes_done = pdata[19] - first_nonce;
+
+	CUDA_LOG_ERROR();
+
+	return 0;
+}
+
+// cleanup
+extern "C" void free_x12(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaThreadSynchronize();
+
+	cudaFree(d_hash[thr_id]);
+
+	quark_blake512_cpu_free(thr_id);
+	quark_groestl512_cpu_free(thr_id);
+	x11_simd512_cpu_free(thr_id);
+
+	cuda_check_cpu_free(thr_id);
+	CUDA_LOG_ERROR();
+
+	cudaDeviceSynchronize();
+	init[thr_id] = false;
+}
--- a/x13/cuda_x13_hamsi512.cu
+++ b/x13/cuda_x13_hamsi512.cu
@ -1,6 +1,6 @@
				@@ -1,6 +1,6 @@
 /*
- * Quick Hamsi-512 for X13
- * by tsiv - 2014
+ * Quick Hamsi-512 for X13 by tsiv - 2014
+ * + Hamsi-512 80 by tpruvot - 2018
 */

 #include <stdio.h>
@ -16,31 +16,17 @@ static __constant__ uint32_t d_alpha_f[32];
				@@ -16,31 +16,17 @@ static __constant__ uint32_t d_alpha_f[32];
 static __constant__ uint32_t d_T512[64][16];

 static const uint32_t alpha_n[] = {
-	SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
-	SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00),
-	SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc),
-	SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0),
-	SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0),
-	SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0),
-	SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00),
-	SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc),
-	SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0),
-	SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0),
-	SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0)
+	0xff00f0f0, 0xccccaaaa, 0xf0f0cccc, 0xff00aaaa, 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, 0xf0f0ff00,
+	0xf0f0cccc, 0xaaaaff00, 0xccccff00, 0xaaaaf0f0, 0xaaaaf0f0, 0xff00cccc, 0xccccf0f0, 0xff00aaaa,
+	0xccccaaaa, 0xff00f0f0, 0xff00aaaa, 0xf0f0cccc, 0xf0f0ff00, 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc,
+	0xaaaaff00, 0xf0f0cccc, 0xaaaaf0f0, 0xccccff00, 0xff00cccc, 0xaaaaf0f0, 0xff00aaaa, 0xccccf0f0
 };

 static const uint32_t alpha_f[] = {
-	SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0),
-	SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9),
-	SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0),
-	SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c),
-	SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c),
-	SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c),
-	SPH_C32(0xcaf9f9c0), SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9),
-	SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0),
-	SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c),
-	SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c),
-	SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c)
+	0xcaf9639c, 0x0ff0f9c0, 0x639c0ff0, 0xcaf9f9c0, 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0, 0x639ccaf9,
+	0x639c0ff0, 0xf9c0caf9, 0x0ff0caf9, 0xf9c0639c, 0xf9c0639c, 0xcaf90ff0, 0x0ff0639c, 0xcaf9f9c0,
+	0x0ff0f9c0, 0xcaf9639c, 0xcaf9f9c0, 0x639c0ff0, 0x639ccaf9, 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0,
+	0xf9c0caf9, 0x639c0ff0, 0xf9c0639c, 0x0ff0caf9, 0xcaf90ff0, 0xf9c0639c, 0xcaf9f9c0, 0x0ff0639c
 };

 #define hamsi_s00   m0
@ -200,390 +186,134 @@ static const uint32_t alpha_f[] = {
				@@ -200,390 +186,134 @@ static const uint32_t alpha_f[] = {


 static const uint32_t T512[64][16] = {
-	{ SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000),
-	  SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9),
-	  SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030),
-	  SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000),
-	  SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984),
-	  SPH_C32(0x9e69af68) },
-	{ SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000),
-	  SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137),
-	  SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240),
-	  SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000),
-	  SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5),
-	  SPH_C32(0x0c26f262) },
-	{ SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000),
-	  SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78),
-	  SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400),
-	  SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000),
-	  SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f),
-	  SPH_C32(0xdc24e61f) },
-	{ SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000),
-	  SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549),
-	  SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800),
-	  SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000),
-	  SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f),
-	  SPH_C32(0x3daac2da) },
-	{ SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000),
-	  SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1),
-	  SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800),
-	  SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000),
-	  SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da),
-	  SPH_C32(0x78cace29) },
-	{ SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000),
-	  SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4),
-	  SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400),
-	  SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000),
-	  SPH_C32(0xea982435), SPH_C32(0x75b11115), SPH_C32(0x28b67247),
-	  SPH_C32(0x2dd1f9ab) },
-	{ SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000),
-	  SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745),
-	  SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00),
-	  SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000),
-	  SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f),
-	  SPH_C32(0xbf2c0be2) },
-	{ SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000),
-	  SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93),
-	  SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000),
-	  SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000),
-	  SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36),
-	  SPH_C32(0x32219526) },
-	{ SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000),
-	  SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae),
-	  SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001),
-	  SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000),
-	  SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f),
-	  SPH_C32(0xac8e6c88) },
-	{ SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000),
-	  SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3),
-	  SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004),
-	  SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000),
-	  SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96),
-	  SPH_C32(0x7b1bd6b9) },
-	{ SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000),
-	  SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba),
-	  SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000),
-	  SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000),
-	  SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604),
-	  SPH_C32(0xf746c320) },
-	{ SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000),
-	  SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c),
-	  SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009),
-	  SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000),
-	  SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a),
-	  SPH_C32(0x69505b3a) },
-	{ SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000),
-	  SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25),
-	  SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050),
-	  SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000),
-	  SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2),
-	  SPH_C32(0x8a341574) },
-	{ SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000),
-	  SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19),
-	  SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0),
-	  SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000),
-	  SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc),
-	  SPH_C32(0x450360bf) },
-	{ SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000),
-	  SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543),
-	  SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060),
-	  SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000),
-	  SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d),
-	  SPH_C32(0xf3d45758) },
-	{ SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000),
-	  SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825),
-	  SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110),
-	  SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000),
-	  SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25),
-	  SPH_C32(0x925c44e9) },
-	{ SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000),
-	  SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514),
-	  SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000),
-	  SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000),
-	  SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315),
-	  SPH_C32(0xa123ff9f) },
-	{ SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000),
-	  SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860),
-	  SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000),
-	  SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000),
-	  SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e),
-	  SPH_C32(0x1568ff0f) },
-	{ SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000),
-	  SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6),
-	  SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000),
-	  SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000),
-	  SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616),
-	  SPH_C32(0xc5c1eb3e) },
-	{ SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000),
-	  SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145),
-	  SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000),
-	  SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000),
-	  SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6),
-	  SPH_C32(0x1af21fe1) },
-	{ SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000),
-	  SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae),
-	  SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000),
-	  SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000),
-	  SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17),
-	  SPH_C32(0x857f3c2b) },
-	{ SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000),
-	  SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2),
-	  SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000),
-	  SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000),
-	  SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94),
-	  SPH_C32(0x2ba05a55) },
-	{ SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000),
-	  SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757),
-	  SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001),
-	  SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000),
-	  SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba),
-	  SPH_C32(0xfeabf254) },
-	{ SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000),
-	  SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5),
-	  SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002),
-	  SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000),
-	  SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7),
-	  SPH_C32(0xfe1cdc7f) },
-	{ SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000),
-	  SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea),
-	  SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000),
-	  SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000),
-	  SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea),
-	  SPH_C32(0xb0a51834) },
-	{ SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000),
-	  SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460),
-	  SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000),
-	  SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000),
-	  SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae),
-	  SPH_C32(0xa6b8c28d) },
-	{ SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000),
-	  SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75),
-	  SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000),
-	  SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000),
-	  SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156),
-	  SPH_C32(0x3a4e99d7) },
-	{ SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000),
-	  SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15),
-	  SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000),
-	  SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000),
-	  SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6),
-	  SPH_C32(0xe1844257) },
-	{ SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000),
-	  SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512),
-	  SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000),
-	  SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000),
-	  SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37),
-	  SPH_C32(0x2c3b504e) },
-	{ SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000),
-	  SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856),
-	  SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000),
-	  SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000),
-	  SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4),
-	  SPH_C32(0x524a0d59) },
-	{ SPH_C32(0x69510000), SPH_C32(0xd4e1009c), SPH_C32(0xc3230000),
-	  SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc),
-	  SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000),
-	  SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000),
-	  SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88),
-	  SPH_C32(0x378dd173) },
-	{ SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000),
-	  SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f),
-	  SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000),
-	  SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000),
-	  SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4),
-	  SPH_C32(0x8b6c72bd) },
-	{ SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780),
-	  SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418),
-	  SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000),
-	  SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000),
-	  SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d),
-	  SPH_C32(0x8e67b7fa) },
-	{ SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280),
-	  SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc),
-	  SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000),
-	  SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000),
-	  SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec),
-	  SPH_C32(0x443d3004) },
-	{ SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80),
-	  SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924),
-	  SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000),
-	  SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000),
-	  SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a),
-	  SPH_C32(0xf4f6ea7b) },
-	{ SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300),
-	  SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667),
-	  SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000),
-	  SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000),
-	  SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8),
-	  SPH_C32(0x979961d0) },
-	{ SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380),
-	  SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6),
-	  SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000),
-	  SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000),
-	  SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812),
-	  SPH_C32(0x98aa496e) },
-	{ SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180),
-	  SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e),
-	  SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000),
-	  SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000),
-	  SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec),
-	  SPH_C32(0x094e3198) },
-	{ SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000),
-	  SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736),
-	  SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000),
-	  SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000),
-	  SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76),
-	  SPH_C32(0xe86cba2e) },
-	{ SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000),
-	  SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431),
-	  SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000),
-	  SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000),
-	  SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9),
-	  SPH_C32(0x4b7eec55) },
-	{ SPH_C32(0x58430000), SPH_C32(0x807e0000), SPH_C32(0x78330001),
-	  SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd),
-	  SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000),
-	  SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800),
-	  SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429),
-	  SPH_C32(0x1e7536a6) },
-	{ SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000),
-	  SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0),
-	  SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000),
-	  SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000),
-	  SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46),
-	  SPH_C32(0x24314f17) },
-	{ SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e),
-	  SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d),
-	  SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000),
-	  SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000),
-	  SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222),
-	  SPH_C32(0x9075b1ce) },
-	{ SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a),
-	  SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4),
-	  SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000),
-	  SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000),
-	  SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa),
-	  SPH_C32(0x9b6ef888) },
-	{ SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e),
-	  SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167),
-	  SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000),
-	  SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000),
-	  SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e),
-	  SPH_C32(0xd8b61463) },
-	{ SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c),
-	  SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43),
-	  SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000),
-	  SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000),
-	  SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2),
-	  SPH_C32(0x3ea660f7) },
-	{ SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e),
-	  SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce),
-	  SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000),
-	  SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000),
-	  SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018),
-	  SPH_C32(0x7f975691) },
-	{ SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006),
-	  SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9),
-	  SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000),
-	  SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000),
-	  SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd),
-	  SPH_C32(0x2c94459e) },
-	{ SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000),
-	  SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da),
-	  SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000),
-	  SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0),
-	  SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c),
-	  SPH_C32(0x56a7b19f) },
-	{ SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000),
-	  SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699),
-	  SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000),
-	  SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220),
-	  SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8),
-	  SPH_C32(0x81fdf908) },
-	{ SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000),
-	  SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d),
-	  SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000),
-	  SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060),
-	  SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06),
-	  SPH_C32(0x5bd61539) },
-	{ SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000),
-	  SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751),
-	  SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000),
-	  SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480),
-	  SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f),
-	  SPH_C32(0x15b961e7) },
-	{ SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000),
-	  SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6),
-	  SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000),
-	  SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800),
-	  SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14),
-	  SPH_C32(0x2a2c18f0) },
-	{ SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000),
-	  SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a),
-	  SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000),
-	  SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000),
-	  SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23),
-	  SPH_C32(0x551e3d6e) },
-	{ SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000),
-	  SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da),
-	  SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000),
-	  SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000),
-	  SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254),
-	  SPH_C32(0x33c5244f) },
-	{ SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000),
-	  SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199),
-	  SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000),
-	  SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800),
-	  SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c),
-	  SPH_C32(0x8a58e6a4) },
-	{ SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000),
-	  SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f),
-	  SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000),
-	  SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002),
-	  SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808),
-	  SPH_C32(0xda878000) },
-	{ SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000),
-	  SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a),
-	  SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000),
-	  SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005),
-	  SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb),
-	  SPH_C32(0x3c5dfffe) },
-	{ SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000),
-	  SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e),
-	  SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000),
-	  SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003),
-	  SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752),
-	  SPH_C32(0x7b1675d7) },
-	{ SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000),
-	  SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c),
-	  SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000),
-	  SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008),
-	  SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3),
-	  SPH_C32(0x2879ebac) },
-	{ SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000),
-	  SPH_C32(0x64f30013), SPH_C32(0x257e86bf), SPH_C32(0x1311944e),
-	  SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000),
-	  SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001),
-	  SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60),
-	  SPH_C32(0xbe0a679e) },
-	{ SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000),
-	  SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87),
-	  SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000),
-	  SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012),
-	  SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf),
-	  SPH_C32(0x30aebcf7) },
-	{ SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000),
-	  SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57),
-	  SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000),
-	  SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0),
-	  SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03),
-	  SPH_C32(0xc7ff60f0) },
-	{ SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000),
-	  SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb),
-	  SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000),
-	  SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140),
-	  SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877),
-	  SPH_C32(0xe7e00a94) }
+{ 0xef0b0270, 0x3afd0000, 0x5dae0000, 0x69490000, 0x9b0f3c06, 0x4405b5f9, 0x66140a51, 0x924f5d0a, // 0
+  0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000, 0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68 },
+{ 0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000, 0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68,
+  0x26600240, 0xddd80000, 0x722a0000, 0x4f060000, 0x936667ff, 0x29f944ce, 0x368b63d5, 0x0c26f262 },
+{ 0x145a3c00, 0xb9e90000, 0x61270000, 0xf1610000, 0xce613d6c, 0xb0493d78, 0x47a96720, 0xe18e24c5,
+  0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000, 0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f },
+{ 0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000, 0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f,
+  0x373d2800, 0x71500000, 0x95e00000, 0x0a140000, 0xbdac1909, 0x48ef9831, 0x456d6d1f, 0x3daac2da },
+{ 0x54285c00, 0xeaed0000, 0xc5d60000, 0xa1c50000, 0xb3a26770, 0x94a5c4e1, 0x6bb0419d, 0x551b3782,
+  0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000, 0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29 },
+{ 0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000, 0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29,
+  0xc8934400, 0x5a3e0000, 0x57870000, 0x4c560000, 0xea982435, 0x75b11115, 0x28b67247, 0x2dd1f9ab },
+{ 0x29449c00, 0x64e70000, 0xf24b0000, 0xc2f30000, 0x0ede4e8f, 0x56c23745, 0xf3e04259, 0x8d0d9ec4,
+  0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000, 0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2 },
+{ 0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000, 0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2, // 7
+  0x6f299000, 0x6c850000, 0x2f160000, 0x782e0000, 0x644c37cd, 0x12dd1cd6, 0xd26a8c36, 0x32219526 },
+{ 0xf6800005, 0x3443c000, 0x24070000, 0x8f3d0000, 0x21373bfb, 0x0ab8d5ae, 0xcdc58b19, 0xd795ba31,
+  0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000, 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88 },
+{ 0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000, 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88,
+  0x50ff0004, 0x45744000, 0x3dfb0000, 0x19e60000, 0x1bbc5606, 0xe1727b5d, 0xe1a8cc96, 0x7b1bd6b9 },
+{ 0xf7750009, 0xcf3cc000, 0xc3d60000, 0x04920000, 0x029519a9, 0xf8e836ba, 0x7a87f14e, 0x9e16981a,
+  0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000, 0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320 },
+{ 0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000, 0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320,
+  0x231f0009, 0x42f40000, 0x66790000, 0x4ebb0000, 0xfedb5bd3, 0x315cb0d6, 0xe2b1674a, 0x69505b3a },
+{ 0x774400f0, 0xf15a0000, 0xf5b20000, 0x34140000, 0x89377e8c, 0x5a8bec25, 0x0bc3cd1e, 0xcf3775cb,
+  0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000, 0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574 },
+{ 0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000, 0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574,
+  0x832800a0, 0x67420000, 0xe1170000, 0x370b0000, 0xcba30034, 0x3c34923c, 0x9767bdcc, 0x450360bf },
+{ 0xe8870170, 0x9d720000, 0x12db0000, 0xd4220000, 0xf2886b27, 0xa921e543, 0x4ef8b518, 0x618813b1, // 14
+  0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000, 0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758 },
+{ 0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000, 0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758,
+  0x5cb00110, 0x913e0000, 0x44190000, 0x888c0000, 0x66dc7418, 0x921f1d66, 0x55ceea25, 0x925c44e9 },
+{ 0x0c720000, 0x49e50f00, 0x42790000, 0x5cea0000, 0x33aa301a, 0x15822514, 0x95a34b7b, 0xb44b0090,
+  0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000, 0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f },
+{ 0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000, 0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f,
+  0xf2500000, 0xeebd0a00, 0x67a80000, 0xab8a0000, 0xba9b48c0, 0x0a56dd74, 0xdb73e86e, 0x1568ff0f },
+{ 0x45180000, 0xa5b51700, 0xf96a0000, 0x3b480000, 0x1ecc142c, 0x231395d6, 0x16bca6b0, 0xdf33f4df,
+  0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000, 0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e },
+{ 0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000, 0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e,
+  0xfd250000, 0xb3c41100, 0xcef00000, 0xcef90000, 0x3c4d7580, 0x8d5b6493, 0x7098b0a6, 0x1af21fe1 },
+{ 0x75a40000, 0xc28b2700, 0x94a40000, 0x90f50000, 0xfb7857e0, 0x49ce0bae, 0x1767c483, 0xaedf667e,
+  0xd1660000, 0x1bbc0300, 0x9eec0000, 0xf6940000, 0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b },
+{ 0xd1660000, 0x1bbc0300, 0x9eec0000, 0xf6940000, 0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b, // 21
+  0xa4c20000, 0xd9372400, 0x0a480000, 0x66610000, 0xf87a12c7, 0x86bef75c, 0xa324df94, 0x2ba05a55 },
+{ 0x75c90003, 0x0e10c000, 0xd1200000, 0xbaea0000, 0x8bc42f3e, 0x8758b757, 0xbb28761d, 0x00b72e2b,
+  0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000, 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254 },
+{ 0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000, 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254,
+  0x9b060002, 0x61468000, 0x221e0000, 0x1d740000, 0x36715d27, 0x30495c92, 0xf11336a7, 0xfe1cdc7f },
+{ 0x86790000, 0x3f390002, 0xe19ae000, 0x98560000, 0x9565670e, 0x4e88c8ea, 0xd3dd4944, 0x161ddab9,
+  0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000, 0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834 },
+{ 0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000, 0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834,
+  0xb6ce0000, 0xdae90002, 0x156e8000, 0xda920000, 0xf6dd5a64, 0x36325c8a, 0xf272e8ae, 0xa6b8c28d },
+{ 0x14190000, 0x23ca003c, 0x50df0000, 0x44b60000, 0x1b6c67b0, 0x3cf3ac75, 0x61e610b0, 0xdbcadb80,
+  0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000, 0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7 },
+{ 0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000, 0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7,
+  0xf75a0000, 0x19840028, 0xa2190000, 0xeef80000, 0xc0722516, 0x19981260, 0x73dba1e6, 0xe1844257 },
+{ 0x54500000, 0x0671005c, 0x25ae0000, 0x6a1e0000, 0x2ea54edf, 0x664e8512, 0xbfba18c3, 0x7e715d17, // 28
+  0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000, 0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e },
+{ 0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000, 0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e,
+  0xe8dd0000, 0xfa4a0044, 0x3c2d0000, 0xbb150000, 0x80bd361b, 0x24e81d44, 0xbfa8c2f4, 0x524a0d59 },
+{ 0x69510000, 0xd4e1009c, 0xc3230000, 0xac2f0000, 0xe4950bae, 0xcea415dc, 0x87ec287c, 0xbce1a3ce,
+  0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000, 0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173 },
+{ 0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000, 0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173,
+  0xaf220000, 0x7b6c0090, 0x67e20000, 0x8da20000, 0xc7841e29, 0xb7b744f3, 0x9ac484f4, 0x8b6c72bd },
+{ 0xcc140000, 0xa5630000, 0x5ab90780, 0x3b500000, 0x4bd013ff, 0x879b3418, 0x694348c1, 0xca5a87fe,
+  0x819e0000, 0xec570000, 0x66320280, 0x95f30000, 0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa },
+{ 0x819e0000, 0xec570000, 0x66320280, 0x95f30000, 0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa,
+  0x4d8a0000, 0x49340000, 0x3c8b0500, 0xaea30000, 0x16793bfd, 0xcf6f08a4, 0x8f19eaec, 0x443d3004 },
+{ 0x78230000, 0x12fc0000, 0xa93a0b80, 0x90a50000, 0x713e2879, 0x7ee98924, 0xf08ca062, 0x636f8bab,
+  0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000, 0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b },
+{ 0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000, 0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b, // 35
+  0x7a8c0000, 0xa5d40000, 0x13260880, 0xc63d0000, 0xcbb36daa, 0xfea14f43, 0x59d0b4f8, 0x979961d0 },
+{ 0xac480000, 0x1ba60000, 0x45fb1380, 0x03430000, 0x5a85316a, 0x1fb250b6, 0xfe72c7fe, 0x91e478f6,
+  0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000, 0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e },
+{ 0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000, 0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e,
+  0xb2060000, 0xc5690000, 0x28031200, 0x74670000, 0xb6c236f4, 0xeb1239f8, 0x33d1dfec, 0x094e3198 },
+{ 0xaec30000, 0x9c4f0001, 0x79d1e000, 0x2c150000, 0x45cc75b3, 0x6650b736, 0xab92f78f, 0xa312567b,
+  0xdb250000, 0x09290000, 0x49aac000, 0x81e10000, 0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e },
+{ 0xdb250000, 0x09290000, 0x49aac000, 0x81e10000, 0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e,
+  0x75e60000, 0x95660001, 0x307b2000, 0xadf40000, 0x8f321eea, 0x24298307, 0xe8c49cf9, 0x4b7eec55 },
+{ 0x58430000, 0x807e0000, 0x78330001, 0xc66b3800, 0xe7375cdc, 0x79ad3fdd, 0xac73fe6f, 0x3a4479b1,
+  0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800, 0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6 },
+{ 0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800, 0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6,
+  0x45190000, 0xab0c0000, 0x30be0001, 0x690a2000, 0xc2fc7219, 0xb1d4800d, 0x2dd1fa46, 0x24314f17 },
+{ 0xa53b0000, 0x14260000, 0x4e30001e, 0x7cae0000, 0x8f9e0dd5, 0x78dfaa3d, 0xf73168d8, 0x0b1b4946, // 42
+  0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000, 0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce },
+{ 0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000, 0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce,
+  0xa2d60000, 0xa6760000, 0xc9440014, 0xeba30000, 0xccec2e7b, 0x3018c499, 0x03490afa, 0x9b6ef888 },
+{ 0x88980000, 0x1f940000, 0x7fcf002e, 0xfb4e0000, 0xf158079a, 0x61ae9167, 0xa895706c, 0xe6107494,
+  0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000, 0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463 },
+{ 0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000, 0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463,
+  0x835a0000, 0xc4f70000, 0x01470022, 0xeec80000, 0x60a54f69, 0x142f2a24, 0x5cf534f2, 0x3ea660f7 },
+{ 0x52500000, 0x29540000, 0x6a61004e, 0xf0ff0000, 0x9a317eec, 0x452341ce, 0xcf568fe5, 0x5303130f,
+  0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000, 0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691 },
+{ 0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000, 0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691,
+  0x01dd0000, 0x80a80000, 0xf4960048, 0xa6000000, 0x90d57ea2, 0xd7e68c37, 0x6612cffd, 0x2c94459e },
+{ 0xe6280000, 0x4c4b0000, 0xa8550000, 0xd3d002e0, 0xd86130b8, 0x98a7b0da, 0x289506b4, 0xd75a4897,
+  0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0, 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f },
+{ 0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0, 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f, // 49
+  0x16ed0000, 0x15680000, 0xedd70000, 0x325d0220, 0xe30c3689, 0x5a4ae643, 0xe375f8a8, 0x81fdf908 },
+{ 0xb4310000, 0x77330000, 0xb15d0000, 0x7fd004e0, 0x78a26138, 0xd116c35d, 0xd256d489, 0x4e6f74de,
+  0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060, 0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539 },
+{ 0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060, 0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539,
+  0x57370000, 0xcaf20000, 0x364e0000, 0xc0220480, 0x56186b22, 0x5ca3f40c, 0xa1937f8f, 0x15b961e7 },
+{ 0x02f20000, 0xa2810000, 0x873f0000, 0xe36c7800, 0x1e1d74ef, 0x073d2bd6, 0xc4c23237, 0x7f32259e,
+  0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800, 0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0 },
+{ 0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800, 0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0,
+  0xb82f0000, 0xb12c0000, 0x30d80000, 0x14445000, 0xc15860a2, 0x3127e8ec, 0x2e98bf23, 0x551e3d6e },
+{ 0x1e6c0000, 0xc4420000, 0x8a2e0000, 0xbcb6b800, 0x2c4413b6, 0x8bfdd3da, 0x6a0c1bc8, 0xb99dc2eb,
+  0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000, 0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f },
+{ 0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000, 0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f,
+  0x8c3a0000, 0xda980000, 0x607f0000, 0x54078800, 0x85714513, 0x6006b243, 0xdb50399c, 0x8a58e6a4 },
+{ 0x033d0000, 0x08b30000, 0xf33a0000, 0x3ac20007, 0x51298a50, 0x6b6e661f, 0x0ea5cfe3, 0xe6da7ffe, // 56
+  0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002, 0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000 },
+{ 0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002, 0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000,
+  0xabe70000, 0x9e0d0000, 0xaf270000, 0x3d180005, 0x2c4f1fd3, 0x74f61695, 0xb5c347eb, 0x3c5dfffe },
+{ 0x01930000, 0xe7820000, 0xedfb0000, 0xcf0c000b, 0x8dd08d58, 0xbca3b42e, 0x063661e1, 0x536f9e7b,
+  0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003, 0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7 },
+{ 0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003, 0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7,
+  0x93bb0000, 0x3b070000, 0xba010000, 0x99d00008, 0x3739ae4e, 0xe64c1722, 0x96f896b3, 0x2879ebac },
+{ 0x5fa80000, 0x56030000, 0x43ae0000, 0x64f30013, 0x257e86bf, 0x1311944e, 0x541e95bf, 0x8ea4db69,
+  0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001, 0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e },
+{ 0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001, 0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e,
+  0x5fec0000, 0x294b0000, 0x99d20000, 0x4ed00012, 0x1ed34f73, 0xbaa708c9, 0x57140bdf, 0x30aebcf7 },
+{ 0xee930000, 0xd6070000, 0x92c10000, 0x2b9801e0, 0x9451287c, 0x3b6cfb57, 0x45312374, 0x201f6a64,
+  0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0, 0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0 },
+{ 0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0, 0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0,
+  0x95bb0000, 0x81450000, 0x3b240000, 0x48db0140, 0x0a8a6c53, 0x56f56eec, 0x62c91877, 0xe7e00a94 }
 };

 __global__
@ -598,12 +328,12 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *
				@@ -598,12 +328,12 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *
 		uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
 		unsigned char *h1 = (unsigned char *)Hash;

-		uint32_t c0 = SPH_C32(0x73746565), c1 = SPH_C32(0x6c706172), c2 = SPH_C32(0x6b204172), c3 = SPH_C32(0x656e6265);
-		uint32_t c4 = SPH_C32(0x72672031), c5 = SPH_C32(0x302c2062), c6 = SPH_C32(0x75732032), c7 = SPH_C32(0x3434362c);
-		uint32_t c8 = SPH_C32(0x20422d33), c9 = SPH_C32(0x30303120), cA = SPH_C32(0x4c657576), cB = SPH_C32(0x656e2d48);
-		uint32_t cC = SPH_C32(0x65766572), cD = SPH_C32(0x6c65652c), cE = SPH_C32(0x2042656c), cF = SPH_C32(0x6769756d);
-		uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, mD, mE, mF;
+		uint32_t c0 = 0x73746565, c1 = 0x6c706172, c2 = 0x6b204172, c3 = 0x656e6265;
+		uint32_t c4 = 0x72672031, c5 = 0x302c2062, c6 = 0x75732032, c7 = 0x3434362c;
+		uint32_t c8 = 0x20422d33, c9 = 0x30303120, cA = 0x4c657576, cB = 0x656e2d48;
+		uint32_t cC = 0x65766572, cD = 0x6c65652c, cE = 0x2042656c, cF = 0x6769756d;
 		uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
+		uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, mD, mE, mF;
 		uint32_t *tp, db, dm;

 		for(int i = 0; i < 64; i += 8) {
@ -637,16 +367,16 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *
				@@ -637,16 +367,16 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *
 			T_BIG;
 		}

+		// precomputed for 64 bytes blocks ?
 		tp = &d_T512[0][0] + 112;
-
-		m0 = *(tp+ 0); m1 = *(tp+ 1);
-		m2 = *(tp+ 2); m3 = *(tp+ 3);
-		m4 = *(tp+ 4); m5 = *(tp+ 5);
-		m6 = *(tp+ 6); m7 = *(tp+ 7);
-		m8 = *(tp+ 8); m9 = *(tp+ 9);
-		mA = *(tp+10); mB = *(tp+11);
-		mC = *(tp+12); mD = *(tp+13);
-		mE = *(tp+14); mF = *(tp+15);
+		m0 = tp[ 0]; m1 = tp[ 1];
+		m2 = tp[ 2]; m3 = tp[ 3];
+		m4 = tp[ 4]; m5 = tp[ 5];
+		m6 = tp[ 6]; m7 = tp[ 7];
+		m8 = tp[ 8]; m9 = tp[ 9];
+		mA = tp[10]; mB = tp[11];
+		mC = tp[12]; mD = tp[13];
+		mE = tp[14]; mF = tp[15];

 		for( int r = 0; r < 6; r += 2 ) {
 			ROUND_BIG(r, d_alpha_n);
@ -655,15 +385,14 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *
				@@ -655,15 +385,14 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *
 		T_BIG;

 		tp = &d_T512[0][0] + 784;
-
-		m0 = *(tp+ 0); m1 = *(tp+ 1);
-		m2 = *(tp+ 2); m3 = *(tp+ 3);
-		m4 = *(tp+ 4); m5 = *(tp+ 5);
-		m6 = *(tp+ 6); m7 = *(tp+ 7);
-		m8 = *(tp+ 8); m9 = *(tp+ 9);
-		mA = *(tp+10); mB = *(tp+11);
-		mC = *(tp+12); mD = *(tp+13);
-		mE = *(tp+14); mF = *(tp+15);
+		m0 = tp[ 0]; m1 = tp[ 1];
+		m2 = tp[ 2]; m3 = tp[ 3];
+		m4 = tp[ 4]; m5 = tp[ 5];
+		m6 = tp[ 6]; m7 = tp[ 7];
+		m8 = tp[ 8]; m9 = tp[ 9];
+		mA = tp[10]; mB = tp[11];
+		mC = tp[12]; mD = tp[13];
+		mE = tp[14]; mF = tp[15];

 		for( int r = 0; r < 12; r += 2 ) {
 			ROUND_BIG(r, d_alpha_f);
@ -696,3 +425,127 @@ void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce
				@@ -696,3 +425,127 @@ void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce
 	x13_hamsi512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
 	//MyStreamSynchronize(NULL, order, thr_id);
 }
+
+__constant__ static uint64_t c_PaddedMessage80[10];
+
+__host__
+void x16_hamsi512_setBlock_80(void *pdata)
+{
+	cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
+}
+
+__global__
+void x16_hamsi512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash)
+{
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		unsigned char h1[80];
+		#pragma unroll
+		for (int i = 0; i < 10; i++)
+			((uint2*)h1)[i] = ((uint2*)c_PaddedMessage80)[i];
+		//((uint64_t*)h1)[9] = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(startNonce + thread));
+		((uint32_t*)h1)[19] = cuda_swab32(startNonce + thread);
+
+		uint32_t c0 = 0x73746565, c1 = 0x6c706172, c2 = 0x6b204172, c3 = 0x656e6265;
+		uint32_t c4 = 0x72672031, c5 = 0x302c2062, c6 = 0x75732032, c7 = 0x3434362c;
+		uint32_t c8 = 0x20422d33, c9 = 0x30303120, cA = 0x4c657576, cB = 0x656e2d48;
+		uint32_t cC = 0x65766572, cD = 0x6c65652c, cE = 0x2042656c, cF = 0x6769756d;
+		uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
+		uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, mD, mE, mF;
+		uint32_t *tp, db, dm;
+
+		for(int i = 0; i < 80; i += 8)
+		{
+			m0 = 0; m1 = 0; m2 = 0; m3 = 0; m4 = 0; m5 = 0; m6 = 0; m7 = 0;
+			m8 = 0; m9 = 0; mA = 0; mB = 0; mC = 0; mD = 0; mE = 0; mF = 0;
+			tp = &d_T512[0][0];
+
+			#pragma unroll
+			for (int u = 0; u < 8; u++) {
+				db = h1[i + u];
+				#pragma unroll 2
+				for (int v = 0; v < 8; v++, db >>= 1) {
+					dm = -(uint32_t)(db & 1);
+					m0 ^= dm & tp[ 0]; m1 ^= dm & tp[ 1];
+					m2 ^= dm & tp[ 2]; m3 ^= dm & tp[ 3];
+					m4 ^= dm & tp[ 4]; m5 ^= dm & tp[ 5];
+					m6 ^= dm & tp[ 6]; m7 ^= dm & tp[ 7];
+					m8 ^= dm & tp[ 8]; m9 ^= dm & tp[ 9];
+					mA ^= dm & tp[10]; mB ^= dm & tp[11];
+					mC ^= dm & tp[12]; mD ^= dm & tp[13];
+					mE ^= dm & tp[14]; mF ^= dm & tp[15];
+					tp += 16;
+				}
+			}
+
+			#pragma unroll
+			for (int r = 0; r < 6; r++) {
+				ROUND_BIG(r, d_alpha_n);
+			}
+			T_BIG;
+		}
+
+		#define INPUT_BIG { \
+			m0 = 0; m1 = 0; m2 = 0; m3 = 0; m4 = 0; m5 = 0; m6 = 0; m7 = 0; \
+			m8 = 0; m9 = 0; mA = 0; mB = 0; mC = 0; mD = 0; mE = 0; mF = 0; \
+			tp = &d_T512[0][0]; \
+			for (int u = 0; u < 8; u++) { \
+				db = endtag[u]; \
+				for (int v = 0; v < 8; v++, db >>= 1) { \
+					dm = -(uint32_t)(db & 1); \
+					m0 ^= dm & tp[ 0]; m1 ^= dm & tp[ 1]; \
+					m2 ^= dm & tp[ 2]; m3 ^= dm & tp[ 3]; \
+					m4 ^= dm & tp[ 4]; m5 ^= dm & tp[ 5]; \
+					m6 ^= dm & tp[ 6]; m7 ^= dm & tp[ 7]; \
+					m8 ^= dm & tp[ 8]; m9 ^= dm & tp[ 9]; \
+					mA ^= dm & tp[10]; mB ^= dm & tp[11]; \
+					mC ^= dm & tp[12]; mD ^= dm & tp[13]; \
+					mE ^= dm & tp[14]; mF ^= dm & tp[15]; \
+					tp += 16; \
+				} \
+			} \
+		}
+
+		// close
+		uint8_t endtag[8] = { 0x80, 0x00, 0x00, 0x00,  0x00, 0x00, 0x00, 0x00 };
+		INPUT_BIG;
+
+		#pragma unroll
+		for (int r = 0; r < 6; r++) {
+			ROUND_BIG(r, d_alpha_n);
+		}
+		T_BIG;
+
+		endtag[0] = endtag[1] = 0x00;
+		endtag[6] = 0x02;
+		endtag[7] = 0x80;
+		INPUT_BIG;
+
+		// PF_BIG
+		#pragma unroll
+		for(int r = 0; r < 12; r++) {
+			ROUND_BIG(r, d_alpha_f);
+		}
+		T_BIG;
+
+		uint64_t hashPosition = thread;
+		uint32_t *Hash = (uint32_t*)&g_hash[hashPosition << 3];
+		#pragma unroll 16
+		for(int i = 0; i < 16; i++)
+			Hash[i] = cuda_swab32(h[i]);
+
+		#undef INPUT_BIG
+	}
+}
+
+__host__
+void x16_hamsi512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash)
+{
+	const uint32_t threadsperblock = 128;
+
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+
+	x16_hamsi512_gpu_hash_80 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash);
+}
--- a/x15/cuda_x15_whirlpool_sm3.cu
+++ b/x15/cuda_x15_whirlpool_sm3.cu
@ -1998,7 +1998,7 @@ const int i0, const int i1, const int i2, const int i3, const int i4, const int
				@@ -1998,7 +1998,7 @@ const int i0, const int i1, const int i2, const int i3, const int i4, const int


 __global__
-void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash, int swab)
+void oldwhirlpool_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, void *outputHash, int swab)
 {
 	__shared__ uint64_t sharedMemory[2048];

@ -2014,7 +2014,8 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
				@@ -2014,7 +2014,8 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
 			sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x];
 		#endif
 	}
-	__threadfence_block(); // ensure shared mem is ready
+	//__threadfence_block(); // ensure shared mem is ready
+	__syncthreads();

 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@ -2028,7 +2029,8 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
				@@ -2028,7 +2029,8 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
 		uint64_t state[8];
 		#pragma unroll 8
 		for (int i=0; i < 8; i++) {
-			state[i] = c_PaddedMessage80[i];
+			//state[i] = c_PaddedMessage80[i];
+			AS_UINT2(&state[i]) = AS_UINT2(&c_PaddedMessage80[i]);
 		}
 #else
 		#pragma unroll 8
@ -2050,6 +2052,7 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
				@@ -2050,6 +2052,7 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
 			state[i] = xor1(n[i],c_PaddedMessage80[i]);
 		}
 #endif
+
 		/// round 2 ///////
 		//////////////////////////////////
 		n[0] = c_PaddedMessage80[8];    //read data
@ -2331,7 +2334,7 @@ extern uint32_t whirlpool512_finalhash_64(int thr_id, uint32_t threads, uint32_t
				@@ -2331,7 +2334,7 @@ extern uint32_t whirlpool512_finalhash_64(int thr_id, uint32_t threads, uint32_t
 }

 __host__
-void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash)
+void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash)
 {
 	dim3 grid((threads + threadsperblock-1) / threadsperblock);
 	dim3 block(threadsperblock);
@ -2339,7 +2342,7 @@ void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNounce
				@@ -2339,7 +2342,7 @@ void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNounce
 	if (threads < 256)
 		applog(LOG_WARNING, "whirlpool requires a minimum of 256 threads to fetch constant tables!");

-	oldwhirlpool_gpu_hash_80<<<grid, block>>>(threads, startNounce, d_outputHash, 1);
+	oldwhirlpool_gpu_hash_80<<<grid, block>>>(threads, startNonce, d_outputHash, 1);
 }

 extern void whirl_midstate(void *state, const void *input);
@ -2363,3 +2366,54 @@ void whirlpool512_setBlock_80_sm3(void *pdata, const void *ptarget)
				@@ -2363,3 +2366,54 @@ void whirlpool512_setBlock_80_sm3(void *pdata, const void *ptarget)
 	cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 128, 0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(pTarget, ptarget, 32, 0, cudaMemcpyHostToDevice);
 }
+
+// ------------------------------------------------------------------------------------------------
+
+__host__
+void x16_whirlpool512_init(int thr_id, uint32_t threads)
+{
+	cudaMemcpyToSymbol(InitVector_RC, plain_RC, sizeof(plain_RC), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(mixTob0Tox, plain_T0, sizeof(plain_T0), 0, cudaMemcpyHostToDevice);
+#if USE_ALL_TABLES
+	cudaMemcpyToSymbol(mixTob1Tox, plain_T1, (256 * 8), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(mixTob2Tox, plain_T2, (256 * 8), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(mixTob3Tox, plain_T3, (256 * 8), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(mixTob4Tox, plain_T4, (256 * 8), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(mixTob5Tox, plain_T5, (256 * 8), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(mixTob6Tox, plain_T6, (256 * 8), 0, cudaMemcpyHostToDevice);
+	cudaMemcpyToSymbol(mixTob7Tox, plain_T7, (256 * 8), 0, cudaMemcpyHostToDevice);
+#endif
+}
+
+extern void whirlpool_midstate(void *state, const void *input);
+
+__host__
+void x16_whirlpool512_setBlock_80(void *pdata)
+{
+	unsigned char PaddedMessage[128];
+
+	memcpy(PaddedMessage, pdata, 80);
+	memset(PaddedMessage + 80, 0, 48);
+	PaddedMessage[80] = 0x80; /* ending */
+
+#if HOST_MIDSTATE
+	// compute constant first block
+	unsigned char midstate[64] = { 0 };
+	whirlpool_midstate(midstate, pdata);
+	memcpy(PaddedMessage, midstate, 64);
+#endif
+
+	cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 128, 0, cudaMemcpyHostToDevice);
+}
+
+__host__
+void x16_whirlpool512_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_outputHash)
+{
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+
+	if (threads < 256)
+		applog(LOG_WARNING, "whirlpool requires a minimum of 256 threads to fetch constant tables!");
+
+	oldwhirlpool_gpu_hash_80 <<<grid, block>>> (threads, startNonce, d_outputHash, 1);
+}
--- a/x16/cuda_x16.h
+++ b/x16/cuda_x16.h
@ -0,0 +1,80 @@
				@@ -0,0 +1,80 @@
+#include "x11/cuda_x11.h"
+
+extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads);
+extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
+extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x13_fugue512_cpu_free(int thr_id);
+
+extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads);
+extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+
+extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int flag);
+extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
+extern void x15_whirlpool_cpu_free(int thr_id);
+
+extern void x17_sha512_cpu_init(int thr_id, uint32_t threads);
+extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash);
+
+extern void x17_haval256_cpu_init(int thr_id, uint32_t threads);
+extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, const int outlen);
+
+void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order);
+
+// ---- optimised but non compatible kernels
+
+void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
+
+// ---- 80 bytes kernels
+
+void quark_bmw512_cpu_setBlock_80(void *pdata);
+void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order);
+
+void groestl512_setBlock_80(int thr_id, uint32_t *endiandata);
+void groestl512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
+
+void skein512_cpu_setBlock_80(void *pdata);
+void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int swap);
+
+void qubit_luffa512_cpu_init(int thr_id, uint32_t threads);
+void qubit_luffa512_cpu_setBlock_80(void *pdata);
+void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order);
+
+void jh512_setBlock_80(int thr_id, uint32_t *endiandata);
+void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
+
+void keccak512_setBlock_80(int thr_id, uint32_t *endiandata);
+void keccak512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
+
+void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata);
+void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
+
+void x11_shavite512_setBlock_80(void *pdata);
+void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order);
+
+void x16_shabal512_setBlock_80(void *pdata);
+void x16_shabal512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
+
+void x16_simd512_setBlock_80(void *pdata);
+void x16_simd512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
+
+void x16_echo512_cuda_init(int thr_id, const uint32_t threads);
+void x16_echo512_setBlock_80(void *pdata);
+void x16_echo512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
+
+void x16_hamsi512_setBlock_80(void *pdata);
+void x16_hamsi512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
+
+void x16_fugue512_cpu_init(int thr_id, uint32_t threads);
+void x16_fugue512_cpu_free(int thr_id);
+void x16_fugue512_setBlock_80(void *pdata);
+void x16_fugue512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
+
+void x16_whirlpool512_init(int thr_id, uint32_t threads);
+void x16_whirlpool512_setBlock_80(void* endiandata);
+void x16_whirlpool512_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
+
+void x16_sha512_setBlock_80(void *pdata);
+void x16_sha512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
+
--- a/x16/cuda_x16_echo512.cu
+++ b/x16/cuda_x16_echo512.cu
@ -0,0 +1,214 @@
				@@ -0,0 +1,214 @@
+/**
+ * echo512-80 cuda kernel for X16R algorithm
+ *
+ * tpruvot 2018 - GPL code
+ */
+
+#include <stdio.h>
+#include <memory.h>
+
+#include "cuda_helper.h"
+
+extern __device__ __device_builtin__ void __threadfence_block(void);
+
+#include "../x11/cuda_x11_aes.cuh"
+
+__device__ __forceinline__ void AES_2ROUND(const uint32_t* __restrict__ sharedMemory,
+	uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3,
+	uint32_t &k0)
+{
+	uint32_t y0, y1, y2, y3;
+
+	aes_round(sharedMemory,
+		x0, x1, x2, x3,
+		k0,
+		y0, y1, y2, y3);
+
+	aes_round(sharedMemory,
+		y0, y1, y2, y3,
+		x0, x1, x2, x3);
+
+	k0++;
+}
+
+__device__
+static void echo_round(uint32_t* const sharedMemory, uint32_t *W, uint32_t &k0)
+{
+	// Big Sub Words
+	#pragma unroll 16
+	for (int idx = 0; idx < 16; idx++) {
+		AES_2ROUND(sharedMemory, W[(idx << 2) + 0], W[(idx << 2) + 1], W[(idx << 2) + 2], W[(idx << 2) + 3], k0);
+	}
+
+	// Shift Rows
+	#pragma unroll 4
+	for (int i = 0; i < 4; i++)
+	{
+		uint32_t t[4];
+		/// 1, 5, 9, 13
+		t[0] = W[i +  4];
+		t[1] = W[i +  8];
+		t[2] = W[i + 24];
+		t[3] = W[i + 60];
+
+		W[i +  4] = W[i + 20];
+		W[i +  8] = W[i + 40];
+		W[i + 24] = W[i + 56];
+		W[i + 60] = W[i + 44];
+
+		W[i + 20] = W[i + 36];
+		W[i + 40] = t[1];
+		W[i + 56] = t[2];
+		W[i + 44] = W[i + 28];
+
+		W[i + 28] = W[i + 12];
+		W[i + 12] = t[3];
+		W[i + 36] = W[i + 52];
+		W[i + 52] = t[0];
+	}
+
+	// Mix Columns
+	#pragma unroll 4
+	for (int i = 0; i < 4; i++)
+	{
+		#pragma unroll 4
+		for (int idx = 0; idx < 64; idx += 16)
+		{
+			uint32_t a[4];
+			a[0] = W[idx + i];
+			a[1] = W[idx + i + 4];
+			a[2] = W[idx + i + 8];
+			a[3] = W[idx + i + 12];
+
+			uint32_t ab = a[0] ^ a[1];
+			uint32_t bc = a[1] ^ a[2];
+			uint32_t cd = a[2] ^ a[3];
+
+			uint32_t t, t2, t3;
+			t  = (ab & 0x80808080);
+			t2 = (bc & 0x80808080);
+			t3 = (cd & 0x80808080);
+
+			uint32_t abx = (t  >> 7) * 27U ^ ((ab^t)  << 1);
+			uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
+			uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
+
+			W[idx + i] = bc ^ a[3] ^ abx;
+			W[idx + i +  4] = a[0] ^ cd ^ bcx;
+			W[idx + i +  8] = ab ^ a[3] ^ cdx;
+			W[idx + i + 12] = ab ^ a[2] ^ (abx ^ bcx ^ cdx);
+		}
+	}
+}
+
+__device__ __forceinline__
+void cuda_echo_round_80(uint32_t *const __restrict__ sharedMemory, uint32_t *const __restrict__ data, const uint32_t nonce, uint32_t *hash)
+{
+	uint32_t h[29]; // <= 127 bytes input
+
+	#pragma unroll 8
+	for (int i = 0; i < 18; i += 2)
+		AS_UINT2(&h[i]) = AS_UINT2(&data[i]);
+	h[18] = data[18];
+	h[19] = cuda_swab32(nonce);
+	h[20] = 0x80;
+	h[21] = h[22] = h[23] = h[24] = h[25] = h[26] = 0;
+	//((uint8_t*)h)[80] = 0x80;
+	//((uint8_t*)h)[128-17] = 0x02;
+	//((uint8_t*)h)[128-16] = 0x80;
+	//((uint8_t*)h)[128-15] = 0x02;
+	h[27] = 0x2000000;
+	h[28] = 0x280;
+	//h[29] = h[30] = h[31] = 0;
+
+	uint32_t k0 = 640; // bitlen
+	uint32_t W[64];
+
+	#pragma unroll 8
+	for (int i = 0; i < 32; i+=4) {
+		W[i] = 512; // L
+		W[i+1] = 0; // H
+		W[i+2] = 0; // X
+		W[i+3] = 0;
+	}
+
+	uint32_t Z[16];
+	#pragma unroll
+	for (int i = 0;  i<16; i++) Z[i] = W[i];
+	#pragma unroll
+	for (int i = 32; i<61; i++) W[i] = h[i - 32];
+	#pragma unroll
+	for (int i = 61; i<64; i++) W[i] = 0;
+
+	for (int i = 0; i < 10; i++)
+		echo_round(sharedMemory, W, k0);
+
+	#pragma unroll 16
+	for (int i = 0; i < 16; i++) {
+		Z[i] ^= h[i] ^ W[i] ^ W[i + 32];
+	}
+
+	#pragma unroll 8
+	for (int i = 0; i < 16; i += 2)
+		AS_UINT2(&hash[i]) = AS_UINT2(&Z[i]);
+}
+
+__device__ __forceinline__
+void echo_gpu_init(uint32_t *const __restrict__ sharedMemory)
+{
+	/* each thread startup will fill a uint32 */
+	if (threadIdx.x < 128) {
+		sharedMemory[threadIdx.x] = d_AES0[threadIdx.x];
+		sharedMemory[threadIdx.x + 256] = d_AES1[threadIdx.x];
+		sharedMemory[threadIdx.x + 512] = d_AES2[threadIdx.x];
+		sharedMemory[threadIdx.x + 768] = d_AES3[threadIdx.x];
+
+		sharedMemory[threadIdx.x + 64 * 2] = d_AES0[threadIdx.x + 64 * 2];
+		sharedMemory[threadIdx.x + 64 * 2 + 256] = d_AES1[threadIdx.x + 64 * 2];
+		sharedMemory[threadIdx.x + 64 * 2 + 512] = d_AES2[threadIdx.x + 64 * 2];
+		sharedMemory[threadIdx.x + 64 * 2 + 768] = d_AES3[threadIdx.x + 64 * 2];
+	}
+}
+
+__host__
+void x16_echo512_cuda_init(int thr_id, const uint32_t threads)
+{
+	aes_cpu_init(thr_id);
+}
+
+__constant__ static uint32_t c_PaddedMessage80[20];
+
+__host__
+void x16_echo512_setBlock_80(void *endiandata)
+{
+	cudaMemcpyToSymbol(c_PaddedMessage80, endiandata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
+}
+
+__global__ __launch_bounds__(128, 7) /* will force 72 registers */
+void x16_echo512_gpu_hash_80(uint32_t threads, uint32_t startNonce, uint64_t *g_hash)
+{
+	__shared__ uint32_t sharedMemory[1024];
+
+	echo_gpu_init(sharedMemory);
+	__threadfence_block();
+
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint64_t hashPosition = thread;
+		uint32_t *pHash = (uint32_t*)&g_hash[hashPosition<<3];
+
+		cuda_echo_round_80(sharedMemory, c_PaddedMessage80, startNonce + thread, pHash);
+	}
+}
+
+__host__
+void x16_echo512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash)
+{
+	const uint32_t threadsperblock = 128;
+
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+
+	x16_echo512_gpu_hash_80<<<grid, block>>>(threads, startNonce, (uint64_t*)d_hash);
+}
--- a/x16/cuda_x16_echo512_64.cu
+++ b/x16/cuda_x16_echo512_64.cu
@ -0,0 +1,248 @@
				@@ -0,0 +1,248 @@
+/**
+ * Echo512-64 kernel for maxwell, based on alexis work
+ */
+
+#include <cuda_helper.h>
+#include <cuda_vector_uint2x4.h>
+#include <cuda_vectors.h>
+
+#define INTENSIVE_GMF
+#include "tribus/cuda_echo512_aes.cuh"
+
+#ifdef __INTELLISENSE__
+#define __byte_perm(x, y, b) x
+#define atomicExch(p,y) (*p) = y
+#endif
+
+__device__
+static void echo_round_alexis(const uint32_t sharedMemory[4][256], uint32_t *W, uint32_t &k0)
+{
+	// Big Sub Words
+	#pragma unroll 16
+	for (int idx = 0; idx < 16; idx++)
+		AES_2ROUND(sharedMemory,W[(idx<<2) + 0], W[(idx<<2) + 1], W[(idx<<2) + 2], W[(idx<<2) + 3], k0);
+
+	// Shift Rows
+	#pragma unroll 4
+	for (int i = 0; i < 4; i++){
+		uint32_t t[4];
+		/// 1, 5, 9, 13
+		t[0] = W[i+ 4];
+		t[1] = W[i+ 8];
+		t[2] = W[i+24];
+		t[3] = W[i+60];
+		W[i + 4] = W[i + 20];
+		W[i + 8] = W[i + 40];
+		W[i +24] = W[i + 56];
+		W[i +60] = W[i + 44];
+
+		W[i +20] = W[i +36];
+		W[i +40] = t[1];
+		W[i +56] = t[2];
+		W[i +44] = W[i +28];
+
+		W[i +28] = W[i +12];
+		W[i +12] = t[3];
+		W[i +36] = W[i +52];
+		W[i +52] = t[0];
+	}
+	// Mix Columns
+	#pragma unroll 4
+	for (int i = 0; i < 4; i++){ // Schleife über je 2*uint32_t
+		#pragma unroll 4
+		for (int idx = 0; idx < 64; idx += 16){ // Schleife über die elemnte
+			uint32_t a[4];
+			a[0] = W[idx + i];
+			a[1] = W[idx + i + 4];
+			a[2] = W[idx + i + 8];
+			a[3] = W[idx + i +12];
+
+			uint32_t ab = a[0] ^ a[1];
+			uint32_t bc = a[1] ^ a[2];
+			uint32_t cd = a[2] ^ a[3];
+
+			uint32_t t, t2, t3;
+			t = (ab & 0x80808080);
+			t2 = (bc & 0x80808080);
+			t3 = (cd & 0x80808080);
+
+			uint32_t abx = (t  >> 7) * 27U ^ ((ab^t) << 1);
+			uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
+			uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
+
+			W[idx + i] = bc ^ a[3] ^ abx;
+			W[idx + i + 4] = a[0] ^ cd ^ bcx;
+			W[idx + i + 8] = ab ^ a[3] ^ cdx;
+			W[idx + i +12] = ab ^ a[2] ^ (abx ^ bcx ^ cdx);
+		}
+	}
+}
+
+__global__ __launch_bounds__(128, 5) /* will force 80 registers */
+static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash)
+{
+	__shared__ uint32_t sharedMemory[4][256];
+
+	aes_gpu_init128(sharedMemory);
+
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	uint32_t k0;
+	uint32_t h[16];
+	uint32_t hash[16];
+	if (thread < threads)
+	{
+		uint32_t *Hash = &g_hash[thread<<4];
+
+		*(uint2x4*)&h[ 0] = __ldg4((uint2x4*)&Hash[ 0]);
+		*(uint2x4*)&h[ 8] = __ldg4((uint2x4*)&Hash[ 8]);
+
+		*(uint2x4*)&hash[ 0] = *(uint2x4*)&h[ 0];
+		*(uint2x4*)&hash[ 8] = *(uint2x4*)&h[ 8];
+
+		__syncthreads();
+
+		const uint32_t P[48] = {
+			0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
+			//8-12
+			0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
+			//21-25
+			0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751, 0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
+			//34-38
+			0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7, 0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
+			0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968,
+			0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af
+			//58-61
+		};
+
+		k0 = 520;
+
+		#pragma unroll 4
+		for (uint32_t idx = 0; idx < 16; idx += 4) {
+			AES_2ROUND(sharedMemory, h[idx + 0], h[idx + 1], h[idx + 2], h[idx + 3], k0);
+		}
+		k0 += 4;
+
+		uint32_t W[64];
+
+		#pragma unroll 4
+		for (uint32_t i = 0; i < 4; i++)
+		{
+			uint32_t a = P[i];
+			uint32_t b = P[i + 4];
+			uint32_t c = h[i + 8];
+			uint32_t d = P[i + 8];
+
+			uint32_t ab = a ^ b;
+			uint32_t bc = b ^ c;
+			uint32_t cd = c ^ d;
+
+
+			uint32_t t =  (ab & 0x80808080);
+			uint32_t t2 = (bc & 0x80808080);
+			uint32_t t3 = (cd & 0x80808080);
+
+			uint32_t abx = (t  >> 7) * 27U ^ ((ab^t) << 1);
+			uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
+			uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
+
+			W[i] = abx ^ bc ^ d;
+			W[i + 4] = bcx ^ a ^ cd;
+			W[i + 8] = cdx ^ ab ^ d;
+			W[i +12] = abx ^ bcx ^ cdx ^ ab ^ c;
+
+			a = P[i +12];
+			b = h[i + 4];
+			c = P[i +16];
+			d = P[i +20];
+
+			ab = a ^ b;
+			bc = b ^ c;
+			cd = c ^ d;
+
+
+			t = (ab & 0x80808080);
+			t2 = (bc & 0x80808080);
+			t3 = (cd & 0x80808080);
+
+			abx = (t >> 7) * 27U ^ ((ab^t) << 1);
+			bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
+			cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
+
+			W[16 + i] = bc ^ d ^ abx;
+			W[16 + i + 4] = a ^ cd ^ bcx;
+			W[16 + i + 8] = d ^ ab ^ cdx;
+			W[16 + i + 12] = c ^ ab ^ abx ^ bcx ^ cdx;
+
+			a = h[i];
+			b = P[24 + i + 0];
+			c = P[24 + i + 4];
+			d = P[24 + i + 8];
+
+			ab = a ^ b;
+			bc = b ^ c;
+			cd = c ^ d;
+
+
+			t = (ab & 0x80808080);
+			t2 = (bc & 0x80808080);
+			t3 = (cd & 0x80808080);
+
+			abx = (t >> 7) * 27U ^ ((ab^t) << 1);
+			bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
+			cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
+
+			W[32 + i] = bc ^ d ^ abx;
+			W[32 + i + 4] = a ^ cd ^ bcx;
+			W[32 + i + 8] = d ^ ab ^ cdx;
+			W[32 + i + 12] = c ^ ab ^ abx ^ bcx ^ cdx;
+
+			a = P[36 + i ];
+			b = P[36 + i + 4];
+			c = P[36 + i + 8];
+			d = h[i + 12];
+
+			ab = a ^ b;
+			bc = b ^ c;
+			cd = c ^ d;
+
+			t = (ab & 0x80808080);
+			t2 = (bc & 0x80808080);
+			t3 = (cd & 0x80808080);
+
+			abx = (t >> 7) * 27U ^ ((ab^t) << 1);
+			bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
+			cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
+
+			W[48 + i] = bc ^ d ^ abx;
+			W[48 + i + 4] = a ^ cd ^ bcx;
+			W[48 + i + 8] = d ^ ab ^ cdx;
+			W[48 + i + 12] = c ^ ab ^ abx ^ bcx ^ cdx;
+
+		}
+
+		for (int k = 1; k < 10; k++)
+			echo_round_alexis(sharedMemory,W,k0);
+
+		#pragma unroll 4
+		for (int i = 0; i < 16; i += 4)
+		{
+			W[i] ^= W[32 + i] ^ 512;
+			W[i + 1] ^= W[32 + i + 1];
+			W[i + 2] ^= W[32 + i + 2];
+			W[i + 3] ^= W[32 + i + 3];
+		}
+		*(uint2x4*)&Hash[ 0] = *(uint2x4*)&hash[ 0] ^ *(uint2x4*)&W[ 0];
+		*(uint2x4*)&Hash[ 8] = *(uint2x4*)&hash[ 8] ^ *(uint2x4*)&W[ 8];
+	}
+}
+
+__host__
+void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash){
+
+	const uint32_t threadsperblock = 128;
+
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+
+	x16_echo512_gpu_hash_64<<<grid, block>>>(threads, d_hash);
+}
--- a/x16/cuda_x16_fugue512.cu
+++ b/x16/cuda_x16_fugue512.cu
@ -0,0 +1,467 @@
				@@ -0,0 +1,467 @@
+
+#include <stdio.h>
+#include <cuda_helper.h>
+
+#define TPB 256
+
+/*
+ * fugue512-80 x16r kernel implementation.
+ *
+ * ==========================(LICENSE BEGIN)============================
+ *
+ * Copyright (c) 2018 tpruvot
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * ===========================(LICENSE END)=============================
+ */
+
+#ifdef __INTELLISENSE__
+#define __byte_perm(x, y, m) (x|y)
+#define tex1Dfetch(t, n) (n)
+#define __CUDACC__
+#include <cuda_texture_types.h>
+#endif
+
+// store allocated textures device addresses
+static unsigned int* d_textures[MAX_GPUS][1];
+
+#define mixtab0(x) mixtabs[(x)]
+#define mixtab1(x) mixtabs[(x)+256]
+#define mixtab2(x) mixtabs[(x)+512]
+#define mixtab3(x) mixtabs[(x)+768]
+
+static texture<unsigned int, 1, cudaReadModeElementType> mixTab0Tex;
+
+static const uint32_t mixtab0[] = {
+	0x63633297, 0x7c7c6feb, 0x77775ec7, 0x7b7b7af7, 0xf2f2e8e5, 0x6b6b0ab7, 0x6f6f16a7, 0xc5c56d39,
+	0x303090c0, 0x01010704, 0x67672e87, 0x2b2bd1ac, 0xfefeccd5, 0xd7d71371, 0xabab7c9a, 0x767659c3,
+	0xcaca4005, 0x8282a33e, 0xc9c94909, 0x7d7d68ef, 0xfafad0c5, 0x5959947f, 0x4747ce07, 0xf0f0e6ed,
+	0xadad6e82, 0xd4d41a7d, 0xa2a243be, 0xafaf608a, 0x9c9cf946, 0xa4a451a6, 0x727245d3, 0xc0c0762d,
+	0xb7b728ea, 0xfdfdc5d9, 0x9393d47a, 0x2626f298, 0x363682d8, 0x3f3fbdfc, 0xf7f7f3f1, 0xcccc521d,
+	0x34348cd0, 0xa5a556a2, 0xe5e58db9, 0xf1f1e1e9, 0x71714cdf, 0xd8d83e4d, 0x313197c4, 0x15156b54,
+	0x04041c10, 0xc7c76331, 0x2323e98c, 0xc3c37f21, 0x18184860, 0x9696cf6e, 0x05051b14, 0x9a9aeb5e,
+	0x0707151c, 0x12127e48, 0x8080ad36, 0xe2e298a5, 0xebeba781, 0x2727f59c, 0xb2b233fe, 0x757550cf,
+	0x09093f24, 0x8383a43a, 0x2c2cc4b0, 0x1a1a4668, 0x1b1b416c, 0x6e6e11a3, 0x5a5a9d73, 0xa0a04db6,
+	0x5252a553, 0x3b3ba1ec, 0xd6d61475, 0xb3b334fa, 0x2929dfa4, 0xe3e39fa1, 0x2f2fcdbc, 0x8484b126,
+	0x5353a257, 0xd1d10169, 0x00000000, 0xededb599, 0x2020e080, 0xfcfcc2dd, 0xb1b13af2, 0x5b5b9a77,
+	0x6a6a0db3, 0xcbcb4701, 0xbebe17ce, 0x3939afe4, 0x4a4aed33, 0x4c4cff2b, 0x5858937b, 0xcfcf5b11,
+	0xd0d0066d, 0xefefbb91, 0xaaaa7b9e, 0xfbfbd7c1, 0x4343d217, 0x4d4df82f, 0x333399cc, 0x8585b622,
+	0x4545c00f, 0xf9f9d9c9, 0x02020e08, 0x7f7f66e7, 0x5050ab5b, 0x3c3cb4f0, 0x9f9ff04a, 0xa8a87596,
+	0x5151ac5f, 0xa3a344ba, 0x4040db1b, 0x8f8f800a, 0x9292d37e, 0x9d9dfe42, 0x3838a8e0, 0xf5f5fdf9,
+	0xbcbc19c6, 0xb6b62fee, 0xdada3045, 0x2121e784, 0x10107040, 0xffffcbd1, 0xf3f3efe1, 0xd2d20865,
+	0xcdcd5519, 0x0c0c2430, 0x1313794c, 0xececb29d, 0x5f5f8667, 0x9797c86a, 0x4444c70b, 0x1717655c,
+	0xc4c46a3d, 0xa7a758aa, 0x7e7e61e3, 0x3d3db3f4, 0x6464278b, 0x5d5d886f, 0x19194f64, 0x737342d7,
+	0x60603b9b, 0x8181aa32, 0x4f4ff627, 0xdcdc225d, 0x2222ee88, 0x2a2ad6a8, 0x9090dd76, 0x88889516,
+	0x4646c903, 0xeeeebc95, 0xb8b805d6, 0x14146c50, 0xdede2c55, 0x5e5e8163, 0x0b0b312c, 0xdbdb3741,
+	0xe0e096ad, 0x32329ec8, 0x3a3aa6e8, 0x0a0a3628, 0x4949e43f, 0x06061218, 0x2424fc90, 0x5c5c8f6b,
+	0xc2c27825, 0xd3d30f61, 0xacac6986, 0x62623593, 0x9191da72, 0x9595c662, 0xe4e48abd, 0x797974ff,
+	0xe7e783b1, 0xc8c84e0d, 0x373785dc, 0x6d6d18af, 0x8d8d8e02, 0xd5d51d79, 0x4e4ef123, 0xa9a97292,
+	0x6c6c1fab, 0x5656b943, 0xf4f4fafd, 0xeaeaa085, 0x6565208f, 0x7a7a7df3, 0xaeae678e, 0x08083820,
+	0xbaba0bde, 0x787873fb, 0x2525fb94, 0x2e2ecab8, 0x1c1c5470, 0xa6a65fae, 0xb4b421e6, 0xc6c66435,
+	0xe8e8ae8d, 0xdddd2559, 0x747457cb, 0x1f1f5d7c, 0x4b4bea37, 0xbdbd1ec2, 0x8b8b9c1a, 0x8a8a9b1e,
+	0x70704bdb, 0x3e3ebaf8, 0xb5b526e2, 0x66662983, 0x4848e33b, 0x0303090c, 0xf6f6f4f5, 0x0e0e2a38,
+	0x61613c9f, 0x35358bd4, 0x5757be47, 0xb9b902d2, 0x8686bf2e, 0xc1c17129, 0x1d1d5374, 0x9e9ef74e,
+	0xe1e191a9, 0xf8f8decd, 0x9898e556, 0x11117744, 0x696904bf, 0xd9d93949, 0x8e8e870e, 0x9494c166,
+	0x9b9bec5a, 0x1e1e5a78, 0x8787b82a, 0xe9e9a989, 0xcece5c15, 0x5555b04f, 0x2828d8a0, 0xdfdf2b51,
+	0x8c8c8906, 0xa1a14ab2, 0x89899212, 0x0d0d2334, 0xbfbf10ca, 0xe6e684b5, 0x4242d513, 0x686803bb,
+	0x4141dc1f, 0x9999e252, 0x2d2dc3b4, 0x0f0f2d3c, 0xb0b03df6, 0x5454b74b, 0xbbbb0cda, 0x16166258
+};
+
+#define TIX4(q, x00, x01, x04, x07, x08, x22, x24, x27, x30) { \
+	x22 ^= x00; \
+	x00 = (q); \
+	x08 ^= x00; \
+	x01 ^= x24; \
+	x04 ^= x27; \
+	x07 ^= x30; \
+}
+
+#define CMIX36(x00, x01, x02, x04, x05, x06, x18, x19, x20) { \
+	x00 ^= x04; \
+	x01 ^= x05; \
+	x02 ^= x06; \
+	x18 ^= x04; \
+	x19 ^= x05; \
+	x20 ^= x06; \
+}
+
+#define SMIX(x0, x1, x2, x3) { \
+	uint32_t tmp; \
+	uint32_t r0 = 0; \
+	uint32_t r1 = 0; \
+	uint32_t r2 = 0; \
+	uint32_t r3 = 0; \
+	uint32_t c0 = mixtab0(x0 >> 24); \
+	tmp = mixtab1((x0 >> 16) & 0xFF); \
+	c0 ^= tmp; \
+	r1 ^= tmp; \
+	tmp = mixtab2((x0 >>  8) & 0xFF); \
+	c0 ^= tmp; \
+	r2 ^= tmp; \
+	tmp = mixtab3(x0 & 0xFF); \
+	c0 ^= tmp; \
+	r3 ^= tmp; \
+	tmp = mixtab0(x1 >> 24); \
+	uint32_t c1 = tmp; \
+	r0 ^= tmp; \
+	tmp = mixtab1((x1 >> 16) & 0xFF); \
+	c1 ^= tmp; \
+	tmp = mixtab2((x1 >>  8) & 0xFF); \
+	c1 ^= tmp; \
+	r2 ^= tmp; \
+	tmp = mixtab3(x1 & 0xFF); \
+	c1 ^= tmp; \
+	r3 ^= tmp; \
+	tmp = mixtab0(x2 >> 24); \
+	uint32_t c2 = tmp; \
+	r0 ^= tmp; \
+	tmp = mixtab1((x2 >> 16) & 0xFF); \
+	c2 ^= tmp; \
+	r1 ^= tmp; \
+	tmp = mixtab2((x2 >>  8) & 0xFF); \
+	c2 ^= tmp; \
+	tmp = mixtab3(x2 & 0xFF); \
+	c2 ^= tmp; \
+	r3 ^= tmp; \
+	tmp = mixtab0(x3 >> 24); \
+	uint32_t c3 = tmp; \
+	r0 ^= tmp; \
+	tmp = mixtab1((x3 >> 16) & 0xFF); \
+	c3 ^= tmp; \
+	r1 ^= tmp; \
+	tmp = mixtab2((x3 >>  8) & 0xFF); \
+	c3 ^= tmp; \
+	r2 ^= tmp; \
+	tmp = mixtab3(x3 & 0xFF); \
+	c3 ^= tmp; \
+	x0 = ((c0 ^ r0) & 0xFF000000) | ((c1 ^ r1) & 0x00FF0000) \
+		| ((c2 ^ r2) & 0x0000FF00) | ((c3 ^ r3) & 0x000000FF); \
+	x1 = ((c1 ^ (r0 <<  8)) & 0xFF000000) | ((c2 ^ (r1 <<  8)) & 0x00FF0000) \
+		| ((c3 ^ (r2 <<  8)) & 0x0000FF00) | ((c0 ^ (r3 >> 24)) & 0x000000FF); \
+	x2 = ((c2 ^ (r0 << 16)) & 0xFF000000) | ((c3 ^ (r1 << 16)) & 0x00FF0000) \
+		| ((c0 ^ (r2 >> 16)) & 0x0000FF00) | ((c1 ^ (r3 >> 16)) & 0x000000FF); \
+	x3 = ((c3 ^ (r0 << 24)) & 0xFF000000) | ((c0 ^ (r1 >>  8)) & 0x00FF0000) \
+		| ((c1 ^ (r2 >>  8)) & 0x0000FF00) | ((c2 ^ (r3 >>  8)) & 0x000000FF); \
+}
+
+#define SUB_ROR3 { \
+	B33 = S33, B34 = S34, B35 = S35; \
+	S35 = S32; S34 = S31; S33 = S30; S32 = S29; S31 = S28; S30 = S27; S29 = S26; S28 = S25; S27 = S24; \
+	S26 = S23; S25 = S22; S24 = S21; S23 = S20; S22 = S19; S21 = S18; S20 = S17; S19 = S16; S18 = S15; \
+	S17 = S14; S16 = S13; S15 = S12; S14 = S11; S13 = S10; S12 = S09; S11 = S08; S10 = S07; S09 = S06; \
+	S08 = S05; S07 = S04; S06 = S03; S05 = S02; S04 = S01; S03 = S00; S02 = B35; S01 = B34; S00 = B33; \
+}
+
+#define SUB_ROR8 { \
+	B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \
+	S35 = S27; S34 = S26; S33 = S25; S32 = S24; S31 = S23; S30 = S22; S29 = S21; S28 = S20; S27 = S19; \
+	S26 = S18; S25 = S17; S24 = S16; S23 = S15; S22 = S14; S21 = S13; S20 = S12; S19 = S11; S18 = S10; \
+	S17 = S09; S16 = S08; S15 = S07; S14 = S06; S13 = S05; S12 = S04; S11 = S03; S10 = S02; S09 = S01; \
+	S08 = S00; S07 = B35; S06 = B34; S05 = B33; S04 = B32; S03 = B31; S02 = B30; S01 = B29; S00 = B28; \
+}
+
+#define SUB_ROR9 { \
+	B27 = S27, B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \
+	S35 = S26; S34 = S25; S33 = S24; S32 = S23; S31 = S22; S30 = S21; S29 = S20; S28 = S19; S27 = S18; \
+	S26 = S17; S25 = S16; S24 = S15; S23 = S14; S22 = S13; S21 = S12; S20 = S11; S19 = S10; S18 = S09; \
+	S17 = S08; S16 = S07; S15 = S06; S14 = S05; S13 = S04; S12 = S03; S11 = S02; S10 = S01; S09 = S00; \
+	S08 = B35; S07 = B34; S06 = B33; S05 = B32; S04 = B31; S03 = B30; S02 = B29; S01 = B28; S00 = B27; \
+}
+
+#define SUB_ROR9_3 { \
+	SUB_ROR3; SUB_ROR3; SUB_ROR3; \
+}
+
+#define SUB_ROR12 { /* to fix */ \
+	B24 = S00; B25 = S01; B26 = S02; B27 = S03; B28 = S04; B29 = S05; B30 = S06; B31 = S07; B32 = S08; B33 = S09; B34 = S10; B35 = S11; \
+	S00 = S12; S01 = S13; S02 = S14; S03 = S15; S04 = S16; S05 = S17; S06 = S18; S07 = S19; S08 = S20; S09 = S21; S10 = S22; S11 = S23; \
+	S12 = S24; S13 = S25; S14 = S26; S15 = S27; S16 = S28; S17 = S29; S18 = S30; S19 = S31; S20 = S32; S21 = S33; S22 = S34; S23 = S35; \
+	S24 = B24; S25 = B25; S26 = B26; S27 = B27; S28 = B28; S29 = B29; S30 = B30; S31 = B31; S32 = B32; S33 = B33; S34 = B34; S35 = B35; \
+}
+
+#define FUGUE512_3(x, y, z) { \
+	TIX4(x, S00, S01, S04, S07, S08, S22, S24, S27, S30); \
+	CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \
+	SMIX(S33, S34, S35, S00); \
+	CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \
+	SMIX(S30, S31, S32, S33); \
+	CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \
+	SMIX(S27, S28, S29, S30); \
+	CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \
+	SMIX(S24, S25, S26, S27); \
+	\
+	TIX4(y, S24, S25, S28, S31, S32, S10, S12, S15, S18); \
+	CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); \
+	SMIX(S21, S22, S23, S24); \
+	CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); \
+	SMIX(S18, S19, S20, S21); \
+	CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); \
+	SMIX(S15, S16, S17, S18); \
+	CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); \
+	SMIX(S12, S13, S14, S15); \
+	\
+	TIX4(z, S12, S13, S16, S19, S20, S34, S00, S03, S06); \
+	CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); \
+	SMIX(S09, S10, S11, S12); \
+	CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); \
+	SMIX(S06, S07, S08, S09); \
+	CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); \
+	SMIX(S03, S04, S05, S06); \
+	CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); \
+	SMIX(S00, S01, S02, S03); \
+}
+
+#define FUGUE512_F(w, x, y, z) { \
+	TIX4(w, S00, S01, S04, S07, S08, S22, S24, S27, S30); \
+	CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \
+	SMIX(S33, S34, S35, S00); \
+	CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \
+	SMIX(S30, S31, S32, S33); \
+	CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \
+	SMIX(S27, S28, S29, S30); \
+	CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \
+	SMIX(S24, S25, S26, S27); \
+	\
+	TIX4(x, S24, S25, S28, S31, S32, S10, S12, S15, S18); \
+	CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); \
+	SMIX(S21, S22, S23, S24); \
+	CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); \
+	SMIX(S18, S19, S20, S21); \
+	CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); \
+	SMIX(S15, S16, S17, S18); \
+	CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); \
+	SMIX(S12, S13, S14, S15); \
+	\
+	TIX4(y, S12, S13, S16, S19, S20, S34, S00, S03, S06); \
+	CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); \
+	SMIX(S09, S10, S11, S12); \
+	CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); \
+	SMIX(S06, S07, S08, S09); \
+	CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); \
+	SMIX(S03, S04, S05, S06); \
+	CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); \
+	SMIX(S00, S01, S02, S03); \
+	\
+	TIX4(z, S00, S01, S04, S07, S08, S22, S24, S27, S30); \
+	CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \
+	SMIX(S33, S34, S35, S00); \
+	CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \
+	SMIX(S30, S31, S32, S33); \
+	CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \
+	SMIX(S27, S28, S29, S30); \
+	CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \
+	SMIX(S24, S25, S26, S27); \
+}
+
+#undef ROL8
+#ifdef __CUDA_ARCH__
+__device__ __forceinline__
+uint32_t ROL8(const uint32_t a) {
+	return __byte_perm(a, 0, 0x2103);
+}
+__device__ __forceinline__
+uint32_t ROR8(const uint32_t a) {
+	return __byte_perm(a, 0, 0x0321);
+}
+__device__ __forceinline__
+uint32_t ROL16(const uint32_t a) {
+	return __byte_perm(a, 0, 0x1032);
+}
+#else
+#define ROL8(u)  ROTL32(u, 8)
+#define ROR8(u)  ROTR32(u, 8)
+#define ROL16(u) ROTL32(u,16)
+#endif
+
+//#define AS_UINT4(addr) *((uint4*)(addr))
+
+__constant__ static uint64_t c_PaddedMessage80[10];
+
+__host__
+void x16_fugue512_setBlock_80(void *pdata)
+{
+	cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
+}
+
+/***************************************************/
+
+__global__
+__launch_bounds__(TPB)
+void x16_fugue512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash)
+{
+	__shared__ uint32_t mixtabs[1024];
+
+	// load shared mem (with 256 threads)
+	const uint32_t thr = threadIdx.x & 0xFF;
+	const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr);
+	mixtabs[thr] = tmp;
+	mixtabs[thr+256] = ROR8(tmp);
+	mixtabs[thr+512] = ROL16(tmp);
+	mixtabs[thr+768] = ROL8(tmp);
+#if TPB <= 256
+	if (blockDim.x < 256) {
+		const uint32_t thr = (threadIdx.x + 0x80) & 0xFF;
+		const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr);
+		mixtabs[thr] = tmp;
+		mixtabs[thr + 256] = ROR8(tmp);
+		mixtabs[thr + 512] = ROL16(tmp);
+		mixtabs[thr + 768] = ROL8(tmp);
+	}
+#endif
+
+	__syncthreads();
+
+	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint32_t Data[20];
+
+		#pragma unroll
+		for(int i = 0; i < 10; i++)
+			AS_UINT2(&Data[i * 2]) = AS_UINT2(&c_PaddedMessage80[i]);
+		Data[19] = (startNonce + thread);
+
+		uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09, S10, S11;
+		uint32_t S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22, S23;
+		uint32_t S24, S25, S26, S27, S28, S29, S30, S31, S32, S33, S34, S35;
+		//uint32_t B24, B25, B26,
+		uint32_t B27, B28, B29, B30, B31, B32, B33, B34, B35;
+		//const uint64_t bc = 640 bits to hash
+		//const uint32_t bclo = (uint32_t)(bc);
+		//const uint32_t bchi = (uint32_t)(bc >> 32);
+
+		S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = 0;
+		S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0;
+		S20 = 0x8807a57e; S21 = 0xe616af75; S22 = 0xc5d3e4db; S23 = 0xac9ab027;
+		S24 = 0xd915f117; S25 = 0xb6eecc54; S26 = 0x06e8020b; S27 = 0x4a92efd1;
+		S28 = 0xaac6e2c9; S29 = 0xddb21398; S30 = 0xcae65838; S31 = 0x437f203f;
+		S32 = 0x25ea78e7; S33 = 0x951fddd6; S34 = 0xda6ed11d; S35 = 0xe13e3567;
+
+		FUGUE512_3((Data[ 0]), (Data[ 1]), (Data[ 2]));
+		FUGUE512_3((Data[ 3]), (Data[ 4]), (Data[ 5]));
+		FUGUE512_3((Data[ 6]), (Data[ 7]), (Data[ 8]));
+		FUGUE512_3((Data[ 9]), (Data[10]), (Data[11]));
+		FUGUE512_3((Data[12]), (Data[13]), (Data[14]));
+		FUGUE512_3((Data[15]), (Data[16]), (Data[17]));
+		FUGUE512_F((Data[18]), (Data[19]), 0/*bchi*/, (80*8)/*bclo*/);
+
+		// rotate right state by 3 dwords (S00 = S33, S03 = S00)
+		SUB_ROR3;
+		SUB_ROR9;
+
+		#pragma unroll 32
+		for (int i = 0; i < 32; i++) {
+			SUB_ROR3;
+			CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20);
+			SMIX(S00, S01, S02, S03);
+		}
+		#pragma unroll 13
+		for (int i = 0; i < 13; i++) {
+			S04 ^= S00;
+			S09 ^= S00;
+			S18 ^= S00;
+			S27 ^= S00;
+			SUB_ROR9;
+			SMIX(S00, S01, S02, S03);
+			S04 ^= S00;
+			S10 ^= S00;
+			S18 ^= S00;
+			S27 ^= S00;
+			SUB_ROR9;
+			SMIX(S00, S01, S02, S03);
+			S04 ^= S00;
+			S10 ^= S00;
+			S19 ^= S00;
+			S27 ^= S00;
+			SUB_ROR9;
+			SMIX(S00, S01, S02, S03);
+			S04 ^= S00;
+			S10 ^= S00;
+			S19 ^= S00;
+			S28 ^= S00;
+			SUB_ROR8;
+			SMIX(S00, S01, S02, S03);
+		}
+		S04 ^= S00;
+		S09 ^= S00;
+		S18 ^= S00;
+		S27 ^= S00;
+
+		Data[ 0] = cuda_swab32(S01);
+		Data[ 1] = cuda_swab32(S02);
+		Data[ 2] = cuda_swab32(S03);
+		Data[ 3] = cuda_swab32(S04);
+		Data[ 4] = cuda_swab32(S09);
+		Data[ 5] = cuda_swab32(S10);
+		Data[ 6] = cuda_swab32(S11);
+		Data[ 7] = cuda_swab32(S12);
+		Data[ 8] = cuda_swab32(S18);
+		Data[ 9] = cuda_swab32(S19);
+		Data[10] = cuda_swab32(S20);
+		Data[11] = cuda_swab32(S21);
+		Data[12] = cuda_swab32(S27);
+		Data[13] = cuda_swab32(S28);
+		Data[14] = cuda_swab32(S29);
+		Data[15] = cuda_swab32(S30);
+
+		const size_t hashPosition = thread;
+		uint64_t* pHash = &g_hash[hashPosition << 3];
+		#pragma unroll 4
+		for(int i = 0; i < 4; i++)
+			AS_UINT4(&pHash[i * 2]) = AS_UINT4(&Data[i * 4]);
+	}
+}
+
+#define texDef(id, texname, texmem, texsource, texsize) { \
+	unsigned int *texmem; \
+	cudaMalloc(&texmem, texsize); \
+	d_textures[thr_id][id] = texmem; \
+	cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
+	texname.normalized = 0; \
+	texname.filterMode = cudaFilterModePoint; \
+	texname.addressMode[0] = cudaAddressModeClamp; \
+	{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
+	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \
+	} \
+}
+
+__host__
+void x16_fugue512_cpu_init(int thr_id, uint32_t threads)
+{
+	texDef(0, mixTab0Tex, mixTab0m, mixtab0, sizeof(uint32_t)*256);
+}
+
+__host__
+void x16_fugue512_cpu_free(int thr_id)
+{
+	cudaFree(d_textures[thr_id][0]);
+}
+
+__host__
+void x16_fugue512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash)
+{
+	const uint32_t threadsperblock = TPB;
+
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+
+	x16_fugue512_gpu_hash_80 <<<grid, block>>> (threads, startNonce, (uint64_t*)d_hash);
+}
--- a/x16/cuda_x16_shabal512.cu
+++ b/x16/cuda_x16_shabal512.cu
@ -0,0 +1,350 @@
				@@ -0,0 +1,350 @@
+/*
+* Shabal-512 for X16R
+* tpruvot 2018, based on alexis x14 and xevan kernlx code
+*/
+
+#include <cuda_helper.h>
+#include <cuda_vectors.h>
+#include <cuda_vector_uint2x4.h>
+
+typedef uint32_t sph_u32;
+
+#define C32(x) (x)
+#define T32(x) (x)
+
+#define INPUT_BLOCK_ADD do { \
+		B0 = T32(B0 + M0); \
+		B1 = T32(B1 + M1); \
+		B2 = T32(B2 + M2); \
+		B3 = T32(B3 + M3); \
+		B4 = T32(B4 + M4); \
+		B5 = T32(B5 + M5); \
+		B6 = T32(B6 + M6); \
+		B7 = T32(B7 + M7); \
+		B8 = T32(B8 + M8); \
+		B9 = T32(B9 + M9); \
+		BA = T32(BA + MA); \
+		BB = T32(BB + MB); \
+		BC = T32(BC + MC); \
+		BD = T32(BD + MD); \
+		BE = T32(BE + ME); \
+		BF = T32(BF + MF); \
+		} while (0)
+
+#define INPUT_BLOCK_SUB do { \
+		C0 = T32(C0 - M0); \
+		C1 = T32(C1 - M1); \
+		C2 = T32(C2 - M2); \
+		C3 = T32(C3 - M3); \
+		C4 = T32(C4 - M4); \
+		C5 = T32(C5 - M5); \
+		C6 = T32(C6 - M6); \
+		C7 = T32(C7 - M7); \
+		C8 = T32(C8 - M8); \
+		C9 = T32(C9 - M9); \
+		CA = T32(CA - MA); \
+		CB = T32(CB - MB); \
+		CC = T32(CC - MC); \
+		CD = T32(CD - MD); \
+		CE = T32(CE - ME); \
+		CF = T32(CF - MF); \
+		} while (0)
+
+#define XOR_W   do { \
+		A00 ^= Wlow; \
+		A01 ^= Whigh; \
+		} while (0)
+
+#define SWAP(v1, v2) do { \
+		sph_u32 tmp = (v1); \
+		(v1) = (v2); \
+		(v2) = tmp; \
+		} while (0)
+
+#define SWAP_BC do { \
+		SWAP(B0, C0); \
+		SWAP(B1, C1); \
+		SWAP(B2, C2); \
+		SWAP(B3, C3); \
+		SWAP(B4, C4); \
+		SWAP(B5, C5); \
+		SWAP(B6, C6); \
+		SWAP(B7, C7); \
+		SWAP(B8, C8); \
+		SWAP(B9, C9); \
+		SWAP(BA, CA); \
+		SWAP(BB, CB); \
+		SWAP(BC, CC); \
+		SWAP(BD, CD); \
+		SWAP(BE, CE); \
+		SWAP(BF, CF); \
+		} while (0)
+
+#define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) do { \
+		xa0 = T32((xa0 \
+			^ (((xa1 << 15) | (xa1 >> 17)) * 5U) \
+			^ xc) * 3U) \
+			^ xb1 ^ (xb2 & ~xb3) ^ xm; \
+		xb0 = T32(~(((xb0 << 1) | (xb0 >> 31)) ^ xa0)); \
+		} while (0)
+
+#define PERM_STEP_0 do { \
+		PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \
+		PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \
+		PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \
+		PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \
+		PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \
+		PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \
+		PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \
+		PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \
+		PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \
+		PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \
+		PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \
+		PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \
+		PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \
+		PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \
+		PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \
+		PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \
+		} while (0)
+
+#define PERM_STEP_1 do { \
+		PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \
+		PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \
+		PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \
+		PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \
+		PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \
+		PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \
+		PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \
+		PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \
+		PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \
+		PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \
+		PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \
+		PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \
+		PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \
+		PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \
+		PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \
+		PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \
+		} while (0)
+
+#define PERM_STEP_2 do { \
+		PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \
+		PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \
+		PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \
+		PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \
+		PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \
+		PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \
+		PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \
+		PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \
+		PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \
+		PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \
+		PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \
+		PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \
+		PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \
+		PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \
+		PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \
+		PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \
+		} while (0)
+
+#define APPLY_P do { \
+		B0 = T32(B0 << 17) | (B0 >> 15); \
+		B1 = T32(B1 << 17) | (B1 >> 15); \
+		B2 = T32(B2 << 17) | (B2 >> 15); \
+		B3 = T32(B3 << 17) | (B3 >> 15); \
+		B4 = T32(B4 << 17) | (B4 >> 15); \
+		B5 = T32(B5 << 17) | (B5 >> 15); \
+		B6 = T32(B6 << 17) | (B6 >> 15); \
+		B7 = T32(B7 << 17) | (B7 >> 15); \
+		B8 = T32(B8 << 17) | (B8 >> 15); \
+		B9 = T32(B9 << 17) | (B9 >> 15); \
+		BA = T32(BA << 17) | (BA >> 15); \
+		BB = T32(BB << 17) | (BB >> 15); \
+		BC = T32(BC << 17) | (BC >> 15); \
+		BD = T32(BD << 17) | (BD >> 15); \
+		BE = T32(BE << 17) | (BE >> 15); \
+		BF = T32(BF << 17) | (BF >> 15); \
+		PERM_STEP_0; \
+		PERM_STEP_1; \
+		PERM_STEP_2; \
+		A0B = T32(A0B + C6); \
+		A0A = T32(A0A + C5); \
+		A09 = T32(A09 + C4); \
+		A08 = T32(A08 + C3); \
+		A07 = T32(A07 + C2); \
+		A06 = T32(A06 + C1); \
+		A05 = T32(A05 + C0); \
+		A04 = T32(A04 + CF); \
+		A03 = T32(A03 + CE); \
+		A02 = T32(A02 + CD); \
+		A01 = T32(A01 + CC); \
+		A00 = T32(A00 + CB); \
+		A0B = T32(A0B + CA); \
+		A0A = T32(A0A + C9); \
+		A09 = T32(A09 + C8); \
+		A08 = T32(A08 + C7); \
+		A07 = T32(A07 + C6); \
+		A06 = T32(A06 + C5); \
+		A05 = T32(A05 + C4); \
+		A04 = T32(A04 + C3); \
+		A03 = T32(A03 + C2); \
+		A02 = T32(A02 + C1); \
+		A01 = T32(A01 + C0); \
+		A00 = T32(A00 + CF); \
+		A0B = T32(A0B + CE); \
+		A0A = T32(A0A + CD); \
+		A09 = T32(A09 + CC); \
+		A08 = T32(A08 + CB); \
+		A07 = T32(A07 + CA); \
+		A06 = T32(A06 + C9); \
+		A05 = T32(A05 + C8); \
+		A04 = T32(A04 + C7); \
+		A03 = T32(A03 + C6); \
+		A02 = T32(A02 + C5); \
+		A01 = T32(A01 + C4); \
+		A00 = T32(A00 + C3); \
+	} while (0)
+
+#define INCR_W do { \
+	if ((Wlow = T32(Wlow + 1)) == 0) \
+		Whigh = T32(Whigh + 1); \
+	} while (0)
+
+__constant__ static const sph_u32 A_init_512[] = {
+	C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632),
+	C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B),
+	C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F)
+};
+
+__constant__ static const sph_u32 B_init_512[] = {
+	C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640),
+	C32(0xEB6F56C7), C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08),
+	C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E),
+	C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B)
+};
+
+__constant__ static const sph_u32 C_init_512[] = {
+	C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359),
+	C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780),
+	C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A),
+	C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969)
+};
+
+__constant__ static uint32_t c_PaddedMessage80[20];
+
+__host__
+void x16_shabal512_setBlock_80(void *pdata)
+{
+	cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
+}
+
+#define TPB_SHABAL 256
+
+__global__ __launch_bounds__(TPB_SHABAL, 2)
+void x16_shabal512_gpu_hash_80(uint32_t threads, const uint32_t startNonce, uint32_t *g_hash)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+
+	uint32_t B[] = {
+		0xC1099CB7, 0x07B385F3, 0xE7442C26, 0xCC8AD640, 0xEB6F56C7, 0x1EA81AA9, 0x73B9D314, 0x1DE85D08,
+		0x48910A5A, 0x893B22DB, 0xC5A0DF44, 0xBBC4324E, 0x72D2F240, 0x75941D99, 0x6D8BDE82, 0xA1A7502B
+	};
+	uint32_t M[16];
+
+	if (thread < threads)
+	{
+		// todo: try __ldc
+		*(uint2x4*)&M[0] = *(uint2x4*)&c_PaddedMessage80[0];
+		*(uint2x4*)&M[8] = *(uint2x4*)&c_PaddedMessage80[8];
+
+		sph_u32 A00 = A_init_512[0], A01 = A_init_512[1], A02 = A_init_512[ 2], A03 = A_init_512[ 3];
+		sph_u32 A04 = A_init_512[4], A05 = A_init_512[5], A06 = A_init_512[ 6], A07 = A_init_512[ 7];
+		sph_u32 A08 = A_init_512[8], A09 = A_init_512[9], A0A = A_init_512[10], A0B = A_init_512[11];
+
+		sph_u32 B0 = B_init_512[ 0], B1 = B_init_512[ 1], B2 = B_init_512[ 2], B3 = B_init_512 [3];
+		sph_u32 B4 = B_init_512[ 4], B5 = B_init_512[ 5], B6 = B_init_512[ 6], B7 = B_init_512[ 7];
+		sph_u32 B8 = B_init_512[ 8], B9 = B_init_512[ 9], BA = B_init_512[10], BB = B_init_512[11];
+		sph_u32 BC = B_init_512[12], BD = B_init_512[13], BE = B_init_512[14], BF = B_init_512[15];
+
+		sph_u32 C0 = C_init_512[ 0], C1 = C_init_512[ 1], C2 = C_init_512[ 2], C3 = C_init_512[ 3];
+		sph_u32 C4 = C_init_512[ 4], C5 = C_init_512[ 5], C6 = C_init_512[ 6], C7 = C_init_512[ 7];
+		sph_u32 C8 = C_init_512[ 8], C9 = C_init_512[ 9], CA = C_init_512[10], CB = C_init_512[11];
+		sph_u32 CC = C_init_512[12], CD = C_init_512[13], CE = C_init_512[14], CF = C_init_512[15];
+
+		sph_u32 M0, M1, M2, M3, M4, M5, M6, M7, M8, M9, MA, MB, MC, MD, ME, MF;
+		sph_u32 Wlow = 1, Whigh = 0;
+
+		M0 = M[ 0];
+		M1 = M[ 1];
+		M2 = M[ 2];
+		M3 = M[ 3];
+		M4 = M[ 4];
+		M5 = M[ 5];
+		M6 = M[ 6];
+		M7 = M[ 7];
+		M8 = M[ 8];
+		M9 = M[ 9];
+		MA = M[10];
+		MB = M[11];
+		MC = M[12];
+		MD = M[13];
+		ME = M[14];
+		MF = M[15];
+
+		INPUT_BLOCK_ADD;
+		XOR_W;
+		APPLY_P;
+		INPUT_BLOCK_SUB;
+		SWAP_BC;
+		INCR_W;
+
+		M0 = c_PaddedMessage80[16];
+		M1 = c_PaddedMessage80[17];
+		M2 = c_PaddedMessage80[18];
+		M3 = cuda_swab32(startNonce + thread);
+		M4 = 0x80;
+		M5 = M6 = M7 = M8 = M9 = MA = MB = MC = MD = ME = MF = 0;
+
+		INPUT_BLOCK_ADD;
+		XOR_W;
+		APPLY_P;
+
+		for (unsigned i = 0; i < 3; i++) {
+			SWAP_BC;
+			XOR_W;
+			APPLY_P;
+		}
+
+		B[ 0] = B0;
+		B[ 1] = B1;
+		B[ 2] = B2;
+		B[ 3] = B3;
+		B[ 4] = B4;
+		B[ 5] = B5;
+		B[ 6] = B6;
+		B[ 7] = B7;
+		B[ 8] = B8;
+		B[ 9] = B9;
+		B[10] = BA;
+		B[11] = BB;
+		B[12] = BC;
+		B[13] = BD;
+		B[14] = BE;
+		B[15] = BF;
+
+		// output
+		uint64_t hashPosition = thread;
+		uint32_t *Hash = &g_hash[hashPosition << 4];
+		*(uint2x4*)&Hash[0] = *(uint2x4*)&B[0];
+		*(uint2x4*)&Hash[8] = *(uint2x4*)&B[8];
+	}
+}
+
+__host__
+void x16_shabal512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash)
+{
+	const uint32_t threadsperblock = TPB_SHABAL;
+
+	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
+	dim3 block(threadsperblock);
+
+	x16_shabal512_gpu_hash_80 <<<grid, block >>>(threads, startNonce, d_hash);
+}
--- a/x16/cuda_x16_simd512_80.cu
+++ b/x16/cuda_x16_simd512_80.cu
--- a/x16/x16r.cu
+++ b/x16/x16r.cu
@ -0,0 +1,622 @@
				@@ -0,0 +1,622 @@
+/**
+ * X16R algorithm (X16 with Randomized chain order)
+ *
+ * tpruvot 2018 - GPL code
+ */
+
+#include <stdio.h>
+#include <memory.h>
+#include <unistd.h>
+
+extern "C" {
+#include "sph/sph_blake.h"
+#include "sph/sph_bmw.h"
+#include "sph/sph_groestl.h"
+#include "sph/sph_skein.h"
+#include "sph/sph_jh.h"
+#include "sph/sph_keccak.h"
+
+#include "sph/sph_luffa.h"
+#include "sph/sph_cubehash.h"
+#include "sph/sph_shavite.h"
+#include "sph/sph_simd.h"
+#include "sph/sph_echo.h"
+
+#include "sph/sph_hamsi.h"
+#include "sph/sph_fugue.h"
+#include "sph/sph_shabal.h"
+#include "sph/sph_whirlpool.h"
+#include "sph/sph_sha2.h"
+}
+
+#include "miner.h"
+#include "cuda_helper.h"
+#include "cuda_x16.h"
+
+static uint32_t *d_hash[MAX_GPUS];
+
+enum Algo {
+	BLAKE = 0,
+	BMW,
+	GROESTL,
+	JH,
+	KECCAK,
+	SKEIN,
+	LUFFA,
+	CUBEHASH,
+	SHAVITE,
+	SIMD,
+	ECHO,
+	HAMSI,
+	FUGUE,
+	SHABAL,
+	WHIRLPOOL,
+	SHA512,
+	HASH_FUNC_COUNT
+};
+
+static const char* algo_strings[] = {
+	"blake",
+	"bmw512",
+	"groestl",
+	"jh512",
+	"keccak",
+	"skein",
+	"luffa",
+	"cube",
+	"shavite",
+	"simd",
+	"echo",
+	"hamsi",
+	"fugue",
+	"shabal",
+	"whirlpool",
+	"sha512",
+	NULL
+};
+
+static __thread uint32_t s_ntime = UINT32_MAX;
+static __thread char hashOrder[HASH_FUNC_COUNT + 1] = { 0 };
+
+static void getAlgoString(const uint32_t* prevblock, char *output)
+{
+	char *sptr = output;
+	uint8_t* data = (uint8_t*)prevblock;
+
+	for (uint8_t j = 0; j < HASH_FUNC_COUNT; j++) {
+		uint8_t b = (15 - j) >> 1; // 16 ascii hex chars, reversed
+		uint8_t algoDigit = (j & 1) ? data[b] & 0xF : data[b] >> 4;
+		if (algoDigit >= 10)
+			sprintf(sptr, "%c", 'A' + (algoDigit - 10));
+		else
+			sprintf(sptr, "%u", (uint32_t) algoDigit);
+		sptr++;
+	}
+	*sptr = '\0';
+}
+
+// X16R CPU Hash (Validation)
+extern "C" void x16r_hash(void *output, const void *input)
+{
+	unsigned char _ALIGN(64) hash[128];
+
+	sph_blake512_context ctx_blake;
+	sph_bmw512_context ctx_bmw;
+	sph_groestl512_context ctx_groestl;
+	sph_jh512_context ctx_jh;
+	sph_keccak512_context ctx_keccak;
+	sph_skein512_context ctx_skein;
+	sph_luffa512_context ctx_luffa;
+	sph_cubehash512_context ctx_cubehash;
+	sph_shavite512_context ctx_shavite;
+	sph_simd512_context ctx_simd;
+	sph_echo512_context ctx_echo;
+	sph_hamsi512_context ctx_hamsi;
+	sph_fugue512_context ctx_fugue;
+	sph_shabal512_context ctx_shabal;
+	sph_whirlpool_context ctx_whirlpool;
+	sph_sha512_context ctx_sha512;
+
+	void *in = (void*) input;
+	int size = 80;
+
+	uint32_t *in32 = (uint32_t*) input;
+	getAlgoString(&in32[1], hashOrder);
+
+	for (int i = 0; i < 16; i++)
+	{
+		const char elem = hashOrder[i];
+		const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+
+		switch (algo) {
+		case BLAKE:
+			sph_blake512_init(&ctx_blake);
+			sph_blake512(&ctx_blake, in, size);
+			sph_blake512_close(&ctx_blake, hash);
+			break;
+		case BMW:
+			sph_bmw512_init(&ctx_bmw);
+			sph_bmw512(&ctx_bmw, in, size);
+			sph_bmw512_close(&ctx_bmw, hash);
+			break;
+		case GROESTL:
+			sph_groestl512_init(&ctx_groestl);
+			sph_groestl512(&ctx_groestl, in, size);
+			sph_groestl512_close(&ctx_groestl, hash);
+			break;
+		case SKEIN:
+			sph_skein512_init(&ctx_skein);
+			sph_skein512(&ctx_skein, in, size);
+			sph_skein512_close(&ctx_skein, hash);
+			break;
+		case JH:
+			sph_jh512_init(&ctx_jh);
+			sph_jh512(&ctx_jh, in, size);
+			sph_jh512_close(&ctx_jh, hash);
+			break;
+		case KECCAK:
+			sph_keccak512_init(&ctx_keccak);
+			sph_keccak512(&ctx_keccak, in, size);
+			sph_keccak512_close(&ctx_keccak, hash);
+			break;
+		case LUFFA:
+			sph_luffa512_init(&ctx_luffa);
+			sph_luffa512(&ctx_luffa, in, size);
+			sph_luffa512_close(&ctx_luffa, hash);
+			break;
+		case CUBEHASH:
+			sph_cubehash512_init(&ctx_cubehash);
+			sph_cubehash512(&ctx_cubehash, in, size);
+			sph_cubehash512_close(&ctx_cubehash, hash);
+			break;
+		case SHAVITE:
+			sph_shavite512_init(&ctx_shavite);
+			sph_shavite512(&ctx_shavite, in, size);
+			sph_shavite512_close(&ctx_shavite, hash);
+			break;
+		case SIMD:
+			sph_simd512_init(&ctx_simd);
+			sph_simd512(&ctx_simd, in, size);
+			sph_simd512_close(&ctx_simd, hash);
+			break;
+		case ECHO:
+			sph_echo512_init(&ctx_echo);
+			sph_echo512(&ctx_echo, in, size);
+			sph_echo512_close(&ctx_echo, hash);
+			break;
+		case HAMSI:
+			sph_hamsi512_init(&ctx_hamsi);
+			sph_hamsi512(&ctx_hamsi, in, size);
+			sph_hamsi512_close(&ctx_hamsi, hash);
+			break;
+		case FUGUE:
+			sph_fugue512_init(&ctx_fugue);
+			sph_fugue512(&ctx_fugue, in, size);
+			sph_fugue512_close(&ctx_fugue, hash);
+			break;
+		case SHABAL:
+			sph_shabal512_init(&ctx_shabal);
+			sph_shabal512(&ctx_shabal, in, size);
+			sph_shabal512_close(&ctx_shabal, hash);
+			break;
+		case WHIRLPOOL:
+			sph_whirlpool_init(&ctx_whirlpool);
+			sph_whirlpool(&ctx_whirlpool, in, size);
+			sph_whirlpool_close(&ctx_whirlpool, hash);
+			break;
+		case SHA512:
+			sph_sha512_init(&ctx_sha512);
+			sph_sha512(&ctx_sha512,(const void*) in, size);
+			sph_sha512_close(&ctx_sha512,(void*) hash);
+			break;
+		}
+		in = (void*) hash;
+		size = 64;
+	}
+	memcpy(output, hash, 32);
+}
+
+void whirlpool_midstate(void *state, const void *input)
+{
+	sph_whirlpool_context ctx;
+
+	sph_whirlpool_init(&ctx);
+	sph_whirlpool(&ctx, input, 64);
+
+	memcpy(state, ctx.state, 64);
+}
+
+static bool init[MAX_GPUS] = { 0 };
+static bool use_compat_kernels[MAX_GPUS] = { 0 };
+
+//#define _DEBUG
+#define _DEBUG_PREFIX "x16r-"
+#include "cuda_debug.cuh"
+
+//static int algo80_tests[HASH_FUNC_COUNT] = { 0 };
+//static int algo64_tests[HASH_FUNC_COUNT] = { 0 };
+static int algo80_fails[HASH_FUNC_COUNT] = { 0 };
+
+extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+{
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	const uint32_t first_nonce = pdata[19];
+	const int dev_id = device_map[thr_id];
+	int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 19;
+	if (strstr(device_name[dev_id], "GTX 1080")) intensity = 20;
+	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
+	//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
+
+	if (!init[thr_id])
+	{
+		cudaSetDevice(device_map[thr_id]);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
+
+		cuda_get_arch(thr_id);
+		use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
+		if (use_compat_kernels[thr_id])
+			x11_echo512_cpu_init(thr_id, throughput);
+
+		quark_blake512_cpu_init(thr_id, throughput);
+		quark_bmw512_cpu_init(thr_id, throughput);
+		quark_groestl512_cpu_init(thr_id, throughput);
+		quark_skein512_cpu_init(thr_id, throughput);
+		quark_jh512_cpu_init(thr_id, throughput);
+		quark_keccak512_cpu_init(thr_id, throughput);
+		qubit_luffa512_cpu_init(thr_id, throughput);
+		x11_luffa512_cpu_init(thr_id, throughput); // 64
+		x11_shavite512_cpu_init(thr_id, throughput);
+		x11_simd512_cpu_init(thr_id, throughput); // 64
+		x16_echo512_cuda_init(thr_id, throughput);
+		x13_hamsi512_cpu_init(thr_id, throughput);
+		x13_fugue512_cpu_init(thr_id, throughput);
+		x16_fugue512_cpu_init(thr_id, throughput);
+		x14_shabal512_cpu_init(thr_id, throughput);
+		x15_whirlpool_cpu_init(thr_id, throughput, 0);
+		x16_whirlpool512_init(thr_id, throughput);
+		x17_sha512_cpu_init(thr_id, throughput);
+
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
+
+		cuda_check_cpu_init(thr_id, throughput);
+
+		init[thr_id] = true;
+	}
+
+	if (opt_benchmark) {
+		((uint32_t*)ptarget)[7] = 0x003f;
+		//((uint8_t*)pdata)[8] = 0x90; // hashOrder[0] = '9'; for simd 80 + blake512 64
+		((uint8_t*)pdata)[8] = 0xAA; // hashOrder[0] = 'A'; for echo 80 + 64
+		//((uint8_t*)pdata)[8] = 0xB0; // hashOrder[0] = 'B'; for hamsi 80 + blake512 64
+		//((uint8_t*)pdata)[8] = 0xC0; // hashOrder[0] = 'C'; for fugue 80 + blake512 64
+		//((uint8_t*)pdata)[8] = 0xE0; // hashOrder[0] = 'E'; for whirlpool 80 + blake512 64
+	}
+	uint32_t _ALIGN(64) endiandata[20];
+
+	for (int k=0; k < 19; k++)
+		be32enc(&endiandata[k], pdata[k]);
+
+	uint32_t ntime = swab32(pdata[17]);
+	if (s_ntime != ntime) {
+		getAlgoString(&endiandata[1], hashOrder);
+		s_ntime = ntime;
+		if (opt_debug && !thr_id) applog(LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime);
+	}
+
+	cuda_check_cpu_setTarget(ptarget);
+
+	char elem = hashOrder[0];
+	const uint8_t algo80 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+
+	switch (algo80) {
+		case BLAKE:
+			quark_blake512_cpu_setBlock_80(thr_id, endiandata);
+			break;
+		case BMW:
+			quark_bmw512_cpu_setBlock_80(endiandata);
+			break;
+		case GROESTL:
+			groestl512_setBlock_80(thr_id, endiandata);
+			break;
+		case JH:
+			jh512_setBlock_80(thr_id, endiandata);
+			break;
+		case KECCAK:
+			keccak512_setBlock_80(thr_id, endiandata);
+			break;
+		case SKEIN:
+			skein512_cpu_setBlock_80((void*)endiandata);
+			break;
+		case LUFFA:
+			qubit_luffa512_cpu_setBlock_80((void*)endiandata);
+			break;
+		case CUBEHASH:
+			cubehash512_setBlock_80(thr_id, endiandata);
+			break;
+		case SHAVITE:
+			x11_shavite512_setBlock_80((void*)endiandata);
+			break;
+		case SIMD:
+			x16_simd512_setBlock_80((void*)endiandata);
+			break;
+		case ECHO:
+			x16_echo512_setBlock_80((void*)endiandata);
+			break;
+		case HAMSI:
+			x16_hamsi512_setBlock_80((void*)endiandata);
+			break;
+		case FUGUE:
+			x16_fugue512_setBlock_80((void*)pdata);
+			break;
+		case SHABAL:
+			x16_shabal512_setBlock_80((void*)endiandata);
+			break;
+		case WHIRLPOOL:
+			x16_whirlpool512_setBlock_80((void*)endiandata);
+			break;
+		case SHA512:
+			x16_sha512_setBlock_80(endiandata);
+			break;
+		default: {
+			return -1;
+		}
+	}
+
+	int warn = 0;
+
+	do {
+		int order = 0;
+
+		// Hash with CUDA
+
+		switch (algo80) {
+			case BLAKE:
+				quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+				TRACE("blake80:");
+				break;
+			case BMW:
+				quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+				TRACE("bmw80  :");
+				break;
+			case GROESTL:
+				groestl512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+				TRACE("grstl80:");
+				break;
+			case JH:
+				jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+				TRACE("jh51280:");
+				break;
+			case KECCAK:
+				keccak512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+				TRACE("kecck80:");
+				break;
+			case SKEIN:
+				skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++;
+				TRACE("skein80:");
+				break;
+			case LUFFA:
+				qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+				TRACE("luffa80:");
+				break;
+			case CUBEHASH:
+				cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+				TRACE("cube 80:");
+				break;
+			case SHAVITE:
+				x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+				TRACE("shavite:");
+				break;
+			case SIMD:
+				x16_simd512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+				TRACE("simd512:");
+				break;
+			case ECHO:
+				x16_echo512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+				TRACE("echo   :");
+				break;
+			case HAMSI:
+				x16_hamsi512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+				TRACE("hamsi  :");
+				break;
+			case FUGUE:
+				x16_fugue512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+				TRACE("fugue  :");
+				break;
+			case SHABAL:
+				x16_shabal512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+				TRACE("shabal :");
+				break;
+			case WHIRLPOOL:
+				x16_whirlpool512_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+				TRACE("whirl  :");
+				break;
+			case SHA512:
+				x16_sha512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+				TRACE("sha512 :");
+				break;
+		}
+
+		for (int i = 1; i < 16; i++)
+		{
+			const char elem = hashOrder[i];
+			const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+
+			switch (algo64) {
+			case BLAKE:
+				quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				TRACE("blake  :");
+				break;
+			case BMW:
+				quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				TRACE("bmw    :");
+				break;
+			case GROESTL:
+				quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				TRACE("groestl:");
+				break;
+			case JH:
+				quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				TRACE("jh512  :");
+				break;
+			case KECCAK:
+				quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				TRACE("keccak :");
+				break;
+			case SKEIN:
+				quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				TRACE("skein  :");
+				break;
+			case LUFFA:
+				x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				TRACE("luffa  :");
+				break;
+			case CUBEHASH:
+				x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				TRACE("cube   :");
+				break;
+			case SHAVITE:
+				x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				TRACE("shavite:");
+				break;
+			case SIMD:
+				x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				TRACE("simd   :");
+				break;
+			case ECHO:
+				if (use_compat_kernels[thr_id])
+					x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				else
+					x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++;
+				TRACE("echo   :");
+				break;
+			case HAMSI:
+				x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				TRACE("hamsi  :");
+				break;
+			case FUGUE:
+				x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				TRACE("fugue  :");
+				break;
+			case SHABAL:
+				x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				TRACE("shabal :");
+				break;
+			case WHIRLPOOL:
+				x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				TRACE("shabal :");
+				break;
+			case SHA512:
+				x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+				TRACE("sha512 :");
+				break;
+			}
+		}
+
+		*hashes_done = pdata[19] - first_nonce + throughput;
+
+		work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+#ifdef _DEBUG
+		uint32_t _ALIGN(64) dhash[8];
+		be32enc(&endiandata[19], pdata[19]);
+		x16r_hash(dhash, endiandata);
+		applog_hash(dhash);
+		return -1;
+#endif
+		if (work->nonces[0] != UINT32_MAX)
+		{
+			const uint32_t Htarg = ptarget[7];
+			uint32_t _ALIGN(64) vhash[8];
+			be32enc(&endiandata[19], work->nonces[0]);
+			x16r_hash(vhash, endiandata);
+
+			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				work_set_target_ratio(work, vhash);
+				if (work->nonces[1] != 0) {
+					be32enc(&endiandata[19], work->nonces[1]);
+					x16r_hash(vhash, endiandata);
+					bn_set_target_ratio(work, vhash, 1);
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
+				}
+#if 0
+				gpulog(LOG_INFO, thr_id, "hash found with %s 80!", algo_strings[algo80]);
+
+				algo80_tests[algo80] += work->valid_nonces;
+				char oks64[128] = { 0 };
+				char oks80[128] = { 0 };
+				char fails[128] = { 0 };
+				for (int a = 0; a < HASH_FUNC_COUNT; a++) {
+					const char elem = hashOrder[a];
+					const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+					if (a > 0) algo64_tests[algo64] += work->valid_nonces;
+					sprintf(&oks64[strlen(oks64)], "|%X:%2d", a, algo64_tests[a] < 100 ? algo64_tests[a] : 99);
+					sprintf(&oks80[strlen(oks80)], "|%X:%2d", a, algo80_tests[a] < 100 ? algo80_tests[a] : 99);
+					sprintf(&fails[strlen(fails)], "|%X:%2d", a, algo80_fails[a] < 100 ? algo80_fails[a] : 99);
+				}
+				applog(LOG_INFO, "K64: %s", oks64);
+				applog(LOG_INFO, "K80: %s", oks80);
+				applog(LOG_ERR,  "F80: %s", fails);
+#endif
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > Htarg) {
+				// x11+ coins could do some random error, but not on retry
+				gpu_increment_reject(thr_id);
+				algo80_fails[algo80]++;
+				if (!warn) {
+					warn++;
+					pdata[19] = work->nonces[0] + 1;
+					continue;
+				} else {
+					if (!opt_quiet)	gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU! %s %s",
+						work->nonces[0], algo_strings[algo80], hashOrder);
+					warn = 0;
+				}
+			}
+		}
+
+		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
+
+		pdata[19] += throughput;
+
+	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+
+	*hashes_done = pdata[19] - first_nonce;
+	return 0;
+}
+
+// cleanup
+extern "C" void free_x16r(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaThreadSynchronize();
+
+	cudaFree(d_hash[thr_id]);
+
+	quark_blake512_cpu_free(thr_id);
+	quark_groestl512_cpu_free(thr_id);
+	x11_simd512_cpu_free(thr_id);
+	x13_fugue512_cpu_free(thr_id);
+	x16_fugue512_cpu_free(thr_id); // to merge with x13_fugue512 ?
+	x15_whirlpool_cpu_free(thr_id);
+
+	cuda_check_cpu_free(thr_id);
+
+	cudaDeviceSynchronize();
+	init[thr_id] = false;
+}
--- a/x16/x16s.cu
+++ b/x16/x16s.cu
@ -0,0 +1,601 @@
				@@ -0,0 +1,601 @@
+/**
+ * X16S algorithm (X16 with Shuffled chain order)
+ *
+ * tpruvot 2018 - GPL code
+ */
+
+#include <stdio.h>
+#include <memory.h>
+#include <unistd.h>
+
+extern "C" {
+#include "sph/sph_blake.h"
+#include "sph/sph_bmw.h"
+#include "sph/sph_groestl.h"
+#include "sph/sph_skein.h"
+#include "sph/sph_jh.h"
+#include "sph/sph_keccak.h"
+
+#include "sph/sph_luffa.h"
+#include "sph/sph_cubehash.h"
+#include "sph/sph_shavite.h"
+#include "sph/sph_simd.h"
+#include "sph/sph_echo.h"
+
+#include "sph/sph_hamsi.h"
+#include "sph/sph_fugue.h"
+#include "sph/sph_shabal.h"
+#include "sph/sph_whirlpool.h"
+#include "sph/sph_sha2.h"
+}
+
+#include "miner.h"
+#include "cuda_helper.h"
+#include "cuda_x16.h"
+
+static uint32_t *d_hash[MAX_GPUS];
+
+enum Algo {
+	BLAKE = 0,
+	BMW,
+	GROESTL,
+	JH,
+	KECCAK,
+	SKEIN,
+	LUFFA,
+	CUBEHASH,
+	SHAVITE,
+	SIMD,
+	ECHO,
+	HAMSI,
+	FUGUE,
+	SHABAL,
+	WHIRLPOOL,
+	SHA512,
+	HASH_FUNC_COUNT
+};
+
+static const char* algo_strings[] = {
+	"blake",
+	"bmw512",
+	"groestl",
+	"jh512",
+	"keccak",
+	"skein",
+	"luffa",
+	"cube",
+	"shavite",
+	"simd",
+	"echo",
+	"hamsi",
+	"fugue",
+	"shabal",
+	"whirlpool",
+	"sha512",
+	NULL
+};
+
+static __thread uint32_t s_ntime = UINT32_MAX;
+static __thread char hashOrder[HASH_FUNC_COUNT + 1] = { 0 };
+
+static void getAlgoString(const uint32_t* prevblock, char *output)
+{
+	uint8_t* data = (uint8_t*)prevblock;
+
+	strcpy(output, "0123456789ABCDEF");
+
+	for (uint8_t i = 0; i < HASH_FUNC_COUNT; i++) {
+		uint8_t b = (15 - i) >> 1; // 16 ascii hex chars, reversed
+		uint8_t algoDigit = (i & 1) ? data[b] & 0xF : data[b] >> 4;
+		int offset = (int) algoDigit;
+		char oldVal = output[offset];
+		for(int j=offset; j-->0;)
+			output[j+1] = output[j];
+		output[0] = oldVal;
+	}
+}
+
+// X16S CPU Hash (Validation)
+extern "C" void x16s_hash(void *output, const void *input)
+{
+	unsigned char _ALIGN(64) hash[128];
+
+	sph_blake512_context ctx_blake;
+	sph_bmw512_context ctx_bmw;
+	sph_groestl512_context ctx_groestl;
+	sph_jh512_context ctx_jh;
+	sph_keccak512_context ctx_keccak;
+	sph_skein512_context ctx_skein;
+	sph_luffa512_context ctx_luffa;
+	sph_cubehash512_context ctx_cubehash;
+	sph_shavite512_context ctx_shavite;
+	sph_simd512_context ctx_simd;
+	sph_echo512_context ctx_echo;
+	sph_hamsi512_context ctx_hamsi;
+	sph_fugue512_context ctx_fugue;
+	sph_shabal512_context ctx_shabal;
+	sph_whirlpool_context ctx_whirlpool;
+	sph_sha512_context ctx_sha512;
+
+	void *in = (void*) input;
+	int size = 80;
+
+	uint32_t *in32 = (uint32_t*) input;
+	getAlgoString(&in32[1], hashOrder);
+
+	for (int i = 0; i < 16; i++)
+	{
+		const char elem = hashOrder[i];
+		const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+
+		switch (algo) {
+		case BLAKE:
+			sph_blake512_init(&ctx_blake);
+			sph_blake512(&ctx_blake, in, size);
+			sph_blake512_close(&ctx_blake, hash);
+			break;
+		case BMW:
+			sph_bmw512_init(&ctx_bmw);
+			sph_bmw512(&ctx_bmw, in, size);
+			sph_bmw512_close(&ctx_bmw, hash);
+			break;
+		case GROESTL:
+			sph_groestl512_init(&ctx_groestl);
+			sph_groestl512(&ctx_groestl, in, size);
+			sph_groestl512_close(&ctx_groestl, hash);
+			break;
+		case SKEIN:
+			sph_skein512_init(&ctx_skein);
+			sph_skein512(&ctx_skein, in, size);
+			sph_skein512_close(&ctx_skein, hash);
+			break;
+		case JH:
+			sph_jh512_init(&ctx_jh);
+			sph_jh512(&ctx_jh, in, size);
+			sph_jh512_close(&ctx_jh, hash);
+			break;
+		case KECCAK:
+			sph_keccak512_init(&ctx_keccak);
+			sph_keccak512(&ctx_keccak, in, size);
+			sph_keccak512_close(&ctx_keccak, hash);
+			break;
+		case LUFFA:
+			sph_luffa512_init(&ctx_luffa);
+			sph_luffa512(&ctx_luffa, in, size);
+			sph_luffa512_close(&ctx_luffa, hash);
+			break;
+		case CUBEHASH:
+			sph_cubehash512_init(&ctx_cubehash);
+			sph_cubehash512(&ctx_cubehash, in, size);
+			sph_cubehash512_close(&ctx_cubehash, hash);
+			break;
+		case SHAVITE:
+			sph_shavite512_init(&ctx_shavite);
+			sph_shavite512(&ctx_shavite, in, size);
+			sph_shavite512_close(&ctx_shavite, hash);
+			break;
+		case SIMD:
+			sph_simd512_init(&ctx_simd);
+			sph_simd512(&ctx_simd, in, size);
+			sph_simd512_close(&ctx_simd, hash);
+			break;
+		case ECHO:
+			sph_echo512_init(&ctx_echo);
+			sph_echo512(&ctx_echo, in, size);
+			sph_echo512_close(&ctx_echo, hash);
+			break;
+		case HAMSI:
+			sph_hamsi512_init(&ctx_hamsi);
+			sph_hamsi512(&ctx_hamsi, in, size);
+			sph_hamsi512_close(&ctx_hamsi, hash);
+			break;
+		case FUGUE:
+			sph_fugue512_init(&ctx_fugue);
+			sph_fugue512(&ctx_fugue, in, size);
+			sph_fugue512_close(&ctx_fugue, hash);
+			break;
+		case SHABAL:
+			sph_shabal512_init(&ctx_shabal);
+			sph_shabal512(&ctx_shabal, in, size);
+			sph_shabal512_close(&ctx_shabal, hash);
+			break;
+		case WHIRLPOOL:
+			sph_whirlpool_init(&ctx_whirlpool);
+			sph_whirlpool(&ctx_whirlpool, in, size);
+			sph_whirlpool_close(&ctx_whirlpool, hash);
+			break;
+		case SHA512:
+			sph_sha512_init(&ctx_sha512);
+			sph_sha512(&ctx_sha512,(const void*) in, size);
+			sph_sha512_close(&ctx_sha512,(void*) hash);
+			break;
+		}
+		in = (void*) hash;
+		size = 64;
+	}
+	memcpy(output, hash, 32);
+}
+
+#if 0 /* in x16r */
+void whirlpool_midstate(void *state, const void *input)
+{
+	sph_whirlpool_context ctx;
+
+	sph_whirlpool_init(&ctx);
+	sph_whirlpool(&ctx, input, 64);
+
+	memcpy(state, ctx.state, 64);
+}
+#endif
+
+static bool init[MAX_GPUS] = { 0 };
+static bool use_compat_kernels[MAX_GPUS] = { 0 };
+
+//#define _DEBUG
+#define _DEBUG_PREFIX "x16s-"
+#include "cuda_debug.cuh"
+
+extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
+{
+	uint32_t *pdata = work->data;
+	uint32_t *ptarget = work->target;
+	const uint32_t first_nonce = pdata[19];
+	const int dev_id = device_map[thr_id];
+	int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 19;
+	if (strstr(device_name[dev_id], "GTX 1080")) intensity = 20;
+	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
+	//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
+
+	if (!init[thr_id])
+	{
+		cudaSetDevice(device_map[thr_id]);
+		if (opt_cudaschedule == -1 && gpu_threads == 1) {
+			cudaDeviceReset();
+			// reduce cpu usage
+			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
+		}
+		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
+
+		cuda_get_arch(thr_id);
+		use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
+		if (use_compat_kernels[thr_id])
+			x11_echo512_cpu_init(thr_id, throughput);
+
+		quark_blake512_cpu_init(thr_id, throughput);
+		quark_bmw512_cpu_init(thr_id, throughput);
+		quark_groestl512_cpu_init(thr_id, throughput);
+		quark_skein512_cpu_init(thr_id, throughput);
+		quark_jh512_cpu_init(thr_id, throughput);
+		quark_keccak512_cpu_init(thr_id, throughput);
+		qubit_luffa512_cpu_init(thr_id, throughput);
+		x11_luffa512_cpu_init(thr_id, throughput); // 64
+		x11_shavite512_cpu_init(thr_id, throughput);
+		x11_simd512_cpu_init(thr_id, throughput); // 64
+		x16_echo512_cuda_init(thr_id, throughput);
+		x13_hamsi512_cpu_init(thr_id, throughput);
+		x13_fugue512_cpu_init(thr_id, throughput);
+		x16_fugue512_cpu_init(thr_id, throughput);
+		x14_shabal512_cpu_init(thr_id, throughput);
+		x15_whirlpool_cpu_init(thr_id, throughput, 0);
+		x16_whirlpool512_init(thr_id, throughput);
+		x17_sha512_cpu_init(thr_id, throughput);
+
+		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
+
+		cuda_check_cpu_init(thr_id, throughput);
+
+		init[thr_id] = true;
+	}
+
+	if (opt_benchmark) {
+		((uint32_t*)ptarget)[7] = 0x003f;
+		//((uint8_t*)pdata)[8] = 0x90; // hashOrder[0] = '9'; for simd 80 + blake512 64
+		//((uint8_t*)pdata)[8] = 0x0A; // hashOrder[0] = 'A'; for echo 64
+		//((uint8_t*)pdata)[8] = 0xB0; // hashOrder[0] = 'B'; for hamsi 80 + blake512 64
+		//((uint8_t*)pdata)[8] = 0xC0; // hashOrder[0] = 'C'; for fugue 80 + blake512 64
+		//((uint8_t*)pdata)[8] = 0xE0; // hashOrder[0] = 'E'; for whirlpool 80 + blake512 64
+	}
+	uint32_t _ALIGN(64) endiandata[20];
+
+	for (int k=0; k < 19; k++)
+		be32enc(&endiandata[k], pdata[k]);
+
+	uint32_t ntime = swab32(pdata[17]);
+	if (s_ntime != ntime) {
+		getAlgoString(&endiandata[1], hashOrder);
+		s_ntime = ntime;
+		if (opt_debug && !thr_id) applog(LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime);
+	}
+
+	cuda_check_cpu_setTarget(ptarget);
+
+	char elem = hashOrder[0];
+	const uint8_t algo80 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+
+	switch (algo80) {
+		case BLAKE:
+			quark_blake512_cpu_setBlock_80(thr_id, endiandata);
+			break;
+		case BMW:
+			quark_bmw512_cpu_setBlock_80(endiandata);
+			break;
+		case GROESTL:
+			groestl512_setBlock_80(thr_id, endiandata);
+			break;
+		case JH:
+			jh512_setBlock_80(thr_id, endiandata);
+			break;
+		case KECCAK:
+			keccak512_setBlock_80(thr_id, endiandata);
+			break;
+		case SKEIN:
+			skein512_cpu_setBlock_80((void*)endiandata);
+			break;
+		case LUFFA:
+			qubit_luffa512_cpu_setBlock_80((void*)endiandata);
+			break;
+		case CUBEHASH:
+			cubehash512_setBlock_80(thr_id, endiandata);
+			break;
+		case SHAVITE:
+			x11_shavite512_setBlock_80((void*)endiandata);
+			break;
+		case SIMD:
+			x16_simd512_setBlock_80((void*)endiandata);
+			break;
+		case ECHO:
+			x16_echo512_setBlock_80((void*)endiandata);
+			break;
+		case HAMSI:
+			x16_hamsi512_setBlock_80((void*)endiandata);
+			break;
+		case FUGUE:
+			x16_fugue512_setBlock_80((void*)pdata);
+			break;
+		case SHABAL:
+			x16_shabal512_setBlock_80((void*)endiandata);
+			break;
+		case WHIRLPOOL:
+			x16_whirlpool512_setBlock_80((void*)endiandata);
+			break;
+		case SHA512:
+			x16_sha512_setBlock_80(endiandata);
+			break;
+		default: {
+			return -1;
+		}
+	}
+
+	int warn = 0;
+
+	do {
+		int order = 0;
+
+		// Hash with CUDA
+
+		switch (algo80) {
+			case BLAKE:
+				quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+				TRACE("blake80:");
+				break;
+			case BMW:
+				quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+				TRACE("bmw80  :");
+				break;
+			case GROESTL:
+				groestl512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+				TRACE("grstl80:");
+				break;
+			case JH:
+				jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+				TRACE("jh51280:");
+				break;
+			case KECCAK:
+				keccak512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+				TRACE("kecck80:");
+				break;
+			case SKEIN:
+				skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++;
+				TRACE("skein80:");
+				break;
+			case LUFFA:
+				qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+				TRACE("luffa80:");
+				break;
+			case CUBEHASH:
+				cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+				TRACE("cube 80:");
+				break;
+			case SHAVITE:
+				x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
+				TRACE("shavite:");
+				break;
+			case SIMD:
+				x16_simd512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+				TRACE("simd512:");
+				break;
+			case ECHO:
+				x16_echo512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+				TRACE("echo   :");
+				break;
+			case HAMSI:
+				x16_hamsi512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+				TRACE("hamsi  :");
+				break;
+			case FUGUE:
+				x16_fugue512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+				TRACE("fugue  :");
+				break;
+			case SHABAL:
+				x16_shabal512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+				TRACE("shabal :");
+				break;
+			case WHIRLPOOL:
+				x16_whirlpool512_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+				TRACE("whirl  :");
+				break;
+			case SHA512:
+				x16_sha512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+				TRACE("sha512 :");
+				break;
+		}
+
+		for (int i = 1; i < 16; i++)
+		{
+			const char elem = hashOrder[i];
+			const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+
+			switch (algo64) {
+			case BLAKE:
+				quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				TRACE("blake  :");
+				break;
+			case BMW:
+				quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				TRACE("bmw    :");
+				break;
+			case GROESTL:
+				quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				TRACE("groestl:");
+				break;
+			case JH:
+				quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				TRACE("jh512  :");
+				break;
+			case KECCAK:
+				quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				TRACE("keccak :");
+				break;
+			case SKEIN:
+				quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				TRACE("skein  :");
+				break;
+			case LUFFA:
+				x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				TRACE("luffa  :");
+				break;
+			case CUBEHASH:
+				x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				TRACE("cube   :");
+				break;
+			case SHAVITE:
+				x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				TRACE("shavite:");
+				break;
+			case SIMD:
+				x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				TRACE("simd   :");
+				break;
+			case ECHO:
+				if (use_compat_kernels[thr_id])
+					x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				else
+					x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++;
+				TRACE("echo   :");
+				break;
+			case HAMSI:
+				x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				TRACE("hamsi  :");
+				break;
+			case FUGUE:
+				x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				TRACE("fugue  :");
+				break;
+			case SHABAL:
+				x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				TRACE("shabal :");
+				break;
+			case WHIRLPOOL:
+				x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
+				TRACE("shabal :");
+				break;
+			case SHA512:
+				x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
+				TRACE("sha512 :");
+				break;
+			}
+		}
+
+		*hashes_done = pdata[19] - first_nonce + throughput;
+
+		work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
+#ifdef _DEBUG
+		uint32_t _ALIGN(64) dhash[8];
+		be32enc(&endiandata[19], pdata[19]);
+		x16s_hash(dhash, endiandata);
+		applog_hash(dhash);
+		return -1;
+#endif
+		if (work->nonces[0] != UINT32_MAX)
+		{
+			const uint32_t Htarg = ptarget[7];
+			uint32_t _ALIGN(64) vhash[8];
+			be32enc(&endiandata[19], work->nonces[0]);
+			x16s_hash(vhash, endiandata);
+
+			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
+				work->valid_nonces = 1;
+				work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
+				work_set_target_ratio(work, vhash);
+				if (work->nonces[1] != 0) {
+					be32enc(&endiandata[19], work->nonces[1]);
+					x16s_hash(vhash, endiandata);
+					bn_set_target_ratio(work, vhash, 1);
+					work->valid_nonces++;
+					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
+				} else {
+					pdata[19] = work->nonces[0] + 1; // cursor
+				}
+				//gpulog(LOG_INFO, thr_id, "hash found with %s 80 (%s)!", algo_strings[algo80], hashOrder);
+				return work->valid_nonces;
+			}
+			else if (vhash[7] > Htarg) {
+				// x11+ coins could do some random error, but not on retry
+				gpu_increment_reject(thr_id);
+				if (!warn) {
+					warn++;
+					pdata[19] = work->nonces[0] + 1;
+					continue;
+				} else {
+					if (!opt_quiet)	gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU! %s %s",
+						work->nonces[0], algo_strings[algo80], hashOrder);
+					warn = 0;
+				}
+			}
+		}
+
+		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
+			pdata[19] = max_nonce;
+			break;
+		}
+
+		pdata[19] += throughput;
+
+	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
+
+	*hashes_done = pdata[19] - first_nonce;
+	return 0;
+}
+
+// cleanup
+extern "C" void free_x16s(int thr_id)
+{
+	if (!init[thr_id])
+		return;
+
+	cudaThreadSynchronize();
+
+	cudaFree(d_hash[thr_id]);
+
+	quark_blake512_cpu_free(thr_id);
+	quark_groestl512_cpu_free(thr_id);
+	x11_simd512_cpu_free(thr_id);
+	x13_fugue512_cpu_free(thr_id);
+	x16_fugue512_cpu_free(thr_id); // to merge with x13_fugue512 ?
+	x15_whirlpool_cpu_free(thr_id);
+
+	cuda_check_cpu_free(thr_id);
+
+	cudaDeviceSynchronize();
+	init[thr_id] = false;
+}
--- a/x17/cuda_x17_sha512.cu
+++ b/x17/cuda_x17_sha512.cu
@ -169,3 +169,80 @@ void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce,
				@@ -169,3 +169,80 @@ void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce,

 	x17_sha512_gpu_hash_64 <<<grid, block>>> (threads, (uint64_t*)d_hash);
 }
+
+__constant__
+static uint64_t c_PaddedMessage80[10];
+
+__global__
+/*__launch_bounds__(256, 4)*/
+void x16_sha512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash)
+{
+	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
+	if (thread < threads)
+	{
+		uint64_t W[80];
+		#pragma unroll
+		for (int i = 0; i < 9; i ++) {
+			W[i] = SWAP64(c_PaddedMessage80[i]);
+		}
+		const uint32_t nonce = startNonce + thread;
+		//((uint32_t*)W)[19] = cuda_swab32(nonce);
+		W[9] = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(nonce));
+		W[9] = cuda_swab64(W[9]);
+		W[10] = 0x8000000000000000;
+
+		#pragma unroll
+		for (int i = 11; i<15; i++) {
+			W[i] = 0U;
+		}
+		W[15] = 0x0000000000000280;
+
+		#pragma unroll 64
+		for (int i = 16; i < 80; i ++) {
+			W[i] = SSG5_1(W[i-2]) + W[i-7];
+			W[i] += SSG5_0(W[i-15]) + W[i-16];
+		}
+
+		const uint64_t IV512[8] = {
+			0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
+			0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
+			0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
+			0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
+		};
+
+		uint64_t r[8];
+		#pragma unroll
+		for (int i = 0; i < 8; i++) {
+			r[i] = IV512[i];
+		}
+
+		#pragma unroll
+		for (int i = 0; i < 80; i++) {
+			SHA3_STEP(c_WB, r, W, i&7, i);
+		}
+
+		const uint64_t hashPosition = thread;
+		uint64_t *pHash = &g_hash[hashPosition << 3];
+		#pragma unroll
+		for (int u = 0; u < 8; u ++) {
+			pHash[u] = SWAP64(r[u] + IV512[u]);
+		}
+	}
+}
+
+__host__
+void x16_sha512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash)
+{
+	const uint32_t threadsperblock = 256;
+
+	dim3 grid((threads + threadsperblock-1)/threadsperblock);
+	dim3 block(threadsperblock);
+
+	x16_sha512_gpu_hash_80 <<<grid, block >>> (threads, startNounce, (uint64_t*)d_hash);
+}
+
+__host__
+void x16_sha512_setBlock_80(void *pdata)
+{
+	cudaMemcpyToSymbol(c_PaddedMessage80, pdata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
+}