Merge updates from trunk

7 years ago · 6e7d12e88e
28 changed files with 5229 additions and 486 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -69,13 +69,16 @@ ccminer_SOURCES	= elist.h miner.h compat.h \
 			  lbry/lbry.cu lbry/cuda_sha256_lbry.cu lbry/cuda_sha512_lbry.cu lbry/cuda_lbry_merged.cu \
 			  qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \
 			  tribus/tribus.cu tribus/cuda_echo512_final.cu \
-			  x11/x11.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
+			  x11/x11.cu x12/x12.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
 			  x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \
 			  x11/cuda_x11_luffa512_Cubehash.cu x11/x11evo.cu x11/timetravel.cu x11/bitcore.cu \
 			  x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \
 			  x13/hsr.cu x13/cuda_hsr_sm3.cu x13/sm3.c \
 			  x15/x14.cu x15/x15.cu x15/cuda_x14_shabal512.cu x15/cuda_x15_whirlpool.cu \
 			  x15/whirlpool.cu x15/cuda_x15_whirlpool_sm3.cu \
 			  x16/x16r.cu x16/x16s.cu x16/cuda_x16_echo512.cu x16/cuda_x16_fugue512.cu \
 			  x16/cuda_x16_shabal512.cu x16/cuda_x16_simd512_80.cu \
 			  x16/cuda_x16_echo512_64.cu \
 			  x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \
 			  x11/phi.cu x11/cuda_streebog_maxwell.cu \
 			  x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu \
--- a/README.txt
+++ b/README.txt
@ -1,5 +1,5 @@
-ccminer 2.2.4 (Jan. 2018)     "lyra2v2 and keccak improvements"
+ccminer 2.2.5 (Apr 2018)             "x12, x16r and x16s algos"
 ---------------------------------------------------------------
 ***************************************************************
@ -120,8 +120,12 @@ its command line interface and options.
                          tribus      use to mine Denarius
                          x11evo      use to mine Revolver
                          x11         use to mine DarkCoin
-                          x14         use to mine X14Coin
+                          x12         use to mine GalaxyCash
                          x13         use to mine X13
                          x14         use to mine X14
                          x15         use to mine Halcyon
                          x16r        use to mine Raven
                          x16s        use to mine Pigeon and Eden
                          x17         use to mine X17
                          vanilla     use to mine Vanilla (Blake256)
                          veltor      use to mine VeltorCoin
@ -277,7 +281,13 @@ so we can more efficiently implement new algorithms using the latest hardware
 features.
 >>> RELEASE HISTORY <<<
-  Jan. 04th 2017  v2.2.4
+  Apr. 02nd 2018  v2.2.5
                  New x16r algo for Raven
                  New x16s algo for Pigeon and Eden
                  New x12 algo for GalaxyCash
                  Fix Equihash (SIMT) sync issues on the Volta generation
  Jan. 04th 2018  v2.2.4
                  Improve lyra2v2
                  Higher keccak default intensity
                  Drop SM 2.x support by default, for CUDA 9 and more recent
--- a/algos.h
+++ b/algos.h
@ -57,9 +57,12 @@ enum sha_algos {
 	ALGO_BITCORE,
 	ALGO_X11EVO,
 	ALGO_X11,
 	ALGO_X12,
 	ALGO_X13,
 	ALGO_X14,
 	ALGO_X15,
 	ALGO_X16R,
 	ALGO_X16S,
 	ALGO_X17,
 	ALGO_VANILLA,
 	ALGO_VELTOR,
@ -127,9 +130,12 @@ static const char *algo_names[] = {
 	"bitcore",
 	"x11evo",
 	"x11",
 	"x12",
 	"x13",
 	"x14",
 	"x15",
 	"x16r",
 	"x16s",
 	"x17",
 	"vanilla",
 	"veltor",
--- a/api.cpp
+++ b/api.cpp
@ -1252,7 +1252,7 @@ static void api()
 			char *wskey = NULL;
 			n = recv(c, &buf[0], SOCK_REC_BUFSZ, 0);
-			fail = SOCKETFAIL(n);
+			fail = SOCKETFAIL(n) || n < 0;
 			if (fail)
 				buf[0] = '\0';
 			else if (n > 0 && buf[n-1] == '\n') {
@ -1261,7 +1261,7 @@ static void api()
 				if (n > 0 && buf[n-1] == '\r')
 					buf[n-1] = '\0';
 			}
-			buf[n] = '\0';
+			else buf[n] = '\0';
 			//if (opt_debug && opt_protocol && n > 0)
 			//	applog(LOG_DEBUG, "API: recv command: (%d) '%s'+char(%x)", n, buf, buf[n-1]);
--- a/bench.cpp
+++ b/bench.cpp
@ -100,9 +100,12 @@ void algo_free_all(int thr_id)
 	free_wildkeccak(thr_id);
 	free_x11evo(thr_id);
 	free_x11(thr_id);
 	free_x12(thr_id);
 	free_x13(thr_id);
 	free_x14(thr_id);
 	free_x15(thr_id);
 	free_x16r(thr_id);
 	free_x16s(thr_id);
 	free_x17(thr_id);
 	free_zr5(thr_id);
 	free_scrypt(thr_id);
--- a/ccminer.cpp
+++ b/ccminer.cpp
@ -291,9 +291,12 @@ Options:\n\
 			whirlpool   Whirlpool algo\n\
 			x11evo      Permuted x11 (Revolver)\n\
 			x11         X11 (DarkCoin)\n\
 			x12         X12 (GalaxyCash)\n\
 			x13         X13 (MaruCoin)\n\
 			x14         X14\n\
 			x15         X15\n\
 			x16r        X16R (Raven)\n\
 			x16s        X16S\n\
 			x17         X17\n\
 			wildkeccak  Boolberry\n\
 			zr5         ZR5 (ZiftrCoin)\n\
@ -1714,6 +1717,8 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
 		case ALGO_LYRA2Z:
 		case ALGO_TIMETRAVEL:
 		case ALGO_BITCORE:
 		case ALGO_X16R:
 		case ALGO_X16S:
 			work_set_target(work, sctx->job.diff / (256.0 * opt_difficulty));
 			break;
 		case ALGO_KECCAK:
@ -2253,6 +2258,7 @@ static void *miner_thread(void *userdata)
 			case ALGO_BITCORE:
 			case ALGO_X11EVO:
 			case ALGO_X11:
 			case ALGO_X12:
 			case ALGO_X13:
 			case ALGO_WHIRLCOIN:
 			case ALGO_WHIRLPOOL:
@ -2503,6 +2509,9 @@ static void *miner_thread(void *userdata)
 		case ALGO_X11:
 			rc = scanhash_x11(thr_id, &work, max_nonce, &hashes_done);
 			break;
 		case ALGO_X12:
 			rc = scanhash_x12(thr_id, &work, max_nonce, &hashes_done);
 			break;
 		case ALGO_X13:
 			rc = scanhash_x13(thr_id, &work, max_nonce, &hashes_done);
 			break;
@ -2512,6 +2521,12 @@ static void *miner_thread(void *userdata)
 		case ALGO_X15:
 			rc = scanhash_x15(thr_id, &work, max_nonce, &hashes_done);
 			break;
 		case ALGO_X16R:
 			rc = scanhash_x16r(thr_id, &work, max_nonce, &hashes_done);
 			break;
 		case ALGO_X16S:
 			rc = scanhash_x16s(thr_id, &work, max_nonce, &hashes_done);
 			break;
 		case ALGO_X17:
 			rc = scanhash_x17(thr_id, &work, max_nonce, &hashes_done);
 			break;
@ -3159,6 +3174,7 @@ void parse_arg(int key, char *arg)
 		if (v < 1 || v > 65535) // sanity check
 			show_usage_and_exit(1);
 		opt_api_mcast_port = v;
 		break;
 	case 'B':
 		opt_background = true;
 		break;
--- a/ccminer.vcxproj
+++ b/ccminer.vcxproj
@ -269,6 +269,7 @@
    <ClCompile Include="neoscrypt\neoscrypt-cpu.c" />
    <ClInclude Include="neoscrypt\cuda_vectors.h" />
    <ClInclude Include="x11\cuda_x11_simd512_sm2.cuh" />
    <ClInclude Include="x16\cuda_x16.h" />
    <CudaCompile Include="Algo256\bmw.cu" />
    <CudaCompile Include="Algo256\cuda_bmw.cu">
      <MaxRegCount>76</MaxRegCount>
@ -573,6 +574,7 @@
    <CudaCompile Include="x11\veltor.cu" />
    <CudaCompile Include="x11\x11.cu" />
    <CudaCompile Include="x11\x11evo.cu" />
    <CudaCompile Include="x12\x12.cu" />
    <CudaCompile Include="x13\cuda_x13_hamsi512.cu">
      <MaxRegCount>72</MaxRegCount>
    </CudaCompile>
@ -587,8 +589,17 @@
    <CudaCompile Include="x17\hmq17.cu" />
    <CudaCompile Include="x15\x15.cu" />
    <CudaCompile Include="x15\whirlpool.cu" />
-    <CudaCompile Include="x17\x17.cu">
+    <CudaCompile Include="x15\cuda_x15_whirlpool_sm3.cu" />
    <CudaCompile Include="x16\x16r.cu" />
    <CudaCompile Include="x16\x16s.cu" />
    <CudaCompile Include="x16\cuda_x16_echo512.cu" />
    <CudaCompile Include="x16\cuda_x16_fugue512.cu" />
    <CudaCompile Include="x16\cuda_x16_shabal512.cu" />
    <CudaCompile Include="x16\cuda_x16_simd512_80.cu" />
    <CudaCompile Include="x16\cuda_x16_echo512_64.cu">
      <CodeGeneration>compute_50,sm_50;compute_52,sm_52</CodeGeneration>
    </CudaCompile>
    <CudaCompile Include="x17\x17.cu" />
    <CudaCompile Include="x17\cuda_x17_haval256.cu">
    </CudaCompile>
    <CudaCompile Include="x17\cuda_x17_sha512.cu">
--- a/ccminer.vcxproj.filters
+++ b/ccminer.vcxproj.filters
@ -58,6 +58,9 @@
    <Filter Include="Source Files\CUDA\x15">
      <UniqueIdentifier>{a2403c22-6777-46ab-a55a-3fcc7386c974}</UniqueIdentifier>
    </Filter>
    <Filter Include="Source Files\CUDA\x16">
      <UniqueIdentifier>{55dfae6a-66ba-43e2-8ceb-98ee70cbdf16}</UniqueIdentifier>
    </Filter>
    <Filter Include="Source Files\CUDA\x17">
      <UniqueIdentifier>{85dfae6a-66ca-4332-8cec-98ee70cbdf2f}</UniqueIdentifier>
    </Filter>
@ -112,9 +115,12 @@
    <Filter Include="Source Files\CUDA\tribus">
      <UniqueIdentifier>{1e548d79-c217-4203-989a-a592fe2b2de3}</UniqueIdentifier>
    </Filter>
    <!-- Solution Explorer folder for the X12 CUDA sources; UniqueIdentifier must be a well-formed (hex-only) GUID -->
    <Filter Include="Source Files\CUDA\x12">
      <UniqueIdentifier>{0de48d89-f012-4323-929a-b592fe2b2de3}</UniqueIdentifier>
    </Filter>
    <Filter Include="Source Files\CUDA\gost">
      <UniqueIdentifier>{6a99bc95-f402-465e-9e64-b042bd241bb7}</UniqueIdentifier>
-    </Filter>
+	</Filter>
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="compat\jansson\dump.c">
@ -599,6 +605,9 @@
    <ClInclude Include="equi\equihash.h">
      <Filter>Source Files\equi</Filter>
    </ClInclude>
    <ClInclude Include="x16\cuda_x16.h">
      <Filter>Header Files\CUDA</Filter>
    </ClInclude>
  </ItemGroup>
  <ItemGroup>
    <CudaCompile Include="cuda.cpp">
@ -808,6 +817,9 @@
    <CudaCompile Include="x11\s3.cu">
      <Filter>Source Files\CUDA\x11</Filter>
    </CudaCompile>
    <CudaCompile Include="x12\x12.cu">
      <Filter>Source Files\CUDA\x12</Filter>
    </CudaCompile>
    <CudaCompile Include="x11\timetravel.cu">
      <Filter>Source Files\CUDA\x11</Filter>
    </CudaCompile>
@ -970,6 +982,30 @@
    <CudaCompile Include="equi\cuda_equi.cu">
      <Filter>Source Files\equi</Filter>
    </CudaCompile>
    <CudaCompile Include="x15\cuda_x15_whirlpool_sm3.cu">
      <Filter>Source Files\CUDA\x15</Filter>
    </CudaCompile>
    <CudaCompile Include="x16\cuda_x16_echo512.cu">
      <Filter>Source Files\CUDA\x16</Filter>
    </CudaCompile>
    <CudaCompile Include="x16\cuda_x16_echo512_64.cu">
      <Filter>Source Files\CUDA\x16</Filter>
    </CudaCompile>
    <CudaCompile Include="x16\cuda_x16_fugue512.cu">
      <Filter>Source Files\CUDA\x16</Filter>
    </CudaCompile>
    <CudaCompile Include="x16\cuda_x16_shabal512.cu">
      <Filter>Source Files\CUDA\x16</Filter>
    </CudaCompile>
    <CudaCompile Include="x16\cuda_x16_simd512_80.cu">
      <Filter>Source Files\CUDA\x16</Filter>
    </CudaCompile>
    <CudaCompile Include="x16\x16r.cu">
      <Filter>Source Files\CUDA\x16</Filter>
    </CudaCompile>
    <CudaCompile Include="x16\x16s.cu">
      <Filter>Source Files\CUDA\x16</Filter>
    </CudaCompile>
    <CudaCompile Include="gost\cuda_gosthash.cu">
      <Filter>Source Files\CUDA\gost</Filter>
    </CudaCompile>
--- a/compat/ccminer-config.h
+++ b/compat/ccminer-config.h
@ -164,7 +164,7 @@
 #define PACKAGE_URL "http://github.com/tpruvot/ccminer"
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "2.2.4"
+#define PACKAGE_VERSION "2.2.5"
 /* If using the C implementation of alloca, define if you know the
   direction of stack growth for your system; otherwise it will be
--- a/equi/cuda_equi.cu
+++ b/equi/cuda_equi.cu
@ -80,6 +80,8 @@ u32 umin(const u32, const u32);
 u32 umax(const u32, const u32);
 #endif
 #define OPT_SYNC_ALL
 #if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
 #define __shfl2(var, srcLane)  __shfl_sync(0xFFFFFFFFu, var, srcLane)
 #undef __any
@ -514,10 +516,11 @@ __global__ void digit_1(equi<RB, SM>* eq)
 	u32 si[2];
 #ifdef OPT_SYNC_ALL
 	// enable this to make fully safe shared mem operations;
 	// disabled gains some speed, but can rarely cause a crash
-	//__syncthreads();
+	__syncthreads();
-
+#endif
 	#pragma unroll
 	for (u32 i = 0; i != 2; ++i)
 	{
@ -654,11 +657,9 @@ __global__ void digit_2(equi<RB, SM>* eq)
 	uint4 tt[2];
 	u32 si[2];
-
+#ifdef OPT_SYNC_ALL
-	// enable this to make fully safe shared mem operations;
+	__syncthreads();
-	// disabled gains some speed, but can rarely cause a crash
+#endif
 	//__syncthreads();
 	#pragma unroll 2
 	for (u32 i = 0; i < 2; i++)
 	{
@ -785,9 +786,9 @@ __global__ void digit_3(equi<RB, SM>* eq)
 	uint4 tt[2];
 	u32 ta[2];
-	// enable this to make fully safe shared mem operations;
+#ifdef OPT_SYNC_ALL
-	// disabled gains some speed, but can rarely cause a crash
+	__syncthreads();
-	//__syncthreads();
+#endif
 	#pragma unroll 2
 	for (u32 i = 0; i < 2; i++)
@ -919,11 +920,9 @@ __global__ void digit_4(equi<RB, SM>* eq)
 	u32 si[2];
 	uint4 tt[2];
-
+#ifdef OPT_SYNC_ALL
-	// enable this to make fully safe shared mem operations;
+	__syncthreads();
-	// disabled gains some speed, but can rarely cause a crash
+#endif
 	//__syncthreads();
 	#pragma unroll 2
 	for (u32 i = 0; i < 2; i++)
 	{
@ -1035,11 +1034,9 @@ __global__ void digit_5(equi<RB, SM>* eq)
 	u32 si[2];
 	uint4 tt[2];
-
+#ifdef OPT_SYNC_ALL
-	// enable this to make fully safe shared mem operations;
+	__syncthreads();
-	// disabled gains some speed, but can rarely cause a crash
+#endif
 	//__syncthreads();
 	#pragma unroll 2
 	for (u32 i = 0; i < 2; i++)
 	{
--- a/miner.h
+++ b/miner.h
@ -324,9 +324,12 @@ extern int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, uns
 extern int scanhash_wildkeccak(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_x11evo(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_x11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_x12(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_x13(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_x14(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_x15(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
 extern int scanhash_zr5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
@ -389,9 +392,12 @@ extern void free_whirl(int thr_id);
 extern void free_wildkeccak(int thr_id);
 extern void free_x11evo(int thr_id);
 extern void free_x11(int thr_id);
 extern void free_x12(int thr_id);
 extern void free_x13(int thr_id);
 extern void free_x14(int thr_id);
 extern void free_x15(int thr_id);
 extern void free_x16r(int thr_id);
 extern void free_x16s(int thr_id);
 extern void free_x17(int thr_id);
 extern void free_zr5(int thr_id);
 //extern void free_sha256d(int thr_id);
@ -934,9 +940,12 @@ void wcoinhash(void *state, const void *input);
 void whirlxHash(void *state, const void *input);
 void x11evo_hash(void *output, const void *input);
 void x11hash(void *output, const void *input);
 void x12hash(void *output, const void *input);
 void x13hash(void *output, const void *input);
 void x14hash(void *output, const void *input);
 void x15hash(void *output, const void *input);
 void x16r_hash(void *output, const void *input);
 void x16s_hash(void *output, const void *input);
 void x17hash(void *output, const void *input);
 void wildkeccak_hash(void *output, const void *input, uint64_t* scratchpad, uint64_t ssize);
 void zr5hash(void *output, const void *input);
--- a/neoscrypt/cuda_neoscrypt.cu
+++ b/neoscrypt/cuda_neoscrypt.cu
@ -1319,7 +1319,6 @@ static void Blake2Shost(uint32_t * inout, const uint32_t * inkey)
 }
 #define SHIFT 128U
 #define TPB 32
 #define TPB2 64
@ -1346,7 +1345,7 @@ __launch_bounds__(TPB, 1)
 void neoscrypt_gpu_hash_chacha1()
 {
 	const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y);
-	const uint32_t shift = SHIFT * 8U * (thread & 8191);
+	const uint32_t threads = (gridDim.x * blockDim.y);
 	const uint32_t shiftTr = 8U * thread;
 	uint4 X[4];
@ -1361,7 +1360,7 @@ void neoscrypt_gpu_hash_chacha1()
 	#pragma nounroll
 	for (int i = 0; i < 128; i++)
 	{
-		uint32_t offset = shift + i * 8U;
+		uint32_t offset = 8U * (thread + threads * i);
 		for (int j = 0; j < 4; j++)
 			((uint4*)(W + offset))[j * 4 + threadIdx.x] = X[j];
 		neoscrypt_chacha(X);
@ -1370,7 +1369,7 @@ void neoscrypt_gpu_hash_chacha1()
 	#pragma nounroll
 	for (int t = 0; t < 128; t++)
 	{
-		uint32_t offset = shift + (WarpShuffle(X[3].x, 0, 4) & 0x7F) * 8U;
+		uint32_t offset = 8U * (thread + threads * (WarpShuffle(X[3].x, 0, 4) & 0x7F));
 		for (int j = 0; j < 4; j++)
 			X[j] ^= ((uint4*)(W + offset))[j * 4 + threadIdx.x];
 		neoscrypt_chacha(X);
@ -1391,7 +1390,7 @@ __launch_bounds__(TPB, 1)
 void neoscrypt_gpu_hash_salsa1()
 {
 	const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y);
-	const uint32_t shift = SHIFT * 8U * (thread & 8191);
+	const uint32_t threads = (gridDim.x * blockDim.y);
 	const uint32_t shiftTr = 8U * thread;
 	uint4 Z[4];
@ -1406,7 +1405,7 @@ void neoscrypt_gpu_hash_salsa1()
 	#pragma nounroll
 	for (int i = 0; i < 128; i++)
 	{
-		uint32_t offset = shift + i * 8U;
+		uint32_t offset = 8U * (thread + threads * i);
 		for (int j = 0; j < 4; j++)
 			((uint4*)(W + offset))[j * 4 + threadIdx.x] = Z[j];
 		neoscrypt_salsa(Z);
@ -1415,7 +1414,7 @@ void neoscrypt_gpu_hash_salsa1()
 	#pragma nounroll
 	for (int t = 0; t < 128; t++)
 	{
-		uint32_t offset = shift + (WarpShuffle(Z[3].x, 0, 4) & 0x7F) * 8U;
+		uint32_t offset = 8U * (thread + threads * (WarpShuffle(Z[3].x, 0, 4) & 0x7F));
 		for (int j = 0; j < 4; j++)
 			Z[j] ^= ((uint4*)(W + offset))[j * 4 + threadIdx.x];
 		neoscrypt_salsa(Z);
@ -1474,7 +1473,7 @@ void neoscrypt_init(int thr_id, uint32_t threads)
 	cuda_get_arch(thr_id);
 	CUDA_SAFE_CALL(cudaMalloc(&d_NNonce[thr_id], 2 * sizeof(uint32_t)));
-	CUDA_SAFE_CALL(cudaMalloc(&hash1, 32 * 128 * sizeof(uint64_t) * min(8192, threads)));
+	CUDA_SAFE_CALL(cudaMalloc(&hash1, 32 * 128 * sizeof(uint64_t) * threads));
 	CUDA_SAFE_CALL(cudaMalloc(&Trans1, 32 * sizeof(uint64_t) * threads));
 	CUDA_SAFE_CALL(cudaMalloc(&Trans2, 32 * sizeof(uint64_t) * threads));
 	CUDA_SAFE_CALL(cudaMalloc(&Trans3, 32 * sizeof(uint64_t) * threads));
--- a/neoscrypt/neoscrypt.cpp
+++ b/neoscrypt/neoscrypt.cpp
@ -22,6 +22,7 @@ int scanhash_neoscrypt(int thr_id, struct work* work, uint32_t max_nonce, unsign
 	int dev_id = device_map[thr_id];
 	int intensity = is_windows() ? 18 : 19;
 	if (strstr(device_name[dev_id], "GTX 10")) intensity = 21; // >= 20 need more than 2GB
 	if (strstr(device_name[dev_id], "TITAN")) intensity = 21;
 	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
 	throughput = throughput / 32; /* set for max intensity ~= 20 */
--- a/res/ccminer.rc
+++ b/res/ccminer.rc
@ -13,7 +13,7 @@
 #undef APSTUDIO_READONLY_SYMBOLS
 /////////////////////////////////////////////////////////////////////////////
-// Anglais (États-Unis) resources
+// English (United States) resources
 #if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_ENU)
 LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US
@ -60,8 +60,8 @@ IDI_ICON1               ICON                    "ccminer.ico"
 //
 VS_VERSION_INFO VERSIONINFO
- FILEVERSION 2,2,4,0
+ FILEVERSION 2,2,5,0
- PRODUCTVERSION 2,2,4,0
+ PRODUCTVERSION 2,2,5,0
 FILEFLAGSMASK 0x3fL
 #ifdef _DEBUG
 FILEFLAGS 0x21L
@ -76,10 +76,10 @@ BEGIN
    BEGIN
        BLOCK "040904e4"
        BEGIN
-            VALUE "FileVersion", "2.2.4"
+            VALUE "FileVersion", "2.2.5"
            VALUE "LegalCopyright", "Copyright (C) 2018"
            VALUE "ProductName", "ccminer"
-            VALUE "ProductVersion", "2.2.4"
+            VALUE "ProductVersion", "2.2.5"
        END
    END
    BLOCK "VarFileInfo"
@ -88,7 +88,7 @@ BEGIN
    END
 END
-#endif    // Anglais (États-Unis) resources
+#endif    // English (United States) resources
 /////////////////////////////////////////////////////////////////////////////
--- a/scrypt/salsa_kernel.cu
+++ b/scrypt/salsa_kernel.cu
@ -240,8 +240,9 @@ inline int _ConvertSMVer2Cores(int major, int minor)
 		{ 0x21, 48  }, // Fermi Generation (SM 2.1) GF10x class
 		{ 0x30, 192 }, // Kepler Generation (SM 3.0) GK10x class - GK104 = 1536 cores / 8 SMs
 		{ 0x35, 192 }, // Kepler Generation (SM 3.5) GK11x class
-		{ 0x50, 128 }, // Maxwell Generation (SM 5.0) GTX750/750Ti
+		{ 0x50, 128 }, // Maxwell First Generation (SM 5.0) GTX750/750Ti
 		{ 0x52, 128 }, // Maxwell Second Generation (SM 5.2) GTX980 = 2048 cores / 16 SMs - GTX970 1664 cores / 13 SMs
 		{ 0x61, 128 }, // Pascal GeForce (SM 6.1)
 		{ -1, -1 },
 	};
--- a/util.cpp
+++ b/util.cpp
@ -1354,7 +1354,11 @@ bool stratum_authorize(struct stratum_ctx *sctx, const char *user, const char *p
 	if (!res_val || json_is_false(res_val) ||
 	    (err_val && !json_is_null(err_val)))  {
-		applog(LOG_ERR, "Stratum authentication failed");
+		if (err_val && json_is_array(err_val)) {
 			const char* reason = json_string_value(json_array_get(err_val, 1));
 			applog(LOG_ERR, "Stratum authentication failed (%s)", reason);
 		}
 		else applog(LOG_ERR, "Stratum authentication failed");
 		goto out;
 	}
@ -2313,19 +2317,28 @@ void print_hash_tests(void)
 	printpfx("x11evo", hash);
 	x11hash(&hash[0], &buf[0]);
-	printpfx("X11", hash);
+	printpfx("x11", hash);
 	x12hash(&hash[0], &buf[0]);
 	printpfx("x12", hash);
 	x13hash(&hash[0], &buf[0]);
-	printpfx("X13", hash);
+	printpfx("x13", hash);
 	x14hash(&hash[0], &buf[0]);
-	printpfx("X14", hash);
+	printpfx("x14", hash);
 	x15hash(&hash[0], &buf[0]);
-	printpfx("X15", hash);
+	printpfx("x15", hash);
 	x16r_hash(&hash[0], &buf[0]);
 	printpfx("x16r", hash);
 	x16s_hash(&hash[0], &buf[0]);
 	printpfx("x16s", hash);
 	x17hash(&hash[0], &buf[0]);
-	printpfx("X17", hash);
+	printpfx("x17", hash);
 	//memcpy(buf, zrtest, 80);
 	zr5hash(&hash[0], &buf[0]);
--- a/x12/x12.cu
+++ b/x12/x12.cu
@ -0,0 +1,236 @@
 /*
 * X12 algorithm
 */
 extern "C" {
 #include "sph/sph_blake.h"
 #include "sph/sph_bmw.h"
 #include "sph/sph_luffa.h"
 #include "sph/sph_cubehash.h"
 #include "sph/sph_shavite.h"
 #include "sph/sph_simd.h"
 #include "sph/sph_echo.h"
 #include "sph/sph_groestl.h"
 #include "sph/sph_skein.h"
 #include "sph/sph_jh.h"
 #include "sph/sph_keccak.h"
 #include "sph/sph_hamsi.h"
 }
 #include "miner.h"
 #include "cuda_helper.h"
 #include "x11/cuda_x11.h"
 static uint32_t *d_hash[MAX_GPUS];
 extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads);
 extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 // X12 CPU Hash
 /*
  * CPU reference implementation of the X12 chain.
  *
  * The first stage hashes 80 bytes of `input`; every later stage hashes the
  * first 64 bytes of the scratch buffer and writes its digest back into it.
  * The first 32 bytes of the final digest are copied to `output`.
  *
  * Stage order: blake, bmw, luffa, cubehash, shavite, simd, echo, groestl,
  * skein, jh, keccak, hamsi.
  */
 extern "C" void x12hash(void *output, const void *input)
 {
 	// scratch buffer shared by all stages, zeroed before use
 	uint32_t state[32];
 	memset(state, 0, sizeof state);
 	{	// stage 1: blake512 over the 80-byte input
 		sph_blake512_context blake;
 		sph_blake512_init(&blake);
 		sph_blake512(&blake, input, 80);
 		sph_blake512_close(&blake, (void*) state);
 	}
 	{	// stage 2: bmw512
 		sph_bmw512_context bmw;
 		sph_bmw512_init(&bmw);
 		sph_bmw512(&bmw, (const void*) state, 64);
 		sph_bmw512_close(&bmw, (void*) state);
 	}
 	{	// stage 3: luffa512
 		sph_luffa512_context luffa;
 		sph_luffa512_init(&luffa);
 		sph_luffa512(&luffa, (const void*) state, 64);
 		sph_luffa512_close(&luffa, (void*) state);
 	}
 	{	// stage 4: cubehash512
 		sph_cubehash512_context cubehash;
 		sph_cubehash512_init(&cubehash);
 		sph_cubehash512(&cubehash, (const void*) state, 64);
 		sph_cubehash512_close(&cubehash, (void*) state);
 	}
 	{	// stage 5: shavite512
 		sph_shavite512_context shavite;
 		sph_shavite512_init(&shavite);
 		sph_shavite512(&shavite, (const void*) state, 64);
 		sph_shavite512_close(&shavite, (void*) state);
 	}
 	{	// stage 6: simd512
 		sph_simd512_context simd;
 		sph_simd512_init(&simd);
 		sph_simd512(&simd, (const void*) state, 64);
 		sph_simd512_close(&simd, (void*) state);
 	}
 	{	// stage 7: echo512
 		sph_echo512_context echo;
 		sph_echo512_init(&echo);
 		sph_echo512(&echo, (const void*) state, 64);
 		sph_echo512_close(&echo, (void*) state);
 	}
 	{	// stage 8: groestl512
 		sph_groestl512_context groestl;
 		sph_groestl512_init(&groestl);
 		sph_groestl512(&groestl, (const void*) state, 64);
 		sph_groestl512_close(&groestl, (void*) state);
 	}
 	{	// stage 9: skein512
 		sph_skein512_context skein;
 		sph_skein512_init(&skein);
 		sph_skein512(&skein, (const void*) state, 64);
 		sph_skein512_close(&skein, (void*) state);
 	}
 	{	// stage 10: jh512
 		sph_jh512_context jh;
 		sph_jh512_init(&jh);
 		sph_jh512(&jh, (const void*) state, 64);
 		sph_jh512_close(&jh, (void*) state);
 	}
 	{	// stage 11: keccak512
 		sph_keccak512_context keccak;
 		sph_keccak512_init(&keccak);
 		sph_keccak512(&keccak, (const void*) state, 64);
 		sph_keccak512_close(&keccak, (void*) state);
 	}
 	{	// stage 12: hamsi512
 		sph_hamsi512_context hamsi;
 		sph_hamsi512_init(&hamsi);
 		sph_hamsi512(&hamsi, (const void*) state, 64);
 		sph_hamsi512_close(&hamsi, (void*) state);
 	}
 	// only the first 32 bytes of the last digest are returned
 	memcpy(output, state, 32);
 }
 static bool init[MAX_GPUS] = { 0 };
 /*
  * Scan a range of nonces for the X12 algorithm on the GPU mapped to thr_id.
  *
  * thr_id       miner thread index; indexes init[], d_hash[], device_map[] and work_restart[]
  * work         job to scan: data[19] is read as the first nonce and advanced as the
  *              scan cursor; nonces[] and valid_nonces receive the results
  * max_nonce    scanning stops once cursor + throughput would reach this value
  * hashes_done  receives the number of nonces covered relative to the first nonce
  *
  * Returns work->valid_nonces (1 or 2) when a GPU candidate is confirmed by the CPU
  * x12hash(); returns 0 when the range is exhausted, a restart is requested, or
  * x11_simd512_cpu_init() fails.
  * NOTE(review): CUDA_CALL_OR_RET_X(..., 0) presumably also returns 0 when the
  * allocation fails - confirm against the macro definition.
  */
 extern "C" int scanhash_x12(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 {
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
 	// default intensity: 20 when device_sm is above 500 and not on Windows, else 19
 	int intensity = (device_sm[device_map[thr_id]] > 500 && !is_windows()) ? 20 : 19;
 	uint32_t throughput =  cuda_default_throughput(thr_id, 1 << intensity); // 19=256*256*8;
 	//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
 	// benchmark mode overrides the high word of the target
 	// (presumably so that candidates are found regularly - TODO confirm)
 	if (opt_benchmark)
 		((uint32_t*)ptarget)[7] = 0x000f;
 	// one-time setup per thread: device selection, per-algorithm init, hash buffer
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
 		if (opt_cudaschedule == -1 && gpu_threads == 1) {
 			cudaDeviceReset();
 			// reduce cpu usage
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 			CUDA_LOG_ERROR();
 		}
 		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 		quark_blake512_cpu_init(thr_id, throughput);
 		x11_luffaCubehash512_cpu_init(thr_id, throughput);
 		x11_shavite512_cpu_init(thr_id, throughput);
 		// the SIMD init is the only one whose result is checked; abort the scan on failure
 		if (x11_simd512_cpu_init(thr_id, throughput) != 0) {
 			return 0;
 		}
 		x11_echo512_cpu_init(thr_id, throughput);
 		quark_groestl512_cpu_init(thr_id, throughput);
 		quark_skein512_cpu_init(thr_id, throughput);
 		quark_bmw512_cpu_init(thr_id, throughput);
 		quark_keccak512_cpu_init(thr_id, throughput);
 		quark_jh512_cpu_init(thr_id, throughput);
 		x13_hamsi512_cpu_init(thr_id, throughput);
 		// 16 x uint32_t (64 bytes) of hash state per GPU thread
 		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], 16 * sizeof(uint32_t) * throughput), 0);
 		cuda_check_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
 	}
 	// big-endian copy of the 20 header words (80 bytes), used by the GPU blake
 	// stage and by the CPU verification below
 	uint32_t endiandata[20];
 	for (int k=0; k < 20; k++)
 		be32enc(&endiandata[k], pdata[k]);
 	quark_blake512_cpu_setBlock_80(thr_id, endiandata);
 	cuda_check_cpu_setTarget(ptarget);
 	do {
 		// `order` is handed to each launcher and incremented per stage;
 		// its meaning is defined by the launchers, not visible here
 		int order = 0;
 		// GPU chain, same stage order as x12hash(); luffa and cubehash are
 		// issued through a single combined call
 		quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 		quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
 		x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 		// count the batch just launched
 		*hashes_done = pdata[19] - first_nonce + throughput;
 		CUDA_LOG_ERROR();
 		// UINT32_MAX is treated as "no candidate in this batch"
 		work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 		if (work->nonces[0] != UINT32_MAX)
 		{
 			const uint32_t Htarg = ptarget[7];
 			uint32_t _ALIGN(64) vhash[8];
 			// recompute the candidate on the CPU before reporting it
 			be32enc(&endiandata[19], work->nonces[0]);
 			x12hash(vhash, endiandata);
 			if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
 				work->valid_nonces = 1;
 				// look for a second candidate in the same batch; 0 is treated as "none"
 				work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
 				work_set_target_ratio(work, vhash);
 				if (work->nonces[1] != 0) {
 					be32enc(&endiandata[19], work->nonces[1]);
 					x12hash(vhash, endiandata);
 					bn_set_target_ratio(work, vhash, 1);
 					work->valid_nonces++;
 					// resume after the larger of the two reported nonces
 					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
 				} else {
 					pdata[19] = work->nonces[0] + 1; // cursor
 				}
 				return work->valid_nonces;
 			}
 			else if (vhash[7] > Htarg) {
 				// GPU and CPU disagree: count a reject, skip past this nonce and
 				// re-enter the loop (the restart flag is still checked by `continue`)
 				gpu_increment_reject(thr_id);
 				if (!opt_quiet)
 					gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
 				pdata[19] = work->nonces[0] + 1;
 				continue;
 			}
 		}
 		// 64-bit sum so the end-of-range test cannot wrap around in 32 bits
 		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
 			pdata[19] = max_nonce;
 			break;
 		}
 		pdata[19] += throughput;
 	} while (!work_restart[thr_id].restart);
 	// nothing found: report how far the cursor moved
 	*hashes_done = pdata[19] - first_nonce;
 	CUDA_LOG_ERROR();
 	return 0;
 }
 // cleanup
 /*
  * Release the GPU resources held for the X12 scanner of thread thr_id.
  *
  * Does nothing when init[thr_id] is not set. Otherwise waits for the device,
  * frees the hash buffer d_hash[thr_id], calls the blake, groestl, simd and
  * check-hash free routines, then clears init[thr_id] so that the next
  * scanhash_x12() call re-initializes.
  */
 extern "C" void free_x12(int thr_id)
 {
 	if (!init[thr_id])
 		return;
 	// wait for pending GPU work before freeing; cudaThreadSynchronize() is
 	// deprecated, cudaDeviceSynchronize() is its documented replacement
 	cudaDeviceSynchronize();
 	cudaFree(d_hash[thr_id]);
 	quark_blake512_cpu_free(thr_id);
 	quark_groestl512_cpu_free(thr_id);
 	x11_simd512_cpu_free(thr_id);
 	cuda_check_cpu_free(thr_id);
 	CUDA_LOG_ERROR();
 	cudaDeviceSynchronize();
 	init[thr_id] = false;
 }
--- a/x13/cuda_x13_hamsi512.cu
+++ b/x13/cuda_x13_hamsi512.cu
@ -1,6 +1,6 @@
 /*
- * Quick Hamsi-512 for X13
+ * Quick Hamsi-512 for X13 by tsiv - 2014
- * by tsiv - 2014
+ * + Hamsi-512 80 by tpruvot - 2018
 */
 #include <stdio.h>
@ -16,31 +16,17 @@ static __constant__ uint32_t d_alpha_f[32];
 static __constant__ uint32_t d_T512[64][16];
 static const uint32_t alpha_n[] = {
-	SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc),
+	0xff00f0f0, 0xccccaaaa, 0xf0f0cccc, 0xff00aaaa, 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc, 0xf0f0ff00,
-	SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00),
+	0xf0f0cccc, 0xaaaaff00, 0xccccff00, 0xaaaaf0f0, 0xaaaaf0f0, 0xff00cccc, 0xccccf0f0, 0xff00aaaa,
-	SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc),
+	0xccccaaaa, 0xff00f0f0, 0xff00aaaa, 0xf0f0cccc, 0xf0f0ff00, 0xccccaaaa, 0xf0f0ff00, 0xaaaacccc,
-	SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0),
+	0xaaaaff00, 0xf0f0cccc, 0xaaaaf0f0, 0xccccff00, 0xff00cccc, 0xaaaaf0f0, 0xff00aaaa, 0xccccf0f0
 	SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0),
 	SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0),
 	SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00),
 	SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc),
 	SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0),
 	SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0),
 	SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0)
 };
 static const uint32_t alpha_f[] = {
-	SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0),
+	0xcaf9639c, 0x0ff0f9c0, 0x639c0ff0, 0xcaf9f9c0, 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0, 0x639ccaf9,
-	SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9),
+	0x639c0ff0, 0xf9c0caf9, 0x0ff0caf9, 0xf9c0639c, 0xf9c0639c, 0xcaf90ff0, 0x0ff0639c, 0xcaf9f9c0,
-	SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0),
+	0x0ff0f9c0, 0xcaf9639c, 0xcaf9f9c0, 0x639c0ff0, 0x639ccaf9, 0x0ff0f9c0, 0x639ccaf9, 0xf9c00ff0,
-	SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c),
+	0xf9c0caf9, 0x639c0ff0, 0xf9c0639c, 0x0ff0caf9, 0xcaf90ff0, 0xf9c0639c, 0xcaf9f9c0, 0x0ff0639c
 	SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c),
 	SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c),
 	SPH_C32(0xcaf9f9c0), SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9),
 	SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0),
 	SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c),
 	SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c),
 	SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c)
 };
 #define hamsi_s00   m0
@ -200,390 +186,134 @@ static const uint32_t alpha_f[] = {
 static const uint32_t T512[64][16] = {
-	{ SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000),
+{ 0xef0b0270, 0x3afd0000, 0x5dae0000, 0x69490000, 0x9b0f3c06, 0x4405b5f9, 0x66140a51, 0x924f5d0a, // 0
-	  SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9),
+  0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000, 0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68 },
-	  SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030),
+{ 0xc96b0030, 0xe7250000, 0x2f840000, 0x264f0000, 0x08695bf9, 0x6dfcf137, 0x509f6984, 0x9e69af68,
-	  SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000),
+  0x26600240, 0xddd80000, 0x722a0000, 0x4f060000, 0x936667ff, 0x29f944ce, 0x368b63d5, 0x0c26f262 },
-	  SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984),
+{ 0x145a3c00, 0xb9e90000, 0x61270000, 0xf1610000, 0xce613d6c, 0xb0493d78, 0x47a96720, 0xe18e24c5,
-	  SPH_C32(0x9e69af68) },
+  0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000, 0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f },
-	{ SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000),
+{ 0x23671400, 0xc8b90000, 0xf4c70000, 0xfb750000, 0x73cd2465, 0xf8a6a549, 0x02c40a3f, 0xdc24e61f,
-	  SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137),
+  0x373d2800, 0x71500000, 0x95e00000, 0x0a140000, 0xbdac1909, 0x48ef9831, 0x456d6d1f, 0x3daac2da },
-	  SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240),
+{ 0x54285c00, 0xeaed0000, 0xc5d60000, 0xa1c50000, 0xb3a26770, 0x94a5c4e1, 0x6bb0419d, 0x551b3782,
-	  SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000),
+  0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000, 0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29 },
-	  SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5),
+{ 0x9cbb1800, 0xb0d30000, 0x92510000, 0xed930000, 0x593a4345, 0xe114d5f4, 0x430633da, 0x78cace29,
-	  SPH_C32(0x0c26f262) },
+  0xc8934400, 0x5a3e0000, 0x57870000, 0x4c560000, 0xea982435, 0x75b11115, 0x28b67247, 0x2dd1f9ab },
-	{ SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000),
+{ 0x29449c00, 0x64e70000, 0xf24b0000, 0xc2f30000, 0x0ede4e8f, 0x56c23745, 0xf3e04259, 0x8d0d9ec4,
-	  SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78),
+  0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000, 0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2 },
-	  SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400),
+{ 0x466d0c00, 0x08620000, 0xdd5d0000, 0xbadd0000, 0x6a927942, 0x441f2b93, 0x218ace6f, 0xbf2c0be2, // 7
-	  SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000),
+  0x6f299000, 0x6c850000, 0x2f160000, 0x782e0000, 0x644c37cd, 0x12dd1cd6, 0xd26a8c36, 0x32219526 },
-	  SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f),
+{ 0xf6800005, 0x3443c000, 0x24070000, 0x8f3d0000, 0x21373bfb, 0x0ab8d5ae, 0xcdc58b19, 0xd795ba31,
-	  SPH_C32(0xdc24e61f) },
+  0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000, 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88 },
-	{ SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000),
+{ 0xa67f0001, 0x71378000, 0x19fc0000, 0x96db0000, 0x3a8b6dfd, 0xebcaaef3, 0x2c6d478f, 0xac8e6c88,
-	  SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549),
+  0x50ff0004, 0x45744000, 0x3dfb0000, 0x19e60000, 0x1bbc5606, 0xe1727b5d, 0xe1a8cc96, 0x7b1bd6b9 },
-	  SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800),
+{ 0xf7750009, 0xcf3cc000, 0xc3d60000, 0x04920000, 0x029519a9, 0xf8e836ba, 0x7a87f14e, 0x9e16981a,
-	  SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000),
+  0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000, 0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320 },
-	  SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f),
+{ 0xd46a0000, 0x8dc8c000, 0xa5af0000, 0x4a290000, 0xfc4e427a, 0xc9b4866c, 0x98369604, 0xf746c320,
-	  SPH_C32(0x3daac2da) },
+  0x231f0009, 0x42f40000, 0x66790000, 0x4ebb0000, 0xfedb5bd3, 0x315cb0d6, 0xe2b1674a, 0x69505b3a },
-	{ SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000),
+{ 0x774400f0, 0xf15a0000, 0xf5b20000, 0x34140000, 0x89377e8c, 0x5a8bec25, 0x0bc3cd1e, 0xcf3775cb,
-	  SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1),
+  0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000, 0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574 },
-	  SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800),
+{ 0xf46c0050, 0x96180000, 0x14a50000, 0x031f0000, 0x42947eb8, 0x66bf7e19, 0x9ca470d2, 0x8a341574,
-	  SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000),
+  0x832800a0, 0x67420000, 0xe1170000, 0x370b0000, 0xcba30034, 0x3c34923c, 0x9767bdcc, 0x450360bf },
-	  SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da),
+{ 0xe8870170, 0x9d720000, 0x12db0000, 0xd4220000, 0xf2886b27, 0xa921e543, 0x4ef8b518, 0x618813b1, // 14
-	  SPH_C32(0x78cace29) },
+  0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000, 0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758 },
-	{ SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000),
+{ 0xb4370060, 0x0c4c0000, 0x56c20000, 0x5cae0000, 0x94541f3f, 0x3b3ef825, 0x1b365f3d, 0xf3d45758,
-	  SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4),
+  0x5cb00110, 0x913e0000, 0x44190000, 0x888c0000, 0x66dc7418, 0x921f1d66, 0x55ceea25, 0x925c44e9 },
-	  SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400),
+{ 0x0c720000, 0x49e50f00, 0x42790000, 0x5cea0000, 0x33aa301a, 0x15822514, 0x95a34b7b, 0xb44b0090,
-	  SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000),
+  0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000, 0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f },
-	  SPH_C32(0xea982435), SPH_C32(0x75b11115), SPH_C32(0x28b67247),
+{ 0xfe220000, 0xa7580500, 0x25d10000, 0xf7600000, 0x893178da, 0x1fd4f860, 0x4ed0a315, 0xa123ff9f,
-	  SPH_C32(0x2dd1f9ab) },
+  0xf2500000, 0xeebd0a00, 0x67a80000, 0xab8a0000, 0xba9b48c0, 0x0a56dd74, 0xdb73e86e, 0x1568ff0f },
-	{ SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000),
+{ 0x45180000, 0xa5b51700, 0xf96a0000, 0x3b480000, 0x1ecc142c, 0x231395d6, 0x16bca6b0, 0xdf33f4df,
-	  SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745),
+  0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000, 0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e },
-	  SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00),
+{ 0xb83d0000, 0x16710600, 0x379a0000, 0xf5b10000, 0x228161ac, 0xae48f145, 0x66241616, 0xc5c1eb3e,
-	  SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000),
+  0xfd250000, 0xb3c41100, 0xcef00000, 0xcef90000, 0x3c4d7580, 0x8d5b6493, 0x7098b0a6, 0x1af21fe1 },
-	  SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f),
+{ 0x75a40000, 0xc28b2700, 0x94a40000, 0x90f50000, 0xfb7857e0, 0x49ce0bae, 0x1767c483, 0xaedf667e,
-	  SPH_C32(0xbf2c0be2) },
+  0xd1660000, 0x1bbc0300, 0x9eec0000, 0xf6940000, 0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b },
-	{ SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000),
+{ 0xd1660000, 0x1bbc0300, 0x9eec0000, 0xf6940000, 0x03024527, 0xcf70fcf2, 0xb4431b17, 0x857f3c2b, // 21
-	  SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93),
+  0xa4c20000, 0xd9372400, 0x0a480000, 0x66610000, 0xf87a12c7, 0x86bef75c, 0xa324df94, 0x2ba05a55 },
-	  SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000),
+{ 0x75c90003, 0x0e10c000, 0xd1200000, 0xbaea0000, 0x8bc42f3e, 0x8758b757, 0xbb28761d, 0x00b72e2b,
-	  SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000),
+  0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000, 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254 },
-	  SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36),
+{ 0xeecf0001, 0x6f564000, 0xf33e0000, 0xa79e0000, 0xbdb57219, 0xb711ebc5, 0x4a3b40ba, 0xfeabf254,
-	  SPH_C32(0x32219526) },
+  0x9b060002, 0x61468000, 0x221e0000, 0x1d740000, 0x36715d27, 0x30495c92, 0xf11336a7, 0xfe1cdc7f },
-	{ SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000),
+{ 0x86790000, 0x3f390002, 0xe19ae000, 0x98560000, 0x9565670e, 0x4e88c8ea, 0xd3dd4944, 0x161ddab9,
-	  SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae),
+  0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000, 0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834 },
-	  SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001),
+{ 0x30b70000, 0xe5d00000, 0xf4f46000, 0x42c40000, 0x63b83d6a, 0x78ba9460, 0x21afa1ea, 0xb0a51834,
-	  SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000),
+  0xb6ce0000, 0xdae90002, 0x156e8000, 0xda920000, 0xf6dd5a64, 0x36325c8a, 0xf272e8ae, 0xa6b8c28d },
-	  SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f),
+{ 0x14190000, 0x23ca003c, 0x50df0000, 0x44b60000, 0x1b6c67b0, 0x3cf3ac75, 0x61e610b0, 0xdbcadb80,
-	  SPH_C32(0xac8e6c88) },
+  0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000, 0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7 },
-	{ SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000),
+{ 0xe3430000, 0x3a4e0014, 0xf2c60000, 0xaa4e0000, 0xdb1e42a6, 0x256bbe15, 0x123db156, 0x3a4e99d7,
-	  SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3),
+  0xf75a0000, 0x19840028, 0xa2190000, 0xeef80000, 0xc0722516, 0x19981260, 0x73dba1e6, 0xe1844257 },
-	  SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004),
+{ 0x54500000, 0x0671005c, 0x25ae0000, 0x6a1e0000, 0x2ea54edf, 0x664e8512, 0xbfba18c3, 0x7e715d17, // 28
-	  SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000),
+  0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000, 0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e },
-	  SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96),
+{ 0xbc8d0000, 0xfc3b0018, 0x19830000, 0xd10b0000, 0xae1878c4, 0x42a69856, 0x0012da37, 0x2c3b504e,
-	  SPH_C32(0x7b1bd6b9) },
+  0xe8dd0000, 0xfa4a0044, 0x3c2d0000, 0xbb150000, 0x80bd361b, 0x24e81d44, 0xbfa8c2f4, 0x524a0d59 },
-	{ SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000),
+{ 0x69510000, 0xd4e1009c, 0xc3230000, 0xac2f0000, 0xe4950bae, 0xcea415dc, 0x87ec287c, 0xbce1a3ce,
-	  SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba),
+  0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000, 0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173 },
-	  SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000),
+{ 0xc6730000, 0xaf8d000c, 0xa4c10000, 0x218d0000, 0x23111587, 0x7913512f, 0x1d28ac88, 0x378dd173,
-	  SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000),
+  0xaf220000, 0x7b6c0090, 0x67e20000, 0x8da20000, 0xc7841e29, 0xb7b744f3, 0x9ac484f4, 0x8b6c72bd },
-	  SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604),
+{ 0xcc140000, 0xa5630000, 0x5ab90780, 0x3b500000, 0x4bd013ff, 0x879b3418, 0x694348c1, 0xca5a87fe,
-	  SPH_C32(0xf746c320) },
+  0x819e0000, 0xec570000, 0x66320280, 0x95f30000, 0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa },
-	{ SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000),
+{ 0x819e0000, 0xec570000, 0x66320280, 0x95f30000, 0x5da92802, 0x48f43cbc, 0xe65aa22d, 0x8e67b7fa,
-	  SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c),
+  0x4d8a0000, 0x49340000, 0x3c8b0500, 0xaea30000, 0x16793bfd, 0xcf6f08a4, 0x8f19eaec, 0x443d3004 },
-	  SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009),
+{ 0x78230000, 0x12fc0000, 0xa93a0b80, 0x90a50000, 0x713e2879, 0x7ee98924, 0xf08ca062, 0x636f8bab,
-	  SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000),
+  0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000, 0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b },
-	  SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a),
+{ 0x02af0000, 0xb7280000, 0xba1c0300, 0x56980000, 0xba8d45d3, 0x8048c667, 0xa95c149a, 0xf4f6ea7b, // 35
-	  SPH_C32(0x69505b3a) },
+  0x7a8c0000, 0xa5d40000, 0x13260880, 0xc63d0000, 0xcbb36daa, 0xfea14f43, 0x59d0b4f8, 0x979961d0 },
-	{ SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000),
+{ 0xac480000, 0x1ba60000, 0x45fb1380, 0x03430000, 0x5a85316a, 0x1fb250b6, 0xfe72c7fe, 0x91e478f6,
-	  SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25),
+  0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000, 0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e },
-	  SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050),
+{ 0x1e4e0000, 0xdecf0000, 0x6df80180, 0x77240000, 0xec47079e, 0xf4a0694e, 0xcda31812, 0x98aa496e,
-	  SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000),
+  0xb2060000, 0xc5690000, 0x28031200, 0x74670000, 0xb6c236f4, 0xeb1239f8, 0x33d1dfec, 0x094e3198 },
-	  SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2),
+{ 0xaec30000, 0x9c4f0001, 0x79d1e000, 0x2c150000, 0x45cc75b3, 0x6650b736, 0xab92f78f, 0xa312567b,
-	  SPH_C32(0x8a341574) },
+  0xdb250000, 0x09290000, 0x49aac000, 0x81e10000, 0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e },
-	{ SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000),
+{ 0xdb250000, 0x09290000, 0x49aac000, 0x81e10000, 0xcafe6b59, 0x42793431, 0x43566b76, 0xe86cba2e,
-	  SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19),
+  0x75e60000, 0x95660001, 0x307b2000, 0xadf40000, 0x8f321eea, 0x24298307, 0xe8c49cf9, 0x4b7eec55 },
-	  SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0),
+{ 0x58430000, 0x807e0000, 0x78330001, 0xc66b3800, 0xe7375cdc, 0x79ad3fdd, 0xac73fe6f, 0x3a4479b1,
-	  SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000),
+  0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800, 0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6 },
-	  SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc),
+{ 0x1d5a0000, 0x2b720000, 0x488d0000, 0xaf611800, 0x25cb2ec5, 0xc879bfd0, 0x81a20429, 0x1e7536a6,
-	  SPH_C32(0x450360bf) },
+  0x45190000, 0xab0c0000, 0x30be0001, 0x690a2000, 0xc2fc7219, 0xb1d4800d, 0x2dd1fa46, 0x24314f17 },
-	{ SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000),
+{ 0xa53b0000, 0x14260000, 0x4e30001e, 0x7cae0000, 0x8f9e0dd5, 0x78dfaa3d, 0xf73168d8, 0x0b1b4946, // 42
-	  SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543),
+  0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000, 0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce },
-	  SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060),
+{ 0x07ed0000, 0xb2500000, 0x8774000a, 0x970d0000, 0x437223ae, 0x48c76ea4, 0xf4786222, 0x9075b1ce,
-	  SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000),
+  0xa2d60000, 0xa6760000, 0xc9440014, 0xeba30000, 0xccec2e7b, 0x3018c499, 0x03490afa, 0x9b6ef888 },
-	  SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d),
+{ 0x88980000, 0x1f940000, 0x7fcf002e, 0xfb4e0000, 0xf158079a, 0x61ae9167, 0xa895706c, 0xe6107494,
-	  SPH_C32(0xf3d45758) },
+  0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000, 0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463 },
-	{ SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000),
+{ 0x0bc20000, 0xdb630000, 0x7e88000c, 0x15860000, 0x91fd48f3, 0x7581bb43, 0xf460449e, 0xd8b61463,
-	  SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825),
+  0x835a0000, 0xc4f70000, 0x01470022, 0xeec80000, 0x60a54f69, 0x142f2a24, 0x5cf534f2, 0x3ea660f7 },
-	  SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110),
+{ 0x52500000, 0x29540000, 0x6a61004e, 0xf0ff0000, 0x9a317eec, 0x452341ce, 0xcf568fe5, 0x5303130f,
-	  SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000),
+  0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000, 0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691 },
-	  SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25),
+{ 0x538d0000, 0xa9fc0000, 0x9ef70006, 0x56ff0000, 0x0ae4004e, 0x92c5cdf9, 0xa9444018, 0x7f975691,
-	  SPH_C32(0x925c44e9) },
+  0x01dd0000, 0x80a80000, 0xf4960048, 0xa6000000, 0x90d57ea2, 0xd7e68c37, 0x6612cffd, 0x2c94459e },
-	{ SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000),
+{ 0xe6280000, 0x4c4b0000, 0xa8550000, 0xd3d002e0, 0xd86130b8, 0x98a7b0da, 0x289506b4, 0xd75a4897,
-	  SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514),
+  0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0, 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f },
-	  SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000),
+{ 0xf0c50000, 0x59230000, 0x45820000, 0xe18d00c0, 0x3b6d0631, 0xc2ed5699, 0xcbe0fe1c, 0x56a7b19f, // 49
-	  SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000),
+  0x16ed0000, 0x15680000, 0xedd70000, 0x325d0220, 0xe30c3689, 0x5a4ae643, 0xe375f8a8, 0x81fdf908 },
-	  SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315),
+{ 0xb4310000, 0x77330000, 0xb15d0000, 0x7fd004e0, 0x78a26138, 0xd116c35d, 0xd256d489, 0x4e6f74de,
-	  SPH_C32(0xa123ff9f) },
+  0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060, 0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539 },
-	{ SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000),
+{ 0xe3060000, 0xbdc10000, 0x87130000, 0xbff20060, 0x2eba0a1a, 0x8db53751, 0x73c5ab06, 0x5bd61539,
-	  SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860),
+  0x57370000, 0xcaf20000, 0x364e0000, 0xc0220480, 0x56186b22, 0x5ca3f40c, 0xa1937f8f, 0x15b961e7 },
-	  SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000),
+{ 0x02f20000, 0xa2810000, 0x873f0000, 0xe36c7800, 0x1e1d74ef, 0x073d2bd6, 0xc4c23237, 0x7f32259e,
-	  SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000),
+  0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800, 0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0 },
-	  SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e),
+{ 0xbadd0000, 0x13ad0000, 0xb7e70000, 0xf7282800, 0xdf45144d, 0x361ac33a, 0xea5a8d14, 0x2a2c18f0,
-	  SPH_C32(0x1568ff0f) },
+  0xb82f0000, 0xb12c0000, 0x30d80000, 0x14445000, 0xc15860a2, 0x3127e8ec, 0x2e98bf23, 0x551e3d6e },
-	{ SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000),
+{ 0x1e6c0000, 0xc4420000, 0x8a2e0000, 0xbcb6b800, 0x2c4413b6, 0x8bfdd3da, 0x6a0c1bc8, 0xb99dc2eb,
-	  SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6),
+  0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000, 0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f },
-	  SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000),
+{ 0x92560000, 0x1eda0000, 0xea510000, 0xe8b13000, 0xa93556a5, 0xebfb6199, 0xb15c2254, 0x33c5244f,
-	  SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000),
+  0x8c3a0000, 0xda980000, 0x607f0000, 0x54078800, 0x85714513, 0x6006b243, 0xdb50399c, 0x8a58e6a4 },
-	  SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616),
+{ 0x033d0000, 0x08b30000, 0xf33a0000, 0x3ac20007, 0x51298a50, 0x6b6e661f, 0x0ea5cfe3, 0xe6da7ffe, // 56
-	  SPH_C32(0xc5c1eb3e) },
+  0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002, 0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000 },
-	{ SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000),
+{ 0xa8da0000, 0x96be0000, 0x5c1d0000, 0x07da0002, 0x7d669583, 0x1f98708a, 0xbb668808, 0xda878000,
-	  SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145),
+  0xabe70000, 0x9e0d0000, 0xaf270000, 0x3d180005, 0x2c4f1fd3, 0x74f61695, 0xb5c347eb, 0x3c5dfffe },
-	  SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000),
+{ 0x01930000, 0xe7820000, 0xedfb0000, 0xcf0c000b, 0x8dd08d58, 0xbca3b42e, 0x063661e1, 0x536f9e7b,
-	  SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000),
+  0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003, 0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7 },
-	  SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6),
+{ 0x92280000, 0xdc850000, 0x57fa0000, 0x56dc0003, 0xbae92316, 0x5aefa30c, 0x90cef752, 0x7b1675d7,
-	  SPH_C32(0x1af21fe1) },
+  0x93bb0000, 0x3b070000, 0xba010000, 0x99d00008, 0x3739ae4e, 0xe64c1722, 0x96f896b3, 0x2879ebac },
-	{ SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000),
+{ 0x5fa80000, 0x56030000, 0x43ae0000, 0x64f30013, 0x257e86bf, 0x1311944e, 0x541e95bf, 0x8ea4db69,
-	  SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae),
+  0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001, 0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e },
-	  SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000),
+{ 0x00440000, 0x7f480000, 0xda7c0000, 0x2a230001, 0x3badc9cc, 0xa9b69c87, 0x030a9e60, 0xbe0a679e,
-	  SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000),
+  0x5fec0000, 0x294b0000, 0x99d20000, 0x4ed00012, 0x1ed34f73, 0xbaa708c9, 0x57140bdf, 0x30aebcf7 },
-	  SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17),
+{ 0xee930000, 0xd6070000, 0x92c10000, 0x2b9801e0, 0x9451287c, 0x3b6cfb57, 0x45312374, 0x201f6a64,
-	  SPH_C32(0x857f3c2b) },
+  0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0, 0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0 },
-	{ SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000),
+{ 0x7b280000, 0x57420000, 0xa9e50000, 0x634300a0, 0x9edb442f, 0x6d9995bb, 0x27f83b03, 0xc7ff60f0,
-	  SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2),
+  0x95bb0000, 0x81450000, 0x3b240000, 0x48db0140, 0x0a8a6c53, 0x56f56eec, 0x62c91877, 0xe7e00a94 }
 	  SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000),
 	  SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000),
 	  SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94),
 	  SPH_C32(0x2ba05a55) },
 	{ SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000),
 	  SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757),
 	  SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001),
 	  SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000),
 	  SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba),
 	  SPH_C32(0xfeabf254) },
 	{ SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000),
 	  SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5),
 	  SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002),
 	  SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000),
 	  SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7),
 	  SPH_C32(0xfe1cdc7f) },
 	{ SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000),
 	  SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea),
 	  SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000),
 	  SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000),
 	  SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea),
 	  SPH_C32(0xb0a51834) },
 	{ SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000),
 	  SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460),
 	  SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000),
 	  SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000),
 	  SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae),
 	  SPH_C32(0xa6b8c28d) },
 	{ SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000),
 	  SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75),
 	  SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000),
 	  SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000),
 	  SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156),
 	  SPH_C32(0x3a4e99d7) },
 	{ SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000),
 	  SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15),
 	  SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000),
 	  SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000),
 	  SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6),
 	  SPH_C32(0xe1844257) },
 	{ SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000),
 	  SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512),
 	  SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000),
 	  SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000),
 	  SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37),
 	  SPH_C32(0x2c3b504e) },
 	{ SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000),
 	  SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856),
 	  SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000),
 	  SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000),
 	  SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4),
 	  SPH_C32(0x524a0d59) },
 	{ SPH_C32(0x69510000), SPH_C32(0xd4e1009c), SPH_C32(0xc3230000),
 	  SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc),
 	  SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000),
 	  SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000),
 	  SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88),
 	  SPH_C32(0x378dd173) },
 	{ SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000),
 	  SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f),
 	  SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000),
 	  SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000),
 	  SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4),
 	  SPH_C32(0x8b6c72bd) },
 	{ SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780),
 	  SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418),
 	  SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000),
 	  SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000),
 	  SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d),
 	  SPH_C32(0x8e67b7fa) },
 	{ SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280),
 	  SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc),
 	  SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000),
 	  SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000),
 	  SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec),
 	  SPH_C32(0x443d3004) },
 	{ SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80),
 	  SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924),
 	  SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000),
 	  SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000),
 	  SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a),
 	  SPH_C32(0xf4f6ea7b) },
 	{ SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300),
 	  SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667),
 	  SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000),
 	  SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000),
 	  SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8),
 	  SPH_C32(0x979961d0) },
 	{ SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380),
 	  SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6),
 	  SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000),
 	  SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000),
 	  SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812),
 	  SPH_C32(0x98aa496e) },
 	{ SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180),
 	  SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e),
 	  SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000),
 	  SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000),
 	  SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec),
 	  SPH_C32(0x094e3198) },
 	{ SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000),
 	  SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736),
 	  SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000),
 	  SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000),
 	  SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76),
 	  SPH_C32(0xe86cba2e) },
 	{ SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000),
 	  SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431),
 	  SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000),
 	  SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000),
 	  SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9),
 	  SPH_C32(0x4b7eec55) },
 	{ SPH_C32(0x58430000), SPH_C32(0x807e0000), SPH_C32(0x78330001),
 	  SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd),
 	  SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000),
 	  SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800),
 	  SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429),
 	  SPH_C32(0x1e7536a6) },
 	{ SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000),
 	  SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0),
 	  SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000),
 	  SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000),
 	  SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46),
 	  SPH_C32(0x24314f17) },
 	{ SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e),
 	  SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d),
 	  SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000),
 	  SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000),
 	  SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222),
 	  SPH_C32(0x9075b1ce) },
 	{ SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a),
 	  SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4),
 	  SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000),
 	  SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000),
 	  SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa),
 	  SPH_C32(0x9b6ef888) },
 	{ SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e),
 	  SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167),
 	  SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000),
 	  SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000),
 	  SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e),
 	  SPH_C32(0xd8b61463) },
 	{ SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c),
 	  SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43),
 	  SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000),
 	  SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000),
 	  SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2),
 	  SPH_C32(0x3ea660f7) },
 	{ SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e),
 	  SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce),
 	  SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000),
 	  SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000),
 	  SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018),
 	  SPH_C32(0x7f975691) },
 	{ SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006),
 	  SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9),
 	  SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000),
 	  SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000),
 	  SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd),
 	  SPH_C32(0x2c94459e) },
 	{ SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000),
 	  SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da),
 	  SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000),
 	  SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0),
 	  SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c),
 	  SPH_C32(0x56a7b19f) },
 	{ SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000),
 	  SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699),
 	  SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000),
 	  SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220),
 	  SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8),
 	  SPH_C32(0x81fdf908) },
 	{ SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000),
 	  SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d),
 	  SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000),
 	  SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060),
 	  SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06),
 	  SPH_C32(0x5bd61539) },
 	{ SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000),
 	  SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751),
 	  SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000),
 	  SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480),
 	  SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f),
 	  SPH_C32(0x15b961e7) },
 	{ SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000),
 	  SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6),
 	  SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000),
 	  SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800),
 	  SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14),
 	  SPH_C32(0x2a2c18f0) },
 	{ SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000),
 	  SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a),
 	  SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000),
 	  SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000),
 	  SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23),
 	  SPH_C32(0x551e3d6e) },
 	{ SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000),
 	  SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da),
 	  SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000),
 	  SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000),
 	  SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254),
 	  SPH_C32(0x33c5244f) },
 	{ SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000),
 	  SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199),
 	  SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000),
 	  SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800),
 	  SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c),
 	  SPH_C32(0x8a58e6a4) },
 	{ SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000),
 	  SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f),
 	  SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000),
 	  SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002),
 	  SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808),
 	  SPH_C32(0xda878000) },
 	{ SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000),
 	  SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a),
 	  SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000),
 	  SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005),
 	  SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb),
 	  SPH_C32(0x3c5dfffe) },
 	{ SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000),
 	  SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e),
 	  SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000),
 	  SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003),
 	  SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752),
 	  SPH_C32(0x7b1675d7) },
 	{ SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000),
 	  SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c),
 	  SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000),
 	  SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008),
 	  SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3),
 	  SPH_C32(0x2879ebac) },
 	{ SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000),
 	  SPH_C32(0x64f30013), SPH_C32(0x257e86bf), SPH_C32(0x1311944e),
 	  SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000),
 	  SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001),
 	  SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60),
 	  SPH_C32(0xbe0a679e) },
 	{ SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000),
 	  SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87),
 	  SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000),
 	  SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012),
 	  SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf),
 	  SPH_C32(0x30aebcf7) },
 	{ SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000),
 	  SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57),
 	  SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000),
 	  SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0),
 	  SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03),
 	  SPH_C32(0xc7ff60f0) },
 	{ SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000),
 	  SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb),
 	  SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000),
 	  SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140),
 	  SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877),
 	  SPH_C32(0xe7e00a94) }
 };
 __global__
@ -598,12 +328,12 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *
 		uint32_t *Hash = (uint32_t*)&g_hash[hashPosition<<3];
 		unsigned char *h1 = (unsigned char *)Hash;
-		uint32_t c0 = SPH_C32(0x73746565), c1 = SPH_C32(0x6c706172), c2 = SPH_C32(0x6b204172), c3 = SPH_C32(0x656e6265);
+		uint32_t c0 = 0x73746565, c1 = 0x6c706172, c2 = 0x6b204172, c3 = 0x656e6265;
-		uint32_t c4 = SPH_C32(0x72672031), c5 = SPH_C32(0x302c2062), c6 = SPH_C32(0x75732032), c7 = SPH_C32(0x3434362c);
+		uint32_t c4 = 0x72672031, c5 = 0x302c2062, c6 = 0x75732032, c7 = 0x3434362c;
-		uint32_t c8 = SPH_C32(0x20422d33), c9 = SPH_C32(0x30303120), cA = SPH_C32(0x4c657576), cB = SPH_C32(0x656e2d48);
+		uint32_t c8 = 0x20422d33, c9 = 0x30303120, cA = 0x4c657576, cB = 0x656e2d48;
-		uint32_t cC = SPH_C32(0x65766572), cD = SPH_C32(0x6c65652c), cE = SPH_C32(0x2042656c), cF = SPH_C32(0x6769756d);
+		uint32_t cC = 0x65766572, cD = 0x6c65652c, cE = 0x2042656c, cF = 0x6769756d;
 		uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, mD, mE, mF;
 		uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
 		uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, mD, mE, mF;
 		uint32_t *tp, db, dm;
 		for(int i = 0; i < 64; i += 8) {
@ -637,16 +367,16 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *
 			T_BIG;
 		}
 		// precomputed for 64 bytes blocks ?
 		tp = &d_T512[0][0] + 112;
-
+		m0 = tp[ 0]; m1 = tp[ 1];
-		m0 = *(tp+ 0); m1 = *(tp+ 1);
+		m2 = tp[ 2]; m3 = tp[ 3];
-		m2 = *(tp+ 2); m3 = *(tp+ 3);
+		m4 = tp[ 4]; m5 = tp[ 5];
-		m4 = *(tp+ 4); m5 = *(tp+ 5);
+		m6 = tp[ 6]; m7 = tp[ 7];
-		m6 = *(tp+ 6); m7 = *(tp+ 7);
+		m8 = tp[ 8]; m9 = tp[ 9];
-		m8 = *(tp+ 8); m9 = *(tp+ 9);
+		mA = tp[10]; mB = tp[11];
-		mA = *(tp+10); mB = *(tp+11);
+		mC = tp[12]; mD = tp[13];
-		mC = *(tp+12); mD = *(tp+13);
+		mE = tp[14]; mF = tp[15];
 		mE = *(tp+14); mF = *(tp+15);
 		for( int r = 0; r < 6; r += 2 ) {
 			ROUND_BIG(r, d_alpha_n);
@ -655,15 +385,14 @@ void x13_hamsi512_gpu_hash_64(uint32_t threads, uint32_t startNounce, uint64_t *
 		T_BIG;
 		tp = &d_T512[0][0] + 784;
-
+		m0 = tp[ 0]; m1 = tp[ 1];
-		m0 = *(tp+ 0); m1 = *(tp+ 1);
+		m2 = tp[ 2]; m3 = tp[ 3];
-		m2 = *(tp+ 2); m3 = *(tp+ 3);
+		m4 = tp[ 4]; m5 = tp[ 5];
-		m4 = *(tp+ 4); m5 = *(tp+ 5);
+		m6 = tp[ 6]; m7 = tp[ 7];
-		m6 = *(tp+ 6); m7 = *(tp+ 7);
+		m8 = tp[ 8]; m9 = tp[ 9];
-		m8 = *(tp+ 8); m9 = *(tp+ 9);
+		mA = tp[10]; mB = tp[11];
-		mA = *(tp+10); mB = *(tp+11);
+		mC = tp[12]; mD = tp[13];
-		mC = *(tp+12); mD = *(tp+13);
+		mE = tp[14]; mF = tp[15];
 		mE = *(tp+14); mF = *(tp+15);
 		for( int r = 0; r < 12; r += 2 ) {
 			ROUND_BIG(r, d_alpha_f);
@ -696,3 +425,127 @@ void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce
 	x13_hamsi512_gpu_hash_64<<<grid, block>>>(threads, startNounce, (uint64_t*)d_hash, d_nonceVector);
 	//MyStreamSynchronize(NULL, order, thr_id);
 }
 // Constant-memory copy of the 80-byte input consumed by x16_hamsi512_gpu_hash_80
 __constant__ static uint64_t c_PaddedMessage80[10];
 /**
  * Upload the hamsi512-80 input to the device constant buffer.
  * pdata: host pointer from which sizeof(c_PaddedMessage80) bytes (10 x 8) are read.
  * The return code of cudaMemcpyToSymbol is not inspected.
  */
 __host__
 void x16_hamsi512_setBlock_80(void *pdata)
 {
 	const size_t inputBytes = sizeof(c_PaddedMessage80);
 	const size_t dstOffset = 0;
 	cudaMemcpyToSymbol(c_PaddedMessage80, pdata, inputBytes, dstOffset, cudaMemcpyHostToDevice);
 }
 /**
  * Hamsi-512 over an 80-byte input, one hash per thread.
  *
  * threads    : number of hashes to compute; threads beyond this count do nothing.
  * startNonce : nonce of thread 0; thread t hashes with nonce (startNonce + t),
  *              byte-swapped into the last 32-bit word of the input.
  * g_hash     : output buffer, 8 x uint64_t (64 bytes) per thread.
  *
  * The input is taken from c_PaddedMessage80 (see x16_hamsi512_setBlock_80).
  * Launch layout: 1D grid / 1D block (the host wrapper uses 128 threads per block).
  */
 __global__
 void x16_hamsi512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash)
 {
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		// The buffer is filled through uint2 / uint32_t stores below, so it must be
 		// 8-byte aligned; a plain unsigned char array only guarantees 1-byte alignment.
 		__align__(8) unsigned char h1[80];
 		#pragma unroll
 		for (int i = 0; i < 10; i++)
 			((uint2*)h1)[i] = ((uint2*)c_PaddedMessage80)[i];
 		//((uint64_t*)h1)[9] = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(startNonce + thread));
 		((uint32_t*)h1)[19] = cuda_swab32(startNonce + thread);
 		// initial chaining value
 		uint32_t c0 = 0x73746565, c1 = 0x6c706172, c2 = 0x6b204172, c3 = 0x656e6265;
 		uint32_t c4 = 0x72672031, c5 = 0x302c2062, c6 = 0x75732032, c7 = 0x3434362c;
 		uint32_t c8 = 0x20422d33, c9 = 0x30303120, cA = 0x4c657576, cB = 0x656e2d48;
 		uint32_t cC = 0x65766572, cD = 0x6c65652c, cE = 0x2042656c, cF = 0x6769756d;
 		uint32_t h[16] = { c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, cA, cB, cC, cD, cE, cF };
 		uint32_t m0, m1, m2, m3, m4, m5, m6, m7, m8, m9, mA, mB, mC, mD, mE, mF;
 		uint32_t *tp, db, dm;
 		// absorb the 80 input bytes, 8 bytes per iteration
 		for(int i = 0; i < 80; i += 8)
 		{
 			m0 = 0; m1 = 0; m2 = 0; m3 = 0; m4 = 0; m5 = 0; m6 = 0; m7 = 0;
 			m8 = 0; m9 = 0; mA = 0; mB = 0; mC = 0; mD = 0; mE = 0; mF = 0;
 			tp = &d_T512[0][0];
 			#pragma unroll
 			for (int u = 0; u < 8; u++) {
 				db = h1[i + u];
 				#pragma unroll 2
 				for (int v = 0; v < 8; v++, db >>= 1) {
 					// dm is all-ones when the current input bit is set, else zero:
 					// each set bit XORs one 16-word row of d_T512 into m0..mF
 					dm = -(uint32_t)(db & 1);
 					m0 ^= dm & tp[ 0]; m1 ^= dm & tp[ 1];
 					m2 ^= dm & tp[ 2]; m3 ^= dm & tp[ 3];
 					m4 ^= dm & tp[ 4]; m5 ^= dm & tp[ 5];
 					m6 ^= dm & tp[ 6]; m7 ^= dm & tp[ 7];
 					m8 ^= dm & tp[ 8]; m9 ^= dm & tp[ 9];
 					mA ^= dm & tp[10]; mB ^= dm & tp[11];
 					mC ^= dm & tp[12]; mD ^= dm & tp[13];
 					mE ^= dm & tp[14]; mF ^= dm & tp[15];
 					tp += 16;
 				}
 			}
 			#pragma unroll
 			for (int r = 0; r < 6; r++) {
 				ROUND_BIG(r, d_alpha_n);
 			}
 			T_BIG;
 		}
 		// Same table expansion as the loop above, but reading the 8 bytes of endtag[]
 		#define INPUT_BIG { \
 			m0 = 0; m1 = 0; m2 = 0; m3 = 0; m4 = 0; m5 = 0; m6 = 0; m7 = 0; \
 			m8 = 0; m9 = 0; mA = 0; mB = 0; mC = 0; mD = 0; mE = 0; mF = 0; \
 			tp = &d_T512[0][0]; \
 			for (int u = 0; u < 8; u++) { \
 				db = endtag[u]; \
 				for (int v = 0; v < 8; v++, db >>= 1) { \
 					dm = -(uint32_t)(db & 1); \
 					m0 ^= dm & tp[ 0]; m1 ^= dm & tp[ 1]; \
 					m2 ^= dm & tp[ 2]; m3 ^= dm & tp[ 3]; \
 					m4 ^= dm & tp[ 4]; m5 ^= dm & tp[ 5]; \
 					m6 ^= dm & tp[ 6]; m7 ^= dm & tp[ 7]; \
 					m8 ^= dm & tp[ 8]; m9 ^= dm & tp[ 9]; \
 					mA ^= dm & tp[10]; mB ^= dm & tp[11]; \
 					mC ^= dm & tp[12]; mD ^= dm & tp[13]; \
 					mE ^= dm & tp[14]; mF ^= dm & tp[15]; \
 					tp += 16; \
 				} \
 			} \
 		}
 		// close: first closing block is the 0x80 marker followed by zeros
 		uint8_t endtag[8] = { 0x80, 0x00, 0x00, 0x00,  0x00, 0x00, 0x00, 0x00 };
 		INPUT_BIG;
 		#pragma unroll
 		for (int r = 0; r < 6; r++) {
 			ROUND_BIG(r, d_alpha_n);
 		}
 		T_BIG;
 		// second closing block: bytes 6..7 = 0x02,0x80, i.e. 0x0280 = 640
 		// (presumably the message length in bits, 80 bytes * 8)
 		endtag[0] = endtag[1] = 0x00;
 		endtag[6] = 0x02;
 		endtag[7] = 0x80;
 		INPUT_BIG;
 		// PF_BIG: final block uses 12 rounds with the d_alpha_f constants
 		#pragma unroll
 		for(int r = 0; r < 12; r++) {
 			ROUND_BIG(r, d_alpha_f);
 		}
 		T_BIG;
 		// 64-bit index so that (thread << 3) cannot overflow 32 bits
 		uint64_t hashPosition = thread;
 		uint32_t *Hash = (uint32_t*)&g_hash[hashPosition << 3];
 		#pragma unroll 16
 		for(int i = 0; i < 16; i++)
 			Hash[i] = cuda_swab32(h[i]);
 		#undef INPUT_BIG
 	}
 }
 /**
  * Host launcher for x16_hamsi512_gpu_hash_80.
  * Uses 128 threads per block and enough blocks to cover `threads` hashes.
  * thr_id is accepted for interface symmetry but not used here.
  * The launch is issued on the default stream; no error check or sync is done.
  */
 __host__
 void x16_hamsi512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash)
 {
 	const uint32_t tpb = 128;
 	// ceil(threads / tpb)
 	const uint32_t blocks = (threads + tpb - 1) / tpb;
 	dim3 block(tpb);
 	dim3 grid(blocks);
 	x16_hamsi512_gpu_hash_80 <<<grid, block>>> (threads, startNounce, (uint64_t*)d_hash);
 }
--- a/x15/cuda_x15_whirlpool_sm3.cu
+++ b/x15/cuda_x15_whirlpool_sm3.cu
@ -1998,7 +1998,7 @@ const int i0, const int i1, const int i2, const int i3, const int i4, const int
 __global__
-void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outputHash, int swab)
+void oldwhirlpool_gpu_hash_80(const uint32_t threads, const uint32_t startNounce, void *outputHash, int swab)
 {
 	__shared__ uint64_t sharedMemory[2048];
@ -2014,7 +2014,8 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
 			sharedMemory[threadIdx.x+1792] = mixTob7Tox[threadIdx.x];
 		#endif
 	}
-	__threadfence_block(); // ensure shared mem is ready
+	//__threadfence_block(); // ensure shared mem is ready
 	__syncthreads();
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
@ -2028,7 +2029,8 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
 		uint64_t state[8];
 		#pragma unroll 8
 		for (int i=0; i < 8; i++) {
-			state[i] = c_PaddedMessage80[i];
+			//state[i] = c_PaddedMessage80[i];
 			AS_UINT2(&state[i]) = AS_UINT2(&c_PaddedMessage80[i]);
 		}
 #else
 		#pragma unroll 8
@ -2050,6 +2052,7 @@ void oldwhirlpool_gpu_hash_80(uint32_t threads, uint32_t startNounce, void *outp
 			state[i] = xor1(n[i],c_PaddedMessage80[i]);
 		}
 #endif
 		/// round 2 ///////
 		//////////////////////////////////
 		n[0] = c_PaddedMessage80[8];    //read data
@ -2331,7 +2334,7 @@ extern uint32_t whirlpool512_finalhash_64(int thr_id, uint32_t threads, uint32_t
 }
 __host__
-void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_outputHash)
+void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_outputHash)
 {
 	dim3 grid((threads + threadsperblock-1) / threadsperblock);
 	dim3 block(threadsperblock);
@ -2339,7 +2342,7 @@ void whirlpool512_hash_80_sm3(int thr_id, uint32_t threads, uint32_t startNounce
 	if (threads < 256)
 		applog(LOG_WARNING, "whirlpool requires a minimum of 256 threads to fetch constant tables!");
-	oldwhirlpool_gpu_hash_80<<<grid, block>>>(threads, startNounce, d_outputHash, 1);
+	oldwhirlpool_gpu_hash_80<<<grid, block>>>(threads, startNonce, d_outputHash, 1);
 }
 extern void whirl_midstate(void *state, const void *input);
@ -2363,3 +2366,54 @@ void whirlpool512_setBlock_80_sm3(void *pdata, const void *ptarget)
 	cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, 128, 0, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(pTarget, ptarget, 32, 0, cudaMemcpyHostToDevice);
 }
 // ------------------------------------------------------------------------------------------------
 /**
  * Upload the whirlpool lookup tables to their device symbols.
  * InitVector_RC and mixTob0Tox are always copied; the seven remaining
  * tables (256 * 8 bytes each) only when USE_ALL_TABLES is enabled.
  * Neither thr_id nor threads is used; copy return codes are not checked.
  */
 __host__
 void x16_whirlpool512_init(int thr_id, uint32_t threads)
 {
 	const size_t atStart = 0;
 	cudaMemcpyToSymbol(InitVector_RC, plain_RC, sizeof(plain_RC), atStart, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(mixTob0Tox, plain_T0, sizeof(plain_T0), atStart, cudaMemcpyHostToDevice);
 #if USE_ALL_TABLES
 	const size_t tableBytes = (256 * 8);
 	cudaMemcpyToSymbol(mixTob1Tox, plain_T1, tableBytes, atStart, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(mixTob2Tox, plain_T2, tableBytes, atStart, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(mixTob3Tox, plain_T3, tableBytes, atStart, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(mixTob4Tox, plain_T4, tableBytes, atStart, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(mixTob5Tox, plain_T5, tableBytes, atStart, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(mixTob6Tox, plain_T6, tableBytes, atStart, cudaMemcpyHostToDevice);
 	cudaMemcpyToSymbol(mixTob7Tox, plain_T7, tableBytes, atStart, cudaMemcpyHostToDevice);
 #endif
 }
 extern void whirlpool_midstate(void *state, const void *input);
 /**
  * Build the 128-byte padded whirlpool input and upload it to c_PaddedMessage80.
  * Layout: 80 bytes from pdata, then 0x80, then zeros up to 128 bytes.
  * With HOST_MIDSTATE enabled, the first 64 bytes are replaced by the value
  * whirlpool_midstate() computes from pdata.
  */
 __host__
 void x16_whirlpool512_setBlock_80(void *pdata)
 {
 	// zero-fill first, then lay the message and the end marker on top
 	unsigned char PaddedMessage[128] = { 0 };
 	memcpy(PaddedMessage, pdata, 80);
 	PaddedMessage[80] = 0x80; /* ending */
 #if HOST_MIDSTATE
 	// compute constant first block
 	unsigned char midstate[64] = { 0 };
 	whirlpool_midstate(midstate, pdata);
 	memcpy(PaddedMessage, midstate, sizeof(midstate));
 #endif
 	cudaMemcpyToSymbol(c_PaddedMessage80, PaddedMessage, sizeof(PaddedMessage), 0, cudaMemcpyHostToDevice);
 }
 /**
  * Host launcher for the 80-byte whirlpool kernel (oldwhirlpool_gpu_hash_80),
  * called with its last argument (swab) set to 1.
  * Logs a warning when fewer than 256 threads are requested.
  * thr_id is not used here.
  */
 __host__
 void x16_whirlpool512_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_outputHash)
 {
 	if (threads < 256)
 		applog(LOG_WARNING, "whirlpool requires a minimum of 256 threads to fetch constant tables!");
 	const int swab = 1;
 	dim3 block(threadsperblock);
 	// ceil(threads / threadsperblock)
 	dim3 grid((threads + threadsperblock - 1) / threadsperblock);
 	oldwhirlpool_gpu_hash_80 <<<grid, block>>> (threads, startNonce, d_outputHash, swab);
 }
--- a/x16/cuda_x16.h
+++ b/x16/cuda_x16.h
@ -0,0 +1,80 @@
 #include "x11/cuda_x11.h"
 // Prototypes of the host-side entry points used by the x16 algos.
 // Naming convention visible below: *_init / *_free manage per-device state,
 // *_setBlock_80 uploads the 80-byte input, *_hash_80 / *_hash_64 launch the kernels.
 // ---- 64 bytes kernels (hash an existing 64-byte state in d_hash)
 extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads);
 extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
 extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void x13_fugue512_cpu_free(int thr_id);
 extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads);
 extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int flag);
 extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
 extern void x15_whirlpool_cpu_free(int thr_id);
 extern void x17_sha512_cpu_init(int thr_id, uint32_t threads);
 extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash);
 extern void x17_haval256_cpu_init(int thr_id, uint32_t threads);
 extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, const int outlen);
 void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order);
 // ---- optimised but non compatible kernels
 void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
 // ---- 80 bytes kernels
 void quark_bmw512_cpu_setBlock_80(void *pdata);
 void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order);
 void groestl512_setBlock_80(int thr_id, uint32_t *endiandata);
 void groestl512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
 void skein512_cpu_setBlock_80(void *pdata);
 void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int swap);
 void qubit_luffa512_cpu_init(int thr_id, uint32_t threads);
 void qubit_luffa512_cpu_setBlock_80(void *pdata);
 void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order);
 void jh512_setBlock_80(int thr_id, uint32_t *endiandata);
 void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
 void keccak512_setBlock_80(int thr_id, uint32_t *endiandata);
 void keccak512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
 void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata);
 void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
 void x11_shavite512_setBlock_80(void *pdata);
 void x11_shavite512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int order);
 void x16_shabal512_setBlock_80(void *pdata);
 void x16_shabal512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
 void x16_simd512_setBlock_80(void *pdata);
 void x16_simd512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
 void x16_echo512_cuda_init(int thr_id, const uint32_t threads);
 void x16_echo512_setBlock_80(void *pdata);
 void x16_echo512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
 void x16_hamsi512_setBlock_80(void *pdata);
 void x16_hamsi512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
 void x16_fugue512_cpu_init(int thr_id, uint32_t threads);
 void x16_fugue512_cpu_free(int thr_id);
 void x16_fugue512_setBlock_80(void *pdata);
 void x16_fugue512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
 void x16_whirlpool512_init(int thr_id, uint32_t threads);
 void x16_whirlpool512_setBlock_80(void* endiandata);
 void x16_whirlpool512_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
 void x16_sha512_setBlock_80(void *pdata);
 void x16_sha512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash);
--- a/x16/cuda_x16_echo512.cu
+++ b/x16/cuda_x16_echo512.cu
@ -0,0 +1,214 @@
 /**
 * echo512-80 cuda kernel for X16R algorithm
 *
 * tpruvot 2018 - GPL code
 */
 #include <stdio.h>
 #include <memory.h>
 #include "cuda_helper.h"
 extern __device__ __device_builtin__ void __threadfence_block(void);
 #include "../x11/cuda_x11_aes.cuh"
 /**
  * Apply two aes_round() steps to the 4-word block x0..x3 in place.
  * The first step takes k0 as an extra argument, the second uses the overload
  * without it; k0 is then incremented by one for the next caller.
  * sharedMemory: lookup tables passed straight through to aes_round().
  */
 __device__ __forceinline__ void AES_2ROUND(const uint32_t* __restrict__ sharedMemory,
 	uint32_t &x0, uint32_t &x1, uint32_t &x2, uint32_t &x3,
 	uint32_t &k0)
 {
 	// intermediate block between the two rounds
 	uint32_t t0, t1, t2, t3;
 	aes_round(sharedMemory, x0, x1, x2, x3, k0, t0, t1, t2, t3);
 	// second round writes its result back into the caller's words
 	aes_round(sharedMemory, t0, t1, t2, t3, x0, x1, x2, x3);
 	k0 += 1;
 }
 /**
  * One round over the 64-word state W: sub-words, shift rows, mix columns.
  * sharedMemory : lookup tables forwarded to AES_2ROUND.
  * W            : 64 x uint32_t state, treated as 16 groups of 4 words; updated in place.
  * k0           : running counter passed to AES_2ROUND for each of the 16 groups
  *                (AES_2ROUND increments it, so it advances by 16 per call).
  */
 __device__
 static void echo_round(uint32_t* const sharedMemory, uint32_t *W, uint32_t &k0)
 {
 	// Big Sub Words
 	#pragma unroll 16
 	for (int idx = 0; idx < 16; idx++) {
 		AES_2ROUND(sharedMemory, W[(idx << 2) + 0], W[(idx << 2) + 1], W[(idx << 2) + 2], W[(idx << 2) + 3], k0);
 	}
 	// Shift Rows
 	// In-place permutation of the 4-word groups; the statement order matters
 	// because only four values are saved in t[] before being overwritten.
 	#pragma unroll 4
 	for (int i = 0; i < 4; i++)
 	{
 		uint32_t t[4];
 		/// 1, 5, 9, 13
 		t[0] = W[i +  4];
 		t[1] = W[i +  8];
 		t[2] = W[i + 24];
 		t[3] = W[i + 60];
 		W[i +  4] = W[i + 20];
 		W[i +  8] = W[i + 40];
 		W[i + 24] = W[i + 56];
 		W[i + 60] = W[i + 44];
 		W[i + 20] = W[i + 36];
 		W[i + 40] = t[1];
 		W[i + 56] = t[2];
 		W[i + 44] = W[i + 28];
 		W[i + 28] = W[i + 12];
 		W[i + 12] = t[3];
 		W[i + 36] = W[i + 52];
 		W[i + 52] = t[0];
 	}
 	// Mix Columns
 	// i selects the word inside a group, idx the block of four groups being mixed
 	#pragma unroll 4
 	for (int i = 0; i < 4; i++)
 	{
 		#pragma unroll 4
 		for (int idx = 0; idx < 64; idx += 16)
 		{
 			uint32_t a[4];
 			a[0] = W[idx + i];
 			a[1] = W[idx + i + 4];
 			a[2] = W[idx + i + 8];
 			a[3] = W[idx + i + 12];
 			uint32_t ab = a[0] ^ a[1];
 			uint32_t bc = a[1] ^ a[2];
 			uint32_t cd = a[2] ^ a[3];
 			uint32_t t, t2, t3;
 			// top bit of each of the four packed bytes
 			t  = (ab & 0x80808080);
 			t2 = (bc & 0x80808080);
 			t3 = (cd & 0x80808080);
 			// per-byte doubling: shift the low 7 bits left and XOR 27 (0x1b)
 			// into every byte whose top bit was set
 			uint32_t abx = (t  >> 7) * 27U ^ ((ab^t)  << 1);
 			uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
 			uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
 			W[idx + i] = bc ^ a[3] ^ abx;
 			W[idx + i +  4] = a[0] ^ cd ^ bcx;
 			W[idx + i +  8] = ab ^ a[3] ^ cdx;
 			W[idx + i + 12] = ab ^ a[2] ^ (abx ^ bcx ^ cdx);
 		}
 	}
 }
 /**
  * Echo-512 of one 80-byte input: build the padded block, run 10 rounds, write 64 bytes.
  * sharedMemory : lookup tables used by echo_round().
  * data         : 20 x uint32_t input; words 0..18 are used, word 19 is replaced by the nonce.
  *                Read two words at a time through AS_UINT2, so it must be 8-byte aligned.
  * nonce        : stored byte-swapped as input word 19.
  * hash         : receives 16 x uint32_t; written through AS_UINT2 (8-byte aligned).
  */
 __device__ __forceinline__
 void cuda_echo_round_80(uint32_t *const __restrict__ sharedMemory, uint32_t *const __restrict__ data, const uint32_t nonce, uint32_t *hash)
 {
 	uint32_t h[29]; // <= 127 bytes input
 	// copy input words 0..17 as nine 64-bit moves, then word 18 on its own
 	#pragma unroll 8
 	for (int i = 0; i < 18; i += 2)
 		AS_UINT2(&h[i]) = AS_UINT2(&data[i]);
 	h[18] = data[18];
 	h[19] = cuda_swab32(nonce);
 	// padding: 0x80 marker right after the 80 message bytes, then zeros
 	h[20] = 0x80;
 	h[21] = h[22] = h[23] = h[24] = h[25] = h[26] = 0;
 	//((uint8_t*)h)[80] = 0x80;
 	//((uint8_t*)h)[128-17] = 0x02;
 	//((uint8_t*)h)[128-16] = 0x80;
 	//((uint8_t*)h)[128-15] = 0x02;
 	// word forms of the three byte stores commented out above (bytes 111, 112, 113)
 	h[27] = 0x2000000;
 	h[28] = 0x280;
 	//h[29] = h[30] = h[31] = 0;
 	uint32_t k0 = 640; // bitlen
 	uint32_t W[64];
 	// W[0..31]: eight 4-word groups, each initialised to {512, 0, 0, 0}
 	#pragma unroll 8
 	for (int i = 0; i < 32; i+=4) {
 		W[i] = 512; // L
 		W[i+1] = 0; // H
 		W[i+2] = 0; // X
 		W[i+3] = 0;
 	}
 	// keep the first 16 initial words for the final feed-forward
 	uint32_t Z[16];
 	#pragma unroll
 	for (int i = 0;  i<16; i++) Z[i] = W[i];
 	// W[32..60]: the 29 padded message words; W[61..63]: zero
 	#pragma unroll
 	for (int i = 32; i<61; i++) W[i] = h[i - 32];
 	#pragma unroll
 	for (int i = 61; i<64; i++) W[i] = 0;
 	for (int i = 0; i < 10; i++)
 		echo_round(sharedMemory, W, k0);
 	// feed-forward: initial value ^ message ^ both halves of the final state
 	#pragma unroll 16
 	for (int i = 0; i < 16; i++) {
 		Z[i] ^= h[i] ^ W[i] ^ W[i + 32];
 	}
 	#pragma unroll 8
 	for (int i = 0; i < 16; i += 2)
 		AS_UINT2(&hash[i]) = AS_UINT2(&Z[i]);
 }
 /**
  * Copy the four 256-entry tables d_AES0..d_AES3 into sharedMemory at
  * offsets 0, 256, 512 and 768. Only threads 0..127 take part, each copying
  * entries t and t+128 of every table, so at least 128 threads per block
  * are needed to fill all 1024 words. No barrier is issued here.
  */
 __device__ __forceinline__
 void echo_gpu_init(uint32_t *const __restrict__ sharedMemory)
 {
 	/* each participating thread fills two uint32 per table */
 	const uint32_t lo = threadIdx.x;
 	if (lo < 128) {
 		const uint32_t hi = lo + 64 * 2;
 		// table 0
 		sharedMemory[lo] = d_AES0[lo];
 		sharedMemory[hi] = d_AES0[hi];
 		// table 1
 		sharedMemory[lo + 256] = d_AES1[lo];
 		sharedMemory[hi + 256] = d_AES1[hi];
 		// table 2
 		sharedMemory[lo + 512] = d_AES2[lo];
 		sharedMemory[hi + 512] = d_AES2[hi];
 		// table 3
 		sharedMemory[lo + 768] = d_AES3[lo];
 		sharedMemory[hi + 768] = d_AES3[hi];
 	}
 }
 /**
  * One-time setup for the echo512-80 kernel: delegates to aes_cpu_init(thr_id).
  * The threads argument is accepted but not used.
  */
 __host__
 void x16_echo512_cuda_init(int thr_id, const uint32_t threads)
 {
 	(void)threads; // unused
 	aes_cpu_init(thr_id);
 }
 // 80-byte input for the echo512-80 kernel.
 // 8-byte aligned because cuda_echo_round_80 reads it two words at a time
 // through AS_UINT2; a plain uint32_t array only guarantees 4-byte alignment.
 __constant__ static __align__(8) uint32_t c_PaddedMessage80[20];
 /**
  * Upload the echo512-80 input to the device constant buffer.
  * endiandata: host pointer from which sizeof(c_PaddedMessage80) bytes (20 x 4) are read.
  * The return code of cudaMemcpyToSymbol is not inspected.
  */
 __host__
 void x16_echo512_setBlock_80(void *endiandata)
 {
 	cudaMemcpyToSymbol(c_PaddedMessage80, endiandata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
 }
 /**
  * Echo-512 over the 80-byte input in c_PaddedMessage80, one hash per thread.
  * threads    : number of hashes to compute; extra threads only help fill shared memory.
  * startNonce : nonce of thread 0; thread t uses (startNonce + t).
  * g_hash     : output buffer, 8 x uint64_t (64 bytes) per thread.
  * Launch with 128 threads per block: echo_gpu_init needs threads 0..127
  * to populate the 1024-word shared table.
  */
 __global__ __launch_bounds__(128, 7) /* will force 72 registers */
 void x16_echo512_gpu_hash_80(uint32_t threads, uint32_t startNonce, uint64_t *g_hash)
 {
 	__shared__ uint32_t sharedMemory[1024];
 	echo_gpu_init(sharedMemory);
 	// Block-wide barrier: each thread reads table entries written by other
 	// threads (and other warps). A memory fence alone does not wait for them,
 	// so __syncthreads() is required here. It sits outside the bounds check
 	// so that every thread of the block reaches it.
 	__syncthreads();
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		// 64-bit index so that (thread << 3) cannot overflow 32 bits
 		uint64_t hashPosition = thread;
 		uint32_t *pHash = (uint32_t*)&g_hash[hashPosition<<3];
 		cuda_echo_round_80(sharedMemory, c_PaddedMessage80, startNonce + thread, pHash);
 	}
 }
 /**
  * Host launcher for x16_echo512_gpu_hash_80.
  * Uses 128 threads per block and enough blocks to cover `threads` hashes.
  * thr_id is not used; the launch goes to the default stream without error check.
  */
 __host__
 void x16_echo512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash)
 {
 	const uint32_t tpb = 128;
 	// ceil(threads / tpb)
 	const uint32_t blocks = (threads + tpb-1)/tpb;
 	dim3 block(tpb);
 	dim3 grid(blocks);
 	x16_echo512_gpu_hash_80<<<grid, block>>>(threads, startNonce, (uint64_t*)d_hash);
 }
--- a/x16/cuda_x16_echo512_64.cu
+++ b/x16/cuda_x16_echo512_64.cu
@ -0,0 +1,248 @@
 /**
 * Echo512-64 kernel for maxwell, based on alexis work
 */
 #include <cuda_helper.h>
 #include <cuda_vector_uint2x4.h>
 #include <cuda_vectors.h>
 #define INTENSIVE_GMF
 #include "tribus/cuda_echo512_aes.cuh"
 #ifdef __INTELLISENSE__
 #define __byte_perm(x, y, b) x
 #define atomicExch(p,y) (*p) = y
 #endif
 /**
  * One round over the 64-word state W: sub-words, shift rows, mix columns.
  * sharedMemory : four 256-entry lookup tables forwarded to AES_2ROUND.
  * W            : 64 x uint32_t state, treated as 16 groups of 4 words; updated in place.
  * k0           : passed by reference to AES_2ROUND for each of the 16 groups.
  */
 __device__
 static void echo_round_alexis(const uint32_t sharedMemory[4][256], uint32_t *W, uint32_t &k0)
 {
 	// Big Sub Words
 	#pragma unroll 16
 	for (int idx = 0; idx < 16; idx++)
 		AES_2ROUND(sharedMemory,W[(idx<<2) + 0], W[(idx<<2) + 1], W[(idx<<2) + 2], W[(idx<<2) + 3], k0);
 	// Shift Rows
 	// In-place permutation of the 4-word groups; the statement order matters
 	// because only four values are saved in t[] before being overwritten.
 	#pragma unroll 4
 	for (int i = 0; i < 4; i++){
 		uint32_t t[4];
 		/// 1, 5, 9, 13
 		t[0] = W[i+ 4];
 		t[1] = W[i+ 8];
 		t[2] = W[i+24];
 		t[3] = W[i+60];
 		W[i + 4] = W[i + 20];
 		W[i + 8] = W[i + 40];
 		W[i +24] = W[i + 56];
 		W[i +60] = W[i + 44];
 		W[i +20] = W[i +36];
 		W[i +40] = t[1];
 		W[i +56] = t[2];
 		W[i +44] = W[i +28];
 		W[i +28] = W[i +12];
 		W[i +12] = t[3];
 		W[i +36] = W[i +52];
 		W[i +52] = t[0];
 	}
 	// Mix Columns
 	#pragma unroll 4
 	for (int i = 0; i < 4; i++){ // loop over 2*uint32_t each
 		#pragma unroll 4
 		for (int idx = 0; idx < 64; idx += 16){ // loop over the elements
 			uint32_t a[4];
 			a[0] = W[idx + i];
 			a[1] = W[idx + i + 4];
 			a[2] = W[idx + i + 8];
 			a[3] = W[idx + i +12];
 			uint32_t ab = a[0] ^ a[1];
 			uint32_t bc = a[1] ^ a[2];
 			uint32_t cd = a[2] ^ a[3];
 			uint32_t t, t2, t3;
 			// top bit of each of the four packed bytes
 			t = (ab & 0x80808080);
 			t2 = (bc & 0x80808080);
 			t3 = (cd & 0x80808080);
 			// per-byte doubling: shift the low 7 bits left and XOR 27 (0x1b)
 			// into every byte whose top bit was set
 			uint32_t abx = (t  >> 7) * 27U ^ ((ab^t) << 1);
 			uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
 			uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
 			W[idx + i] = bc ^ a[3] ^ abx;
 			W[idx + i + 4] = a[0] ^ cd ^ bcx;
 			W[idx + i + 8] = ab ^ a[3] ^ cdx;
 			W[idx + i +12] = ab ^ a[2] ^ (abx ^ bcx ^ cdx);
 		}
 	}
 }
 /**
  * Echo-512 over a 64-byte input, in place: each thread reads its 16-word
  * hash from g_hash and overwrites it with the result.
  * threads : number of hashes; extra threads only help fill shared memory.
  * g_hash  : 16 x uint32_t per thread, accessed as uint2x4 vectors
  *           (so each 64-byte slot must be suitably aligned for uint2x4).
  * Launch with 128 threads per block (aes_gpu_init128 — presumably requires
  * exactly that block size; confirm against tribus/cuda_echo512_aes.cuh).
  * The first round is partly precomputed: P[] holds the constant words that
  * are combined with the AES-processed input in the first Mix Columns step.
  */
 __global__ __launch_bounds__(128, 5) /* will force 80 registers */
 static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash)
 {
 	__shared__ uint32_t sharedMemory[4][256];
 	aes_gpu_init128(sharedMemory);
 	// Block-wide barrier before anyone reads the shared tables. It must be
 	// executed by every thread of the block, so it sits outside the bounds
 	// check below: a barrier inside a divergent branch is undefined behaviour
 	// when the last block is only partially active.
 	__syncthreads();
 	const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		uint32_t k0;
 		uint32_t h[16];
 		uint32_t hash[16];
 		// size_t index so that (thread << 4) cannot overflow 32 bits
 		uint32_t *Hash = &g_hash[(size_t)thread << 4];
 		*(uint2x4*)&h[ 0] = __ldg4((uint2x4*)&Hash[ 0]);
 		*(uint2x4*)&h[ 8] = __ldg4((uint2x4*)&Hash[ 8]);
 		// keep an untouched copy of the input for the final XOR
 		*(uint2x4*)&hash[ 0] = *(uint2x4*)&h[ 0];
 		*(uint2x4*)&hash[ 8] = *(uint2x4*)&h[ 8];
 		const uint32_t P[48] = {
 			0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
 			//8-12
 			0x01425eb8, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x65978b09, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
 			//21-25
 			0x2cb6b661, 0x6b23b3b3, 0xcf93a7cf, 0x9d9d3751, 0x9ac2dea3, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
 			//34-38
 			0x579f9f33, 0xfbfbfbfb, 0xfbfbfbfb, 0xefefd3c7, 0xdbfde1dd, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
 			0x34514d9e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xb134347e, 0xea6f7e7e, 0xbd7731bd, 0x8a8a1968,
 			0x14b8a457, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0x265f4382, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af
 			//58-61
 		};
 		k0 = 520;
 		// sub-words step of the first round, applied to the four input groups only
 		#pragma unroll 4
 		for (uint32_t idx = 0; idx < 16; idx += 4) {
 			AES_2ROUND(sharedMemory, h[idx + 0], h[idx + 1], h[idx + 2], h[idx + 3], k0);
 		}
 		k0 += 4;
 		uint32_t W[64];
 		// first-round Mix Columns: each column mixes three constants from P[]
 		// with one processed input group from h[]
 		#pragma unroll 4
 		for (uint32_t i = 0; i < 4; i++)
 		{
 			uint32_t a = P[i];
 			uint32_t b = P[i + 4];
 			uint32_t c = h[i + 8];
 			uint32_t d = P[i + 8];
 			uint32_t ab = a ^ b;
 			uint32_t bc = b ^ c;
 			uint32_t cd = c ^ d;
 			uint32_t t =  (ab & 0x80808080);
 			uint32_t t2 = (bc & 0x80808080);
 			uint32_t t3 = (cd & 0x80808080);
 			uint32_t abx = (t  >> 7) * 27U ^ ((ab^t) << 1);
 			uint32_t bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
 			uint32_t cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
 			W[i] = abx ^ bc ^ d;
 			W[i + 4] = bcx ^ a ^ cd;
 			W[i + 8] = cdx ^ ab ^ d;
 			W[i +12] = abx ^ bcx ^ cdx ^ ab ^ c;
 			a = P[i +12];
 			b = h[i + 4];
 			c = P[i +16];
 			d = P[i +20];
 			ab = a ^ b;
 			bc = b ^ c;
 			cd = c ^ d;
 			t = (ab & 0x80808080);
 			t2 = (bc & 0x80808080);
 			t3 = (cd & 0x80808080);
 			abx = (t >> 7) * 27U ^ ((ab^t) << 1);
 			bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
 			cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
 			W[16 + i] = bc ^ d ^ abx;
 			W[16 + i + 4] = a ^ cd ^ bcx;
 			W[16 + i + 8] = d ^ ab ^ cdx;
 			W[16 + i + 12] = c ^ ab ^ abx ^ bcx ^ cdx;
 			a = h[i];
 			b = P[24 + i + 0];
 			c = P[24 + i + 4];
 			d = P[24 + i + 8];
 			ab = a ^ b;
 			bc = b ^ c;
 			cd = c ^ d;
 			t = (ab & 0x80808080);
 			t2 = (bc & 0x80808080);
 			t3 = (cd & 0x80808080);
 			abx = (t >> 7) * 27U ^ ((ab^t) << 1);
 			bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
 			cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
 			W[32 + i] = bc ^ d ^ abx;
 			W[32 + i + 4] = a ^ cd ^ bcx;
 			W[32 + i + 8] = d ^ ab ^ cdx;
 			W[32 + i + 12] = c ^ ab ^ abx ^ bcx ^ cdx;
 			a = P[36 + i ];
 			b = P[36 + i + 4];
 			c = P[36 + i + 8];
 			d = h[i + 12];
 			ab = a ^ b;
 			bc = b ^ c;
 			cd = c ^ d;
 			t = (ab & 0x80808080);
 			t2 = (bc & 0x80808080);
 			t3 = (cd & 0x80808080);
 			abx = (t >> 7) * 27U ^ ((ab^t) << 1);
 			bcx = (t2 >> 7) * 27U ^ ((bc^t2) << 1);
 			cdx = (t3 >> 7) * 27U ^ ((cd^t3) << 1);
 			W[48 + i] = bc ^ d ^ abx;
 			W[48 + i + 4] = a ^ cd ^ bcx;
 			W[48 + i + 8] = d ^ ab ^ cdx;
 			W[48 + i + 12] = c ^ ab ^ abx ^ bcx ^ cdx;
 		}
 		// remaining nine full rounds
 		for (int k = 1; k < 10; k++)
 			echo_round_alexis(sharedMemory,W,k0);
 		// fold the two state halves together; 512 is XORed into the first word of each group
 		#pragma unroll 4
 		for (int i = 0; i < 16; i += 4)
 		{
 			W[i] ^= W[32 + i] ^ 512;
 			W[i + 1] ^= W[32 + i + 1];
 			W[i + 2] ^= W[32 + i + 2];
 			W[i + 3] ^= W[32 + i + 3];
 		}
 		// feed-forward with the saved input and store the result in place
 		*(uint2x4*)&Hash[ 0] = *(uint2x4*)&hash[ 0] ^ *(uint2x4*)&W[ 0];
 		*(uint2x4*)&Hash[ 8] = *(uint2x4*)&hash[ 8] ^ *(uint2x4*)&W[ 8];
 	}
 }
 /**
  * Host launcher for x16_echo512_gpu_hash_64 (in-place hash of d_hash).
  * Uses 128 threads per block and enough blocks to cover `threads` hashes.
  * thr_id is not used; the launch goes to the default stream without error check.
  */
 __host__
 void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash)
 {
 	const uint32_t tpb = 128;
 	// ceil(threads / tpb)
 	const uint32_t blocks = (threads + tpb-1)/tpb;
 	dim3 block(tpb);
 	dim3 grid(blocks);
 	x16_echo512_gpu_hash_64<<<grid, block>>>(threads, d_hash);
 }
--- a/x16/cuda_x16_fugue512.cu
+++ b/x16/cuda_x16_fugue512.cu
@ -0,0 +1,467 @@
 #include <stdio.h>
 #include <cuda_helper.h>
 #define TPB 256
 /*
 * fugue512-80 x16r kernel implementation.
 *
 * ==========================(LICENSE BEGIN)============================
 *
 * Copyright (c) 2018 tpruvot
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * ===========================(LICENSE END)=============================
 */
 // Editor-only shim: this section is compiled only when __INTELLISENSE__ is
 // defined. It replaces __byte_perm and tex1Dfetch with plain expressions and
 // defines __CUDACC__ before pulling in the texture type header, so the code
 // below can be parsed without the real CUDA intrinsics.
 #ifdef __INTELLISENSE__
 #define __byte_perm(x, y, m) (x|y)
 #define tex1Dfetch(t, n) (n)
 #define __CUDACC__
 #include <cuda_texture_types.h>
 #endif
 // Device addresses of the texture backing buffers, one slot per GPU thread id.
 // Slot [thr_id][0] is written by the texDef macro and released by
 // x16_fugue512_cpu_free.
 static unsigned int* d_textures[MAX_GPUS][1];
 // Accessors for the four 256-entry lookup tables packed back to back in an
 // array named `mixtabs` that must be in scope at the point of use (the kernel
 // declares it in shared memory). Table k starts at offset k*256.
 // These are function-like macros, so the identifier `mixtab0` written without
 // a following '(' still refers to the host array defined below.
 #define mixtab0(x) mixtabs[(x)]
 #define mixtab1(x) mixtabs[(x)+256]
 #define mixtab2(x) mixtabs[(x)+512]
 #define mixtab3(x) mixtabs[(x)+768]
 // Texture reference through which the kernel fetches the base table;
 // bound to a device copy of mixtab0 by texDef.
 static texture<unsigned int, 1, cudaReadModeElementType> mixTab0Tex;
 // Host copy of the base mixing table: 256 32-bit entries (32 rows of 8).
 // x16_fugue512_cpu_init uploads it to the device and binds it to mixTab0Tex;
 // the kernel derives the other three tables from it with byte rotations
 // (ROR8, ROL16, ROL8) while filling shared memory.
 static const uint32_t mixtab0[] = {
 	0x63633297, 0x7c7c6feb, 0x77775ec7, 0x7b7b7af7, 0xf2f2e8e5, 0x6b6b0ab7, 0x6f6f16a7, 0xc5c56d39,
 	0x303090c0, 0x01010704, 0x67672e87, 0x2b2bd1ac, 0xfefeccd5, 0xd7d71371, 0xabab7c9a, 0x767659c3,
 	0xcaca4005, 0x8282a33e, 0xc9c94909, 0x7d7d68ef, 0xfafad0c5, 0x5959947f, 0x4747ce07, 0xf0f0e6ed,
 	0xadad6e82, 0xd4d41a7d, 0xa2a243be, 0xafaf608a, 0x9c9cf946, 0xa4a451a6, 0x727245d3, 0xc0c0762d,
 	0xb7b728ea, 0xfdfdc5d9, 0x9393d47a, 0x2626f298, 0x363682d8, 0x3f3fbdfc, 0xf7f7f3f1, 0xcccc521d,
 	0x34348cd0, 0xa5a556a2, 0xe5e58db9, 0xf1f1e1e9, 0x71714cdf, 0xd8d83e4d, 0x313197c4, 0x15156b54,
 	0x04041c10, 0xc7c76331, 0x2323e98c, 0xc3c37f21, 0x18184860, 0x9696cf6e, 0x05051b14, 0x9a9aeb5e,
 	0x0707151c, 0x12127e48, 0x8080ad36, 0xe2e298a5, 0xebeba781, 0x2727f59c, 0xb2b233fe, 0x757550cf,
 	0x09093f24, 0x8383a43a, 0x2c2cc4b0, 0x1a1a4668, 0x1b1b416c, 0x6e6e11a3, 0x5a5a9d73, 0xa0a04db6,
 	0x5252a553, 0x3b3ba1ec, 0xd6d61475, 0xb3b334fa, 0x2929dfa4, 0xe3e39fa1, 0x2f2fcdbc, 0x8484b126,
 	0x5353a257, 0xd1d10169, 0x00000000, 0xededb599, 0x2020e080, 0xfcfcc2dd, 0xb1b13af2, 0x5b5b9a77,
 	0x6a6a0db3, 0xcbcb4701, 0xbebe17ce, 0x3939afe4, 0x4a4aed33, 0x4c4cff2b, 0x5858937b, 0xcfcf5b11,
 	0xd0d0066d, 0xefefbb91, 0xaaaa7b9e, 0xfbfbd7c1, 0x4343d217, 0x4d4df82f, 0x333399cc, 0x8585b622,
 	0x4545c00f, 0xf9f9d9c9, 0x02020e08, 0x7f7f66e7, 0x5050ab5b, 0x3c3cb4f0, 0x9f9ff04a, 0xa8a87596,
 	0x5151ac5f, 0xa3a344ba, 0x4040db1b, 0x8f8f800a, 0x9292d37e, 0x9d9dfe42, 0x3838a8e0, 0xf5f5fdf9,
 	0xbcbc19c6, 0xb6b62fee, 0xdada3045, 0x2121e784, 0x10107040, 0xffffcbd1, 0xf3f3efe1, 0xd2d20865,
 	0xcdcd5519, 0x0c0c2430, 0x1313794c, 0xececb29d, 0x5f5f8667, 0x9797c86a, 0x4444c70b, 0x1717655c,
 	0xc4c46a3d, 0xa7a758aa, 0x7e7e61e3, 0x3d3db3f4, 0x6464278b, 0x5d5d886f, 0x19194f64, 0x737342d7,
 	0x60603b9b, 0x8181aa32, 0x4f4ff627, 0xdcdc225d, 0x2222ee88, 0x2a2ad6a8, 0x9090dd76, 0x88889516,
 	0x4646c903, 0xeeeebc95, 0xb8b805d6, 0x14146c50, 0xdede2c55, 0x5e5e8163, 0x0b0b312c, 0xdbdb3741,
 	0xe0e096ad, 0x32329ec8, 0x3a3aa6e8, 0x0a0a3628, 0x4949e43f, 0x06061218, 0x2424fc90, 0x5c5c8f6b,
 	0xc2c27825, 0xd3d30f61, 0xacac6986, 0x62623593, 0x9191da72, 0x9595c662, 0xe4e48abd, 0x797974ff,
 	0xe7e783b1, 0xc8c84e0d, 0x373785dc, 0x6d6d18af, 0x8d8d8e02, 0xd5d51d79, 0x4e4ef123, 0xa9a97292,
 	0x6c6c1fab, 0x5656b943, 0xf4f4fafd, 0xeaeaa085, 0x6565208f, 0x7a7a7df3, 0xaeae678e, 0x08083820,
 	0xbaba0bde, 0x787873fb, 0x2525fb94, 0x2e2ecab8, 0x1c1c5470, 0xa6a65fae, 0xb4b421e6, 0xc6c66435,
 	0xe8e8ae8d, 0xdddd2559, 0x747457cb, 0x1f1f5d7c, 0x4b4bea37, 0xbdbd1ec2, 0x8b8b9c1a, 0x8a8a9b1e,
 	0x70704bdb, 0x3e3ebaf8, 0xb5b526e2, 0x66662983, 0x4848e33b, 0x0303090c, 0xf6f6f4f5, 0x0e0e2a38,
 	0x61613c9f, 0x35358bd4, 0x5757be47, 0xb9b902d2, 0x8686bf2e, 0xc1c17129, 0x1d1d5374, 0x9e9ef74e,
 	0xe1e191a9, 0xf8f8decd, 0x9898e556, 0x11117744, 0x696904bf, 0xd9d93949, 0x8e8e870e, 0x9494c166,
 	0x9b9bec5a, 0x1e1e5a78, 0x8787b82a, 0xe9e9a989, 0xcece5c15, 0x5555b04f, 0x2828d8a0, 0xdfdf2b51,
 	0x8c8c8906, 0xa1a14ab2, 0x89899212, 0x0d0d2334, 0xbfbf10ca, 0xe6e684b5, 0x4242d513, 0x686803bb,
 	0x4141dc1f, 0x9999e252, 0x2d2dc3b4, 0x0f0f2d3c, 0xb0b03df6, 0x5454b74b, 0xbbbb0cda, 0x16166258
 };
 /*
  * TIX4: inject one message word into the state.
  *   msg              : message word (evaluated once)
  *   s00 .. s30       : state words, named after their position relative to
  *                      the current origin of the 36-word state
  * Order matters: s22 is mixed with the OLD s00, and s08 with the NEW s00.
  */
 #define TIX4(msg, s00, s01, s04, s07, s08, s22, s24, s27, s30) { \
 	s22 ^= s00;   /* old s00 folded into s22 */ \
 	s00 = (msg);  /* message word replaces s00 */ \
 	s08 ^= s00;   /* new s00 folded into s08 */ \
 	s01 ^= s24; \
 	s04 ^= s27; \
 	s07 ^= s30; \
 }
 /*
  * CMIX36: XOR the three source words (src0..src2) into two groups of three
  * destination words (lo0..lo2 and hi0..hi2). The source words themselves
  * are left unchanged.
  */
 #define CMIX36(lo0, lo1, lo2, src0, src1, src2, hi0, hi1, hi2) { \
 	lo0 ^= src0; \
 	lo1 ^= src1; \
 	lo2 ^= src2; \
 	hi0 ^= src0; \
 	hi1 ^= src1; \
 	hi2 ^= src2; \
 }
 /*
  * SMIX: table-driven mixing of four state words, updated in place.
  *
  * For each input word xi, the four bytes (most significant first) index
  * mixtab0..mixtab3. All four lookups of word xi are XORed into ci.
  * A lookup from table j is additionally XORed into rj, except when j == i
  * (r0 skips x0's lookup, r1 skips x1's, and so on).
  * The outputs are then assembled byte by byte from (c ^ shifted r) terms.
  *
  * The macro declares its own locals (tmp, r0..r3, c0..c3) inside a brace
  * block, so an outer variable named `tmp` is shadowed, not modified.
  * It requires `mixtabs` to be in scope (see the mixtabN accessors).
  */
 #define SMIX(x0, x1, x2, x3) { \
 	uint32_t tmp; \
 	uint32_t r0 = 0; \
 	uint32_t r1 = 0; \
 	uint32_t r2 = 0; \
 	uint32_t r3 = 0; \
 	uint32_t c0 = mixtab0(x0 >> 24); \
 	tmp = mixtab1((x0 >> 16) & 0xFF); \
 	c0 ^= tmp; \
 	r1 ^= tmp; \
 	tmp = mixtab2((x0 >>  8) & 0xFF); \
 	c0 ^= tmp; \
 	r2 ^= tmp; \
 	tmp = mixtab3(x0 & 0xFF); \
 	c0 ^= tmp; \
 	r3 ^= tmp; \
 	tmp = mixtab0(x1 >> 24); \
 	uint32_t c1 = tmp; \
 	r0 ^= tmp; \
 	tmp = mixtab1((x1 >> 16) & 0xFF); \
 	c1 ^= tmp; \
 	tmp = mixtab2((x1 >>  8) & 0xFF); \
 	c1 ^= tmp; \
 	r2 ^= tmp; \
 	tmp = mixtab3(x1 & 0xFF); \
 	c1 ^= tmp; \
 	r3 ^= tmp; \
 	tmp = mixtab0(x2 >> 24); \
 	uint32_t c2 = tmp; \
 	r0 ^= tmp; \
 	tmp = mixtab1((x2 >> 16) & 0xFF); \
 	c2 ^= tmp; \
 	r1 ^= tmp; \
 	tmp = mixtab2((x2 >>  8) & 0xFF); \
 	c2 ^= tmp; \
 	tmp = mixtab3(x2 & 0xFF); \
 	c2 ^= tmp; \
 	r3 ^= tmp; \
 	tmp = mixtab0(x3 >> 24); \
 	uint32_t c3 = tmp; \
 	r0 ^= tmp; \
 	tmp = mixtab1((x3 >> 16) & 0xFF); \
 	c3 ^= tmp; \
 	r1 ^= tmp; \
 	tmp = mixtab2((x3 >>  8) & 0xFF); \
 	c3 ^= tmp; \
 	r2 ^= tmp; \
 	tmp = mixtab3(x3 & 0xFF); \
 	c3 ^= tmp; \
 	x0 = ((c0 ^ r0) & 0xFF000000) | ((c1 ^ r1) & 0x00FF0000) \
 		| ((c2 ^ r2) & 0x0000FF00) | ((c3 ^ r3) & 0x000000FF); \
 	x1 = ((c1 ^ (r0 <<  8)) & 0xFF000000) | ((c2 ^ (r1 <<  8)) & 0x00FF0000) \
 		| ((c3 ^ (r2 <<  8)) & 0x0000FF00) | ((c0 ^ (r3 >> 24)) & 0x000000FF); \
 	x2 = ((c2 ^ (r0 << 16)) & 0xFF000000) | ((c3 ^ (r1 << 16)) & 0x00FF0000) \
 		| ((c0 ^ (r2 >> 16)) & 0x0000FF00) | ((c1 ^ (r3 >> 16)) & 0x000000FF); \
 	x3 = ((c3 ^ (r0 << 24)) & 0xFF000000) | ((c0 ^ (r1 >>  8)) & 0x00FF0000) \
 		| ((c1 ^ (r2 >>  8)) & 0x0000FF00) | ((c2 ^ (r3 >>  8)) & 0x000000FF); \
 }
 /*
  * SUB_ROR3: rotate the 36-word state S00..S35 by three positions so that
  * each word moves to the index three higher (S00 -> S03, ..., S32 -> S35)
  * and the top three words wrap around (S33 -> S00, S34 -> S01, S35 -> S02).
  * Uses B33..B35 as scratch; S00..S35 and B33..B35 must be declared at the
  * point of use. Assignments run from high index to low so nothing is
  * overwritten before it is read.
  */
 #define SUB_ROR3 { \
 	B33 = S33, B34 = S34, B35 = S35; \
 	S35 = S32; S34 = S31; S33 = S30; S32 = S29; S31 = S28; S30 = S27; S29 = S26; S28 = S25; S27 = S24; \
 	S26 = S23; S25 = S22; S24 = S21; S23 = S20; S22 = S19; S21 = S18; S20 = S17; S19 = S16; S18 = S15; \
 	S17 = S14; S16 = S13; S15 = S12; S14 = S11; S13 = S10; S12 = S09; S11 = S08; S10 = S07; S09 = S06; \
 	S08 = S05; S07 = S04; S06 = S03; S05 = S02; S04 = S01; S03 = S00; S02 = B35; S01 = B34; S00 = B33; \
 }
 /*
  * SUB_ROR8: rotate the 36-word state by eight positions (S00 -> S08, ...,
  * S27 -> S35) with the top eight words wrapping to S00..S07.
  * Uses B28..B35 as scratch; S00..S35 and B28..B35 must be in scope.
  */
 #define SUB_ROR8 { \
 	B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \
 	S35 = S27; S34 = S26; S33 = S25; S32 = S24; S31 = S23; S30 = S22; S29 = S21; S28 = S20; S27 = S19; \
 	S26 = S18; S25 = S17; S24 = S16; S23 = S15; S22 = S14; S21 = S13; S20 = S12; S19 = S11; S18 = S10; \
 	S17 = S09; S16 = S08; S15 = S07; S14 = S06; S13 = S05; S12 = S04; S11 = S03; S10 = S02; S09 = S01; \
 	S08 = S00; S07 = B35; S06 = B34; S05 = B33; S04 = B32; S03 = B31; S02 = B30; S01 = B29; S00 = B28; \
 }
 /*
  * SUB_ROR9: rotate the 36-word state by nine positions (S00 -> S09, ...,
  * S26 -> S35) with the top nine words wrapping to S00..S08.
  * Uses B27..B35 as scratch; S00..S35 and B27..B35 must be in scope.
  */
 #define SUB_ROR9 { \
 	B27 = S27, B28 = S28, B29 = S29, B30 = S30, B31 = S31, B32 = S32, B33 = S33, B34 = S34, B35 = S35; \
 	S35 = S26; S34 = S25; S33 = S24; S32 = S23; S31 = S22; S30 = S21; S29 = S20; S28 = S19; S27 = S18; \
 	S26 = S17; S25 = S16; S24 = S15; S23 = S14; S22 = S13; S21 = S12; S20 = S11; S19 = S10; S18 = S09; \
 	S17 = S08; S16 = S07; S15 = S06; S14 = S05; S13 = S04; S12 = S03; S11 = S02; S10 = S01; S09 = S00; \
 	S08 = B35; S07 = B34; S06 = B33; S05 = B32; S04 = B31; S03 = B30; S02 = B29; S01 = B28; S00 = B27; \
 }
 /*
  * SUB_ROR9_3: nine-position rotation expressed as three SUB_ROR3 steps.
  * Only needs the B33..B35 scratch words that SUB_ROR3 uses.
  */
 #define SUB_ROR9_3 { \
 	SUB_ROR3; SUB_ROR3; SUB_ROR3; \
 }
 /*
  * SUB_ROR12: marked "to fix" by the author.
  * As written it saves S00..S11 in B24..B35, moves S12..S35 DOWN by twelve
  * positions (S12 -> S00, ..., S35 -> S23) and puts the saved words in
  * S24..S35.
  * NOTE(review): that is the opposite direction to SUB_ROR3/8/9, where words
  * move to higher indices (S00 -> S03 etc.) — confirm the intended direction
  * before using this macro.
  * NOTE(review): it needs scratch words B24..B35, but the kernel in this file
  * declares only B27..B35 (B24..B26 appear in a commented-out line), so using
  * it there would not compile as is.
  */
 #define SUB_ROR12 { /* to fix */ \
 	B24 = S00; B25 = S01; B26 = S02; B27 = S03; B28 = S04; B29 = S05; B30 = S06; B31 = S07; B32 = S08; B33 = S09; B34 = S10; B35 = S11; \
 	S00 = S12; S01 = S13; S02 = S14; S03 = S15; S04 = S16; S05 = S17; S06 = S18; S07 = S19; S08 = S20; S09 = S21; S10 = S22; S11 = S23; \
 	S12 = S24; S13 = S25; S14 = S26; S15 = S27; S16 = S28; S17 = S29; S18 = S30; S19 = S31; S20 = S32; S21 = S33; S22 = S34; S23 = S35; \
 	S24 = B24; S25 = B25; S26 = B26; S27 = B27; S28 = B28; S29 = B29; S30 = B30; S31 = B31; S32 = B32; S33 = B33; S34 = B34; S35 = B35; \
 }
 /*
  * FUGUE512_3: absorb three message words (x, y, z) into the state S00..S35.
  *
  * Each word goes through one TIX4 followed by four CMIX36 + SMIX rounds.
  * The state is not physically rotated between rounds: each successive round
  * names registers three positions lower, and each word's section starts
  * twelve positions lower than the previous one (S00.., then S24.., then
  * S12..). After all three words the naming is back at S00, so the macro
  * can be invoked repeatedly.
  * Requires S00..S35 and `mixtabs` in scope.
  */
 #define FUGUE512_3(x, y, z) { \
 	TIX4(x, S00, S01, S04, S07, S08, S22, S24, S27, S30); \
 	CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \
 	SMIX(S33, S34, S35, S00); \
 	CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \
 	SMIX(S30, S31, S32, S33); \
 	CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \
 	SMIX(S27, S28, S29, S30); \
 	CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \
 	SMIX(S24, S25, S26, S27); \
 	\
 	TIX4(y, S24, S25, S28, S31, S32, S10, S12, S15, S18); \
 	CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); \
 	SMIX(S21, S22, S23, S24); \
 	CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); \
 	SMIX(S18, S19, S20, S21); \
 	CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); \
 	SMIX(S15, S16, S17, S18); \
 	CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); \
 	SMIX(S12, S13, S14, S15); \
 	\
 	TIX4(z, S12, S13, S16, S19, S20, S34, S00, S03, S06); \
 	CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); \
 	SMIX(S09, S10, S11, S12); \
 	CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); \
 	SMIX(S06, S07, S08, S09); \
 	CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); \
 	SMIX(S03, S04, S05, S06); \
 	CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); \
 	SMIX(S00, S01, S02, S03); \
 }
 /*
  * FUGUE512_F: absorb four words (w, x, y, z) into the state S00..S35.
  *
  * The first three words are processed exactly like FUGUE512_3; the fourth
  * repeats the first section's register naming (starting at S00 again).
  * Because that is one section past a full cycle, the state is left with a
  * register-naming offset that the caller has to compensate for (the kernel
  * in this file does so with SUB_ROR3 + SUB_ROR9 right after this macro).
  * Requires S00..S35 and `mixtabs` in scope.
  */
 #define FUGUE512_F(w, x, y, z) { \
 	TIX4(w, S00, S01, S04, S07, S08, S22, S24, S27, S30); \
 	CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \
 	SMIX(S33, S34, S35, S00); \
 	CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \
 	SMIX(S30, S31, S32, S33); \
 	CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \
 	SMIX(S27, S28, S29, S30); \
 	CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \
 	SMIX(S24, S25, S26, S27); \
 	\
 	TIX4(x, S24, S25, S28, S31, S32, S10, S12, S15, S18); \
 	CMIX36(S21, S22, S23, S25, S26, S27, S03, S04, S05); \
 	SMIX(S21, S22, S23, S24); \
 	CMIX36(S18, S19, S20, S22, S23, S24, S00, S01, S02); \
 	SMIX(S18, S19, S20, S21); \
 	CMIX36(S15, S16, S17, S19, S20, S21, S33, S34, S35); \
 	SMIX(S15, S16, S17, S18); \
 	CMIX36(S12, S13, S14, S16, S17, S18, S30, S31, S32); \
 	SMIX(S12, S13, S14, S15); \
 	\
 	TIX4(y, S12, S13, S16, S19, S20, S34, S00, S03, S06); \
 	CMIX36(S09, S10, S11, S13, S14, S15, S27, S28, S29); \
 	SMIX(S09, S10, S11, S12); \
 	CMIX36(S06, S07, S08, S10, S11, S12, S24, S25, S26); \
 	SMIX(S06, S07, S08, S09); \
 	CMIX36(S03, S04, S05, S07, S08, S09, S21, S22, S23); \
 	SMIX(S03, S04, S05, S06); \
 	CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20); \
 	SMIX(S00, S01, S02, S03); \
 	\
 	TIX4(z, S00, S01, S04, S07, S08, S22, S24, S27, S30); \
 	CMIX36(S33, S34, S35, S01, S02, S03, S15, S16, S17); \
 	SMIX(S33, S34, S35, S00); \
 	CMIX36(S30, S31, S32, S34, S35, S00, S12, S13, S14); \
 	SMIX(S30, S31, S32, S33); \
 	CMIX36(S27, S28, S29, S31, S32, S33, S09, S10, S11); \
 	SMIX(S27, S28, S29, S30); \
 	CMIX36(S24, S25, S26, S28, S29, S30, S06, S07, S08); \
 	SMIX(S24, S25, S26, S27); \
 }
 // Byte-granular 32-bit rotations.
 // Device pass (__CUDA_ARCH__ defined): implemented with __byte_perm, whose
 // selector nibbles name the source byte for each result byte (lowest nibble
 // = lowest result byte).
 // Host pass: mapped to the generic ROTL32 / ROTR32 helpers.
 // Any ROL8 macro defined earlier is dropped first.
 #undef ROL8
 #ifdef __CUDA_ARCH__
 // rotate left by 8 bits: result bytes (hi..lo) = source bytes 2,1,0,3
 __device__ __forceinline__
 uint32_t ROL8(const uint32_t word)
 {
 	return __byte_perm(word, 0, 0x2103);
 }
 // rotate right by 8 bits: result bytes (hi..lo) = source bytes 0,3,2,1
 __device__ __forceinline__
 uint32_t ROR8(const uint32_t word)
 {
 	return __byte_perm(word, 0, 0x0321);
 }
 // rotate by 16 bits (swap the two halves): result bytes (hi..lo) = 1,0,3,2
 __device__ __forceinline__
 uint32_t ROL16(const uint32_t word)
 {
 	return __byte_perm(word, 0, 0x1032);
 }
 #else
 #define ROL8(v)  ROTL32(v, 8)
 #define ROR8(v)  ROTR32(v, 8)
 #define ROL16(v) ROTL32(v,16)
 #endif
 //#define AS_UINT4(addr) *((uint4*)(addr))
 // Block header input for the 80-byte kernel, kept in constant memory
 // (10 x 64-bit words = 80 bytes).
 __constant__ static uint64_t c_PaddedMessage80[10];
 // Upload the block header to the device.
 //   pdata : host pointer; sizeof(c_PaddedMessage80) bytes are read from it
 // The result of cudaMemcpyToSymbol is not checked.
 __host__
 void x16_fugue512_setBlock_80(void *pdata)
 {
 	const size_t headerBytes = sizeof(c_PaddedMessage80);
 	cudaMemcpyToSymbol(c_PaddedMessage80, pdata, headerBytes, 0, cudaMemcpyHostToDevice);
 }
 /***************************************************/
 /*
  * Fugue-512 over an 80-byte input, one hash per thread.
  *
  *   threads    : number of hashes to compute; threads with a global index
  *                >= threads only help fill shared memory and then idle
  *   startNonce : thread t hashes the header with word 19 replaced by
  *                startNonce + t
  *   g_hash     : output buffer, 8 x uint64_t (64 bytes) per thread,
  *                written at index thread * 8
  *
  * Launch layout: 1D grid of 1D blocks, at most TPB threads per block.
  * Shared memory: 4 KB static (four 256-entry tables).
  * Preconditions: mixTab0Tex bound (x16_fugue512_cpu_init) and
  * c_PaddedMessage80 uploaded (x16_fugue512_setBlock_80).
  */
 __global__
 __launch_bounds__(TPB)
 void x16_fugue512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash)
 {
 	__shared__ uint32_t mixtabs[1024];
 	// load shared mem (with 256 threads): each thread fetches one base-table
 	// entry and stores it plus its three byte-rotated variants
 	const uint32_t thr = threadIdx.x & 0xFF;
 	const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr);
 	mixtabs[thr] = tmp;
 	mixtabs[thr+256] = ROR8(tmp);
 	mixtabs[thr+512] = ROL16(tmp);
 	mixtabs[thr+768] = ROL8(tmp);
 #if TPB <= 256
 	// smaller blocks: every thread also fills the entry 0x80 above its own.
 	// NOTE(review): this covers all 256 entries only when blockDim.x >= 128.
 	if (blockDim.x < 256) {
 		const uint32_t thr = (threadIdx.x + 0x80) & 0xFF;
 		const uint32_t tmp = tex1Dfetch(mixTab0Tex, thr);
 		mixtabs[thr] = tmp;
 		mixtabs[thr + 256] = ROR8(tmp);
 		mixtabs[thr + 512] = ROL16(tmp);
 		mixtabs[thr + 768] = ROL8(tmp);
 	}
 #endif
 	// barrier is outside the bounds check so every thread of the block reaches it
 	__syncthreads();
 	uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
 	if (thread < threads)
 	{
 		// copy the 80-byte header from constant memory, 8 bytes at a time
 		uint32_t Data[20];
 		#pragma unroll
 		for(int i = 0; i < 10; i++)
 			AS_UINT2(&Data[i * 2]) = AS_UINT2(&c_PaddedMessage80[i]);
 		// last header word is the per-thread nonce
 		Data[19] = (startNonce + thread);
 		// 36-word state, plus scratch words used by the SUB_ROR* macros
 		uint32_t S00, S01, S02, S03, S04, S05, S06, S07, S08, S09, S10, S11;
 		uint32_t S12, S13, S14, S15, S16, S17, S18, S19, S20, S21, S22, S23;
 		uint32_t S24, S25, S26, S27, S28, S29, S30, S31, S32, S33, S34, S35;
 		//uint32_t B24, B25, B26,
 		uint32_t B27, B28, B29, B30, B31, B32, B33, B34, B35;
 		//const uint64_t bc = 640 bits to hash
 		//const uint32_t bclo = (uint32_t)(bc);
 		//const uint32_t bchi = (uint32_t)(bc >> 32);
 		// initial state: first 20 words zero, last 16 words set to constants
 		S00 = S01 = S02 = S03 = S04 = S05 = S06 = S07 = S08 = S09 = 0;
 		S10 = S11 = S12 = S13 = S14 = S15 = S16 = S17 = S18 = S19 = 0;
 		S20 = 0x8807a57e; S21 = 0xe616af75; S22 = 0xc5d3e4db; S23 = 0xac9ab027;
 		S24 = 0xd915f117; S25 = 0xb6eecc54; S26 = 0x06e8020b; S27 = 0x4a92efd1;
 		S28 = 0xaac6e2c9; S29 = 0xddb21398; S30 = 0xcae65838; S31 = 0x437f203f;
 		S32 = 0x25ea78e7; S33 = 0x951fddd6; S34 = 0xda6ed11d; S35 = 0xe13e3567;
 		// absorb header words 0..17, three per call
 		FUGUE512_3((Data[ 0]), (Data[ 1]), (Data[ 2]));
 		FUGUE512_3((Data[ 3]), (Data[ 4]), (Data[ 5]));
 		FUGUE512_3((Data[ 6]), (Data[ 7]), (Data[ 8]));
 		FUGUE512_3((Data[ 9]), (Data[10]), (Data[11]));
 		FUGUE512_3((Data[12]), (Data[13]), (Data[14]));
 		FUGUE512_3((Data[15]), (Data[16]), (Data[17]));
 		// last two header words, then the 64-bit input bit count (high word 0, low word 640)
 		FUGUE512_F((Data[18]), (Data[19]), 0/*bchi*/, (80*8)/*bclo*/);
 		// rotate right state by 3 dwords (S00 = S33, S03 = S00)
 		// followed by 9 more, i.e. 12 positions in total
 		SUB_ROR3;
 		SUB_ROR9;
 		// 32 rounds of rotate-by-3 + CMIX36 + SMIX
 		#pragma unroll 32
 		for (int i = 0; i < 32; i++) {
 			SUB_ROR3;
 			CMIX36(S00, S01, S02, S04, S05, S06, S18, S19, S20);
 			SMIX(S00, S01, S02, S03);
 		}
 		// 13 iterations of four sub-rounds; each sub-round XORs S00 into four
 		// state words (the targets shift slightly between sub-rounds), rotates
 		// the state by 9, 9, 9 and 8 positions respectively, then applies SMIX
 		#pragma unroll 13
 		for (int i = 0; i < 13; i++) {
 			S04 ^= S00;
 			S09 ^= S00;
 			S18 ^= S00;
 			S27 ^= S00;
 			SUB_ROR9;
 			SMIX(S00, S01, S02, S03);
 			S04 ^= S00;
 			S10 ^= S00;
 			S18 ^= S00;
 			S27 ^= S00;
 			SUB_ROR9;
 			SMIX(S00, S01, S02, S03);
 			S04 ^= S00;
 			S10 ^= S00;
 			S19 ^= S00;
 			S27 ^= S00;
 			SUB_ROR9;
 			SMIX(S00, S01, S02, S03);
 			S04 ^= S00;
 			S10 ^= S00;
 			S19 ^= S00;
 			S28 ^= S00;
 			SUB_ROR8;
 			SMIX(S00, S01, S02, S03);
 		}
 		// final S00 injection before the output is read
 		S04 ^= S00;
 		S09 ^= S00;
 		S18 ^= S00;
 		S27 ^= S00;
 		// the digest is 16 state words (S01..S04, S09..S12, S18..S21, S27..S30),
 		// each byte-swapped; Data is reused as the output staging buffer
 		Data[ 0] = cuda_swab32(S01);
 		Data[ 1] = cuda_swab32(S02);
 		Data[ 2] = cuda_swab32(S03);
 		Data[ 3] = cuda_swab32(S04);
 		Data[ 4] = cuda_swab32(S09);
 		Data[ 5] = cuda_swab32(S10);
 		Data[ 6] = cuda_swab32(S11);
 		Data[ 7] = cuda_swab32(S12);
 		Data[ 8] = cuda_swab32(S18);
 		Data[ 9] = cuda_swab32(S19);
 		Data[10] = cuda_swab32(S20);
 		Data[11] = cuda_swab32(S21);
 		Data[12] = cuda_swab32(S27);
 		Data[13] = cuda_swab32(S28);
 		Data[14] = cuda_swab32(S29);
 		Data[15] = cuda_swab32(S30);
 		// 64 bytes per thread: index widened to size_t before the shift
 		const size_t hashPosition = thread;
 		uint64_t* pHash = &g_hash[hashPosition << 3];
 		// store the digest as four 16-byte vectors
 		#pragma unroll 4
 		for(int i = 0; i < 4; i++)
 			AS_UINT4(&pHash[i * 2]) = AS_UINT4(&Data[i * 4]);
 	}
 }
 /*
  * texDef: allocate a device buffer, fill it from a host table and bind it to
  * a 1D texture reference.
  *   id        : slot index in d_textures[thr_id][...]
  *   texname   : texture reference to configure and bind
  *   texmem    : identifier used for the local device pointer the macro declares
  *   texsource : host source data
  *   texsize   : size in bytes to allocate, copy and bind
  * Expects a variable named `thr_id` in scope at the expansion site.
  * The texture is set to unnormalized coordinates, point filtering and clamp
  * addressing. None of the CUDA return codes are checked.
  */
 #define texDef(id, texname, texmem, texsource, texsize) { \
 	unsigned int *texmem; \
 	cudaMalloc(&texmem, texsize); \
 	d_textures[thr_id][id] = texmem; \
 	cudaMemcpy(texmem, texsource, texsize, cudaMemcpyHostToDevice); \
 	texname.normalized = 0; \
 	texname.filterMode = cudaFilterModePoint; \
 	texname.addressMode[0] = cudaAddressModeClamp; \
 	{ cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<unsigned int>(); \
 	  cudaBindTexture(NULL, &texname, texmem, &channelDesc, texsize ); \
 	} \
 }
 // One-time setup for the fugue512-80 kernel on the GPU slot `thr_id`:
 // uploads the 256-entry mixtab0 table and binds it to mixTab0Tex (via texDef).
 // The device pointer is remembered in d_textures[thr_id][0].
 // `threads` is not used.
 __host__
 void x16_fugue512_cpu_init(int thr_id, uint32_t threads)
 {
 	const size_t tableBytes = 256 * sizeof(uint32_t);
 	texDef(0, mixTab0Tex, d_mixTab0, mixtab0, tableBytes);
 }
 // Release the device copy of the mixing table that x16_fugue512_cpu_init
 // stored in d_textures[thr_id][0].
 // The slot is cleared afterwards, so calling this twice for the same thr_id
 // passes NULL to cudaFree (a no-op) instead of freeing a stale pointer.
 __host__
 void x16_fugue512_cpu_free(int thr_id)
 {
 	cudaFree(d_textures[thr_id][0]);
 	d_textures[thr_id][0] = NULL;
 }
 // Host-side launcher for x16_fugue512_gpu_hash_80.
 //   thr_id     : not used here
 //   threads    : number of hashes to compute
 //   startNonce : nonce assigned to thread 0; thread t uses startNonce + t
 //   d_hash     : device output buffer, reinterpreted as uint64_t for the kernel
 // Launches ceil(threads / TPB) blocks of TPB threads on the default stream;
 // the launch result is not checked here.
 __host__
 void x16_fugue512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash)
 {
 	const uint32_t tpb = TPB;
 	const uint32_t blocks = (threads + tpb - 1) / tpb;
 	x16_fugue512_gpu_hash_80 <<<dim3(blocks), dim3(tpb)>>> (threads, startNonce, (uint64_t*)d_hash);
 }
--- a/x16/cuda_x16_shabal512.cu
+++ b/x16/cuda_x16_shabal512.cu
@ -0,0 +1,350 @@
 /*
 * Shabal-512 for X16R
 * tpruvot 2018, based on alexis x14 and xevan kernlx code
 */
 #include <cuda_helper.h>
 #include <cuda_vectors.h>
 #include <cuda_vector_uint2x4.h>
 // 32-bit word type used by the Shabal macros below.
 typedef uint32_t sph_u32;
 // C32 wraps a 32-bit constant and T32 wraps a 32-bit result; both are
 // identity macros here.
 #define C32(x) (x)
 #define T32(x) (x)
 /*
  * INPUT_BLOCK_ADD: add the sixteen message words M0..MF to the state words
  * B0..BF, word by word. Expects B0..BF and M0..MF in scope.
  */
 #define INPUT_BLOCK_ADD do { \
 		B0 = T32(B0 + M0); B1 = T32(B1 + M1); B2 = T32(B2 + M2); B3 = T32(B3 + M3); \
 		B4 = T32(B4 + M4); B5 = T32(B5 + M5); B6 = T32(B6 + M6); B7 = T32(B7 + M7); \
 		B8 = T32(B8 + M8); B9 = T32(B9 + M9); BA = T32(BA + MA); BB = T32(BB + MB); \
 		BC = T32(BC + MC); BD = T32(BD + MD); BE = T32(BE + ME); BF = T32(BF + MF); \
 		} while (0)
 /*
  * INPUT_BLOCK_SUB: subtract the sixteen message words M0..MF from the state
  * words C0..CF, word by word. Expects C0..CF and M0..MF in scope.
  */
 #define INPUT_BLOCK_SUB do { \
 		C0 = T32(C0 - M0); C1 = T32(C1 - M1); C2 = T32(C2 - M2); C3 = T32(C3 - M3); \
 		C4 = T32(C4 - M4); C5 = T32(C5 - M5); C6 = T32(C6 - M6); C7 = T32(C7 - M7); \
 		C8 = T32(C8 - M8); C9 = T32(C9 - M9); CA = T32(CA - MA); CB = T32(CB - MB); \
 		CC = T32(CC - MC); CD = T32(CD - MD); CE = T32(CE - ME); CF = T32(CF - MF); \
 		} while (0)
 /*
  * XOR_W: fold the 64-bit block counter (Wlow, Whigh) into the first two
  * A words. Expects A00, A01, Wlow and Whigh in scope.
  */
 #define XOR_W   do { \
 		A00 = A00 ^ Wlow; \
 		A01 = A01 ^ Whigh; \
 		} while (0)
 /*
  * SWAP: exchange two sph_u32 lvalues through a block-local temporary.
  * Each argument is evaluated twice, so pass plain variables only.
  */
 #define SWAP(v1, v2) do { \
 		sph_u32 held = (v1); \
 		(v1) = (v2); \
 		(v2) = held; \
 		} while (0)
 /*
  * SWAP_BC: exchange the whole B state with the whole C state, word by word
  * (B0<->C0 ... BF<->CF). Expects B0..BF and C0..CF in scope.
  */
 #define SWAP_BC do { \
 		SWAP(B0, C0); SWAP(B1, C1); SWAP(B2, C2); SWAP(B3, C3); \
 		SWAP(B4, C4); SWAP(B5, C5); SWAP(B6, C6); SWAP(B7, C7); \
 		SWAP(B8, C8); SWAP(B9, C9); SWAP(BA, CA); SWAP(BB, CB); \
 		SWAP(BC, CC); SWAP(BD, CD); SWAP(BE, CE); SWAP(BF, CF); \
 		} while (0)
 /*
  * PERM_ELT: one element step of the permutation.
  *   xa0 = ((xa0 ^ (rotl(xa1, 15) * 5) ^ xc) * 3) ^ xb1 ^ (xb2 & ~xb3) ^ xm
  *   xb0 = ~(rotl(xb0, 1) ^ xa0)        (uses the freshly updated xa0)
  * xa0 and xb0 are updated in place; the other arguments are only read.
  * Arguments are used unparenthesized, so pass plain variables.
  */
 #define PERM_ELT(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) do { \
 		xa0 = T32((xa0 \
 			^ (((xa1 << 15) | (xa1 >> 17)) * 5U) \
 			^ xc) * 3U) \
 			^ xb1 ^ (xb2 & ~xb3) ^ xm; \
 		xb0 = T32(~(((xb0 << 1) | (xb0 >> 31)) ^ xa0)); \
 		} while (0)
 /*
  * PERM_STEP_0: first of three passes over the sixteen B words.
  * Each line updates one B word (B0..BF in order) and one A word; in this
  * pass the A index starts at A00 and wraps after A0B, ending on A03.
  * C words are consumed from C8 downwards (wrapping C0 -> CF), message words
  * from M0 upwards. Expects A00..A0B, B0..BF, C0..CF and M0..MF in scope.
  */
 #define PERM_STEP_0 do { \
 		PERM_ELT(A00, A0B, B0, BD, B9, B6, C8, M0); \
 		PERM_ELT(A01, A00, B1, BE, BA, B7, C7, M1); \
 		PERM_ELT(A02, A01, B2, BF, BB, B8, C6, M2); \
 		PERM_ELT(A03, A02, B3, B0, BC, B9, C5, M3); \
 		PERM_ELT(A04, A03, B4, B1, BD, BA, C4, M4); \
 		PERM_ELT(A05, A04, B5, B2, BE, BB, C3, M5); \
 		PERM_ELT(A06, A05, B6, B3, BF, BC, C2, M6); \
 		PERM_ELT(A07, A06, B7, B4, B0, BD, C1, M7); \
 		PERM_ELT(A08, A07, B8, B5, B1, BE, C0, M8); \
 		PERM_ELT(A09, A08, B9, B6, B2, BF, CF, M9); \
 		PERM_ELT(A0A, A09, BA, B7, B3, B0, CE, MA); \
 		PERM_ELT(A0B, A0A, BB, B8, B4, B1, CD, MB); \
 		PERM_ELT(A00, A0B, BC, B9, B5, B2, CC, MC); \
 		PERM_ELT(A01, A00, BD, BA, B6, B3, CB, MD); \
 		PERM_ELT(A02, A01, BE, BB, B7, B4, CA, ME); \
 		PERM_ELT(A03, A02, BF, BC, B8, B5, C9, MF); \
 		} while (0)
 /*
  * PERM_STEP_1: second pass over the sixteen B words. Same B/C/M pattern as
  * PERM_STEP_0, but the A index continues where that pass stopped: it starts
  * at A04 and ends on A07.
  */
 #define PERM_STEP_1 do { \
 		PERM_ELT(A04, A03, B0, BD, B9, B6, C8, M0); \
 		PERM_ELT(A05, A04, B1, BE, BA, B7, C7, M1); \
 		PERM_ELT(A06, A05, B2, BF, BB, B8, C6, M2); \
 		PERM_ELT(A07, A06, B3, B0, BC, B9, C5, M3); \
 		PERM_ELT(A08, A07, B4, B1, BD, BA, C4, M4); \
 		PERM_ELT(A09, A08, B5, B2, BE, BB, C3, M5); \
 		PERM_ELT(A0A, A09, B6, B3, BF, BC, C2, M6); \
 		PERM_ELT(A0B, A0A, B7, B4, B0, BD, C1, M7); \
 		PERM_ELT(A00, A0B, B8, B5, B1, BE, C0, M8); \
 		PERM_ELT(A01, A00, B9, B6, B2, BF, CF, M9); \
 		PERM_ELT(A02, A01, BA, B7, B3, B0, CE, MA); \
 		PERM_ELT(A03, A02, BB, B8, B4, B1, CD, MB); \
 		PERM_ELT(A04, A03, BC, B9, B5, B2, CC, MC); \
 		PERM_ELT(A05, A04, BD, BA, B6, B3, CB, MD); \
 		PERM_ELT(A06, A05, BE, BB, B7, B4, CA, ME); \
 		PERM_ELT(A07, A06, BF, BC, B8, B5, C9, MF); \
 		} while (0)
 /*
  * PERM_STEP_2: third pass over the sixteen B words. Same B/C/M pattern as
  * PERM_STEP_0, with the A index starting at A08 and ending on A0B, so the
  * three passes together cover the twelve A words four times each.
  */
 #define PERM_STEP_2 do { \
 		PERM_ELT(A08, A07, B0, BD, B9, B6, C8, M0); \
 		PERM_ELT(A09, A08, B1, BE, BA, B7, C7, M1); \
 		PERM_ELT(A0A, A09, B2, BF, BB, B8, C6, M2); \
 		PERM_ELT(A0B, A0A, B3, B0, BC, B9, C5, M3); \
 		PERM_ELT(A00, A0B, B4, B1, BD, BA, C4, M4); \
 		PERM_ELT(A01, A00, B5, B2, BE, BB, C3, M5); \
 		PERM_ELT(A02, A01, B6, B3, BF, BC, C2, M6); \
 		PERM_ELT(A03, A02, B7, B4, B0, BD, C1, M7); \
 		PERM_ELT(A04, A03, B8, B5, B1, BE, C0, M8); \
 		PERM_ELT(A05, A04, B9, B6, B2, BF, CF, M9); \
 		PERM_ELT(A06, A05, BA, B7, B3, B0, CE, MA); \
 		PERM_ELT(A07, A06, BB, B8, B4, B1, CD, MB); \
 		PERM_ELT(A08, A07, BC, B9, B5, B2, CC, MC); \
 		PERM_ELT(A09, A08, BD, BA, B6, B3, CB, MD); \
 		PERM_ELT(A0A, A09, BE, BB, B7, B4, CA, ME); \
 		PERM_ELT(A0B, A0A, BF, BC, B8, B5, C9, MF); \
 		} while (0)
 /*
  * APPLY_P: the full permutation on the A/B/C state.
  *   1. rotate every B word left by 17 bits
  *   2. run the three PERM_STEP passes
  *   3. add C words into the A words: 36 additions, three per A word
  * Expects A00..A0B, B0..BF, C0..CF and M0..MF in scope.
  */
 #define APPLY_P do { \
 		B0 = T32(B0 << 17) | (B0 >> 15); \
 		B1 = T32(B1 << 17) | (B1 >> 15); \
 		B2 = T32(B2 << 17) | (B2 >> 15); \
 		B3 = T32(B3 << 17) | (B3 >> 15); \
 		B4 = T32(B4 << 17) | (B4 >> 15); \
 		B5 = T32(B5 << 17) | (B5 >> 15); \
 		B6 = T32(B6 << 17) | (B6 >> 15); \
 		B7 = T32(B7 << 17) | (B7 >> 15); \
 		B8 = T32(B8 << 17) | (B8 >> 15); \
 		B9 = T32(B9 << 17) | (B9 >> 15); \
 		BA = T32(BA << 17) | (BA >> 15); \
 		BB = T32(BB << 17) | (BB >> 15); \
 		BC = T32(BC << 17) | (BC >> 15); \
 		BD = T32(BD << 17) | (BD >> 15); \
 		BE = T32(BE << 17) | (BE >> 15); \
 		BF = T32(BF << 17) | (BF >> 15); \
 		PERM_STEP_0; \
 		PERM_STEP_1; \
 		PERM_STEP_2; \
 		A0B = T32(A0B + C6); \
 		A0A = T32(A0A + C5); \
 		A09 = T32(A09 + C4); \
 		A08 = T32(A08 + C3); \
 		A07 = T32(A07 + C2); \
 		A06 = T32(A06 + C1); \
 		A05 = T32(A05 + C0); \
 		A04 = T32(A04 + CF); \
 		A03 = T32(A03 + CE); \
 		A02 = T32(A02 + CD); \
 		A01 = T32(A01 + CC); \
 		A00 = T32(A00 + CB); \
 		A0B = T32(A0B + CA); \
 		A0A = T32(A0A + C9); \
 		A09 = T32(A09 + C8); \
 		A08 = T32(A08 + C7); \
 		A07 = T32(A07 + C6); \
 		A06 = T32(A06 + C5); \
 		A05 = T32(A05 + C4); \
 		A04 = T32(A04 + C3); \
 		A03 = T32(A03 + C2); \
 		A02 = T32(A02 + C1); \
 		A01 = T32(A01 + C0); \
 		A00 = T32(A00 + CF); \
 		A0B = T32(A0B + CE); \
 		A0A = T32(A0A + CD); \
 		A09 = T32(A09 + CC); \
 		A08 = T32(A08 + CB); \
 		A07 = T32(A07 + CA); \
 		A06 = T32(A06 + C9); \
 		A05 = T32(A05 + C8); \
 		A04 = T32(A04 + C7); \
 		A03 = T32(A03 + C6); \
 		A02 = T32(A02 + C5); \
 		A01 = T32(A01 + C4); \
 		A00 = T32(A00 + C3); \
 	} while (0)
 /*
  * INCR_W: increment the 64-bit block counter held as two 32-bit halves;
  * Whigh is bumped only when Wlow wraps to zero.
  * Expects Wlow and Whigh in scope.
  */
 #define INCR_W do { \
 	Wlow = T32(Wlow + 1); \
 	if (Wlow == 0) \
 		Whigh = T32(Whigh + 1); \
 	} while (0)
 // Initial values for the twelve A state words (A00..A0B), stored in
 // constant memory and read by the 80-byte kernel at start-up.
 __constant__ static const sph_u32 A_init_512[] = {
 	C32(0x20728DFD), C32(0x46C0BD53), C32(0xE782B699), C32(0x55304632),
 	C32(0x71B4EF90), C32(0x0EA9E82C), C32(0xDBB930F1), C32(0xFAD06B8B),
 	C32(0xBE0CAE40), C32(0x8BD14410), C32(0x76D2ADAC), C32(0x28ACAB7F)
 };
 // Initial values for the sixteen B state words (B0..BF), stored in
 // constant memory.
 __constant__ static const sph_u32 B_init_512[] = {
 	C32(0xC1099CB7), C32(0x07B385F3), C32(0xE7442C26), C32(0xCC8AD640),
 	C32(0xEB6F56C7), C32(0x1EA81AA9), C32(0x73B9D314), C32(0x1DE85D08),
 	C32(0x48910A5A), C32(0x893B22DB), C32(0xC5A0DF44), C32(0xBBC4324E),
 	C32(0x72D2F240), C32(0x75941D99), C32(0x6D8BDE82), C32(0xA1A7502B)
 };
 // Initial values for the sixteen C state words (C0..CF), stored in
 // constant memory.
 __constant__ static const sph_u32 C_init_512[] = {
 	C32(0xD9BF68D1), C32(0x58BAD750), C32(0x56028CB2), C32(0x8134F359),
 	C32(0xB5D469D8), C32(0x941A8CC2), C32(0x418B2A6E), C32(0x04052780),
 	C32(0x7F07D787), C32(0x5194358F), C32(0x3C60D665), C32(0xBE97D79A),
 	C32(0x950C3434), C32(0xAED9A06D), C32(0x2537DC8D), C32(0x7CDB5969)
 };
 // Block header input for the 80-byte kernel, kept in constant memory
 // (20 x 32-bit words = 80 bytes).
 __constant__ static uint32_t c_PaddedMessage80[20];
 // Upload the block header to the device.
 //   pdata : host pointer; sizeof(c_PaddedMessage80) bytes are read from it
 // The result of cudaMemcpyToSymbol is not checked.
 __host__
 void x16_shabal512_setBlock_80(void *pdata)
 {
 	const size_t headerBytes = sizeof(c_PaddedMessage80);
 	cudaMemcpyToSymbol(c_PaddedMessage80, pdata, headerBytes, 0, cudaMemcpyHostToDevice);
 }
 #define TPB_SHABAL 256
 /*
  * Shabal-512 over an 80-byte input, one hash per thread.
  *
  *   threads    : number of hashes to compute; extra threads exit immediately
  *   startNonce : thread t uses cuda_swab32(startNonce + t) as header word 19
  *   g_hash     : output buffer, 16 x uint32_t (64 bytes) per thread,
  *                written at index thread * 16
  *
  * Launch layout: 1D grid of 1D blocks, at most TPB_SHABAL threads per block.
  * Precondition: c_PaddedMessage80 uploaded (x16_shabal512_setBlock_80).
  * The variable names A00..A0B, B0..BF, C0..CF, M0..MF, Wlow and Whigh are
  * fixed by the macros above and must not be renamed.
  */
 __global__ __launch_bounds__(TPB_SHABAL, 2)
 void x16_shabal512_gpu_hash_80(uint32_t threads, const uint32_t startNonce, uint32_t *g_hash)
 {
 	const uint32_t thread = blockIdx.x * blockDim.x + threadIdx.x;
 	if (thread >= threads)
 		return;
 	// first 64 bytes of the header, fetched as two 32-byte vectors
 	// todo: try __ldc
 	uint32_t msg[16];
 	*(uint2x4*)&msg[0] = *(uint2x4*)&c_PaddedMessage80[0];
 	*(uint2x4*)&msg[8] = *(uint2x4*)&c_PaddedMessage80[8];
 	// state initialisation from the constant tables
 	sph_u32 A00 = A_init_512[ 0], A01 = A_init_512[ 1], A02 = A_init_512[ 2], A03 = A_init_512[ 3];
 	sph_u32 A04 = A_init_512[ 4], A05 = A_init_512[ 5], A06 = A_init_512[ 6], A07 = A_init_512[ 7];
 	sph_u32 A08 = A_init_512[ 8], A09 = A_init_512[ 9], A0A = A_init_512[10], A0B = A_init_512[11];
 	sph_u32 B0 = B_init_512[ 0], B1 = B_init_512[ 1], B2 = B_init_512[ 2], B3 = B_init_512[ 3];
 	sph_u32 B4 = B_init_512[ 4], B5 = B_init_512[ 5], B6 = B_init_512[ 6], B7 = B_init_512[ 7];
 	sph_u32 B8 = B_init_512[ 8], B9 = B_init_512[ 9], BA = B_init_512[10], BB = B_init_512[11];
 	sph_u32 BC = B_init_512[12], BD = B_init_512[13], BE = B_init_512[14], BF = B_init_512[15];
 	sph_u32 C0 = C_init_512[ 0], C1 = C_init_512[ 1], C2 = C_init_512[ 2], C3 = C_init_512[ 3];
 	sph_u32 C4 = C_init_512[ 4], C5 = C_init_512[ 5], C6 = C_init_512[ 6], C7 = C_init_512[ 7];
 	sph_u32 C8 = C_init_512[ 8], C9 = C_init_512[ 9], CA = C_init_512[10], CB = C_init_512[11];
 	sph_u32 CC = C_init_512[12], CD = C_init_512[13], CE = C_init_512[14], CF = C_init_512[15];
 	// block counter starts at 1
 	sph_u32 Wlow = 1, Whigh = 0;
 	// first message block: header words 0..15
 	sph_u32 M0 = msg[ 0], M1 = msg[ 1], M2 = msg[ 2], M3 = msg[ 3];
 	sph_u32 M4 = msg[ 4], M5 = msg[ 5], M6 = msg[ 6], M7 = msg[ 7];
 	sph_u32 M8 = msg[ 8], M9 = msg[ 9], MA = msg[10], MB = msg[11];
 	sph_u32 MC = msg[12], MD = msg[13], ME = msg[14], MF = msg[15];
 	INPUT_BLOCK_ADD;
 	XOR_W;
 	APPLY_P;
 	INPUT_BLOCK_SUB;
 	SWAP_BC;
 	INCR_W;
 	// second (last) message block: header words 16..18, the byte-swapped
 	// nonce, the 0x80 padding word, then zeros
 	M0 = c_PaddedMessage80[16];
 	M1 = c_PaddedMessage80[17];
 	M2 = c_PaddedMessage80[18];
 	M3 = cuda_swab32(startNonce + thread);
 	M4 = 0x80;
 	M5 = 0; M6 = 0; M7 = 0; M8 = 0; M9 = 0; MA = 0;
 	MB = 0; MC = 0; MD = 0; ME = 0; MF = 0;
 	INPUT_BLOCK_ADD;
 	XOR_W;
 	APPLY_P;
 	// three closing rounds on the same block and counter value
 	for (int round = 3; round > 0; --round) {
 		SWAP_BC;
 		XOR_W;
 		APPLY_P;
 	}
 	// the digest is the final B state
 	uint32_t digest[16] = {
 		B0, B1, B2, B3, B4, B5, B6, B7,
 		B8, B9, BA, BB, BC, BD, BE, BF
 	};
 	// 64 bytes per thread; index widened to 64 bits before the shift
 	const uint64_t hashPosition = thread;
 	uint32_t *Hash = &g_hash[hashPosition << 4];
 	*(uint2x4*)&Hash[0] = *(uint2x4*)&digest[0];
 	*(uint2x4*)&Hash[8] = *(uint2x4*)&digest[8];
 }
 // Host-side launcher for x16_shabal512_gpu_hash_80.
 //   thr_id     : not used here
 //   threads    : number of hashes to compute
 //   startNonce : nonce assigned to thread 0; thread t uses startNonce + t
 //   d_hash     : device output buffer (16 words per hash)
 // Launches ceil(threads / TPB_SHABAL) blocks of TPB_SHABAL threads on the
 // default stream; the launch result is not checked here.
 __host__
 void x16_shabal512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNonce, uint32_t *d_hash)
 {
 	const uint32_t tpb = TPB_SHABAL;
 	const uint32_t blocks = (threads + tpb - 1) / tpb;
 	x16_shabal512_gpu_hash_80 <<<dim3(blocks), dim3(tpb)>>>(threads, startNonce, d_hash);
 }
--- a/x16/cuda_x16_simd512_80.cu
+++ b/x16/cuda_x16_simd512_80.cu
--- a/x16/x16r.cu
+++ b/x16/x16r.cu
@ -0,0 +1,622 @@
 /**
 * X16R algorithm (X16 with Randomized chain order)
 *
 * tpruvot 2018 - GPL code
 */
 #include <stdio.h>
 #include <memory.h>
 #include <unistd.h>
 extern "C" {
 #include "sph/sph_blake.h"
 #include "sph/sph_bmw.h"
 #include "sph/sph_groestl.h"
 #include "sph/sph_skein.h"
 #include "sph/sph_jh.h"
 #include "sph/sph_keccak.h"
 #include "sph/sph_luffa.h"
 #include "sph/sph_cubehash.h"
 #include "sph/sph_shavite.h"
 #include "sph/sph_simd.h"
 #include "sph/sph_echo.h"
 #include "sph/sph_hamsi.h"
 #include "sph/sph_fugue.h"
 #include "sph/sph_shabal.h"
 #include "sph/sph_whirlpool.h"
 #include "sph/sph_sha2.h"
 }
 #include "miner.h"
 #include "cuda_helper.h"
 #include "cuda_x16.h"
 // Per-thread device hash buffer, indexed by thr_id. scanhash_x16r allocates
 // 64 bytes per nonce (64 * throughput) and free_x16r releases it.
 static uint32_t *d_hash[MAX_GPUS];
 // Identifiers of the 16 chained hash functions. The numeric value of each
 // entry is the hex digit (0x0..0xF) that selects it in the hash order string
 // built by getAlgoString(); the switch statements in this file decode a digit
 // and compare it against these values.
 enum Algo {
 	BLAKE = 0,
 	BMW,
 	GROESTL,
 	JH,
 	KECCAK,
 	SKEIN,
 	LUFFA,
 	CUBEHASH,
 	SHAVITE,
 	SIMD,
 	ECHO,
 	HAMSI,
 	FUGUE,
 	SHABAL,
 	WHIRLPOOL,
 	SHA512,
 	HASH_FUNC_COUNT // number of algorithms (16); also the length of the order string
 };
 // Printable names, indexed by enum Algo. Used in log messages (e.g. the
 // "does not validate on CPU" warning in scanhash_x16r). The trailing NULL
 // terminates the table.
 static const char* algo_strings[] = {
 	"blake",
 	"bmw512",
 	"groestl",
 	"jh512",
 	"keccak",
 	"skein",
 	"luffa",
 	"cube",
 	"shavite",
 	"simd",
 	"echo",
 	"hamsi",
 	"fugue",
 	"shabal",
 	"whirlpool",
 	"sha512",
 	NULL
 };
 // Thread-local cache of the current hash order.
 // s_ntime holds the byte-swapped data[17] value for which hashOrder was last
 // computed in scanhash_x16r (UINT32_MAX = not computed yet).
 static __thread uint32_t s_ntime = UINT32_MAX;
 // 16 upper-case hex digits plus the terminating NUL; also rewritten by every
 // call to x16r_hash().
 static __thread char hashOrder[HASH_FUNC_COUNT + 1] = { 0 };
 /**
  * Build the X16R hash order string from the block header.
  *
  * Reads the first 8 bytes at prevblock and emits their 16 nibbles as
  * upper-case hex characters, walking the bytes from index 7 down to 0 and
  * taking the high nibble before the low nibble of each byte.
  *
  * @param prevblock  pointer to at least 8 readable bytes
  * @param output     receives HASH_FUNC_COUNT characters plus a terminating NUL
  */
 static void getAlgoString(const uint32_t* prevblock, char *output)
 {
 	static const char hexUpper[] = "0123456789ABCDEF";
 	const uint8_t *bytes = (const uint8_t*) prevblock;
 	for (uint8_t pos = 0; pos < HASH_FUNC_COUNT; pos++) {
 		// 16 ascii hex chars, reversed: positions 0,1 come from byte 7, ... 14,15 from byte 0
 		const uint8_t octet = bytes[(15 - pos) >> 1];
 		const uint8_t nibble = (pos & 1) ? (octet & 0xF) : (octet >> 4);
 		output[pos] = hexUpper[nibble];
 	}
 	output[HASH_FUNC_COUNT] = '\0';
 }
 /**
  * X16R CPU hash, used to validate GPU results.
  *
  * Derives the hash order from input words 1..2 via getAlgoString() (this
  * overwrites the thread-local hashOrder), then chains 16 sph hash rounds:
  * the first round consumes the 80-byte input, every later round consumes the
  * 64-byte digest of the previous one.
  *
  * @param output  receives the first 32 bytes of the final digest
  * @param input   80-byte block header
  */
 extern "C" void x16r_hash(void *output, const void *input)
 {
 	// one context per primitive; each is (re)initialised right before it is used
 	sph_blake512_context ctx_blake;
 	sph_bmw512_context ctx_bmw;
 	sph_groestl512_context ctx_groestl;
 	sph_jh512_context ctx_jh;
 	sph_keccak512_context ctx_keccak;
 	sph_skein512_context ctx_skein;
 	sph_luffa512_context ctx_luffa;
 	sph_cubehash512_context ctx_cubehash;
 	sph_shavite512_context ctx_shavite;
 	sph_simd512_context ctx_simd;
 	sph_echo512_context ctx_echo;
 	sph_hamsi512_context ctx_hamsi;
 	sph_fugue512_context ctx_fugue;
 	sph_shabal512_context ctx_shabal;
 	sph_whirlpool_context ctx_whirlpool;
 	sph_sha512_context ctx_sha512;
 	unsigned char _ALIGN(64) hash[128];
 	uint32_t *header32 = (uint32_t*) input;
 	void *src = (void*) input;
 	int len = 80;
 	getAlgoString(&header32[1], hashOrder);
 // One round of the chain: init, absorb `len` bytes from `src`, write digest to `hash`.
 #define X16R_CPU_ROUND(ALGO, FN, CTX) \
 	case ALGO: \
 		FN##_init(&CTX); \
 		FN(&CTX, src, len); \
 		FN##_close(&CTX, hash); \
 		break
 	for (int round = 0; round < 16; round++)
 	{
 		const char digit = hashOrder[round];
 		// hex character -> algorithm index
 		const uint8_t algo = digit >= 'A' ? digit - 'A' + 10 : digit - '0';
 		switch (algo) {
 			X16R_CPU_ROUND(BLAKE,     sph_blake512,    ctx_blake);
 			X16R_CPU_ROUND(BMW,       sph_bmw512,      ctx_bmw);
 			X16R_CPU_ROUND(GROESTL,   sph_groestl512,  ctx_groestl);
 			X16R_CPU_ROUND(JH,        sph_jh512,       ctx_jh);
 			X16R_CPU_ROUND(KECCAK,    sph_keccak512,   ctx_keccak);
 			X16R_CPU_ROUND(SKEIN,     sph_skein512,    ctx_skein);
 			X16R_CPU_ROUND(LUFFA,     sph_luffa512,    ctx_luffa);
 			X16R_CPU_ROUND(CUBEHASH,  sph_cubehash512, ctx_cubehash);
 			X16R_CPU_ROUND(SHAVITE,   sph_shavite512,  ctx_shavite);
 			X16R_CPU_ROUND(SIMD,      sph_simd512,     ctx_simd);
 			X16R_CPU_ROUND(ECHO,      sph_echo512,     ctx_echo);
 			X16R_CPU_ROUND(HAMSI,     sph_hamsi512,    ctx_hamsi);
 			X16R_CPU_ROUND(FUGUE,     sph_fugue512,    ctx_fugue);
 			X16R_CPU_ROUND(SHABAL,    sph_shabal512,   ctx_shabal);
 			X16R_CPU_ROUND(WHIRLPOOL, sph_whirlpool,   ctx_whirlpool);
 			X16R_CPU_ROUND(SHA512,    sph_sha512,      ctx_sha512);
 		}
 		// every round after the first hashes the previous 64-byte digest
 		src = (void*) hash;
 		len = 64;
 	}
 #undef X16R_CPU_ROUND
 	memcpy(output, hash, 32);
 }
 /**
  * Run Whirlpool over the first 64 bytes of input and export the context's
  * internal state.
  *
  * @param state  receives 64 bytes copied from the context's `state` member
  * @param input  at least 64 readable bytes
  */
 void whirlpool_midstate(void *state, const void *input)
 {
 	sph_whirlpool_context midctx;
 	sph_whirlpool_init(&midctx);
 	sph_whirlpool(&midctx, input, 64);
 	memcpy(state, midctx.state, 64);
 }
 // Per-thread "GPU resources are set up" flag: set at the end of the first-call
 // setup in scanhash_x16r, cleared by free_x16r.
 static bool init[MAX_GPUS] = { 0 };
 // True when cuda_arch[dev_id] < 500; selects the x11 echo 64-byte kernel
 // instead of the x16 one in scanhash_x16r.
 static bool use_compat_kernels[MAX_GPUS] = { 0 };
 // Uncomment to compile the debug block in scanhash_x16r, which hashes the
 // current nonce on the CPU, logs it and returns -1.
 //#define _DEBUG
 #define _DEBUG_PREFIX "x16r-"
 #include "cuda_debug.cuh"
 // Counters referenced only by the "#if 0" statistics block in scanhash_x16r.
 //static int algo80_tests[HASH_FUNC_COUNT] = { 0 };
 //static int algo64_tests[HASH_FUNC_COUNT] = { 0 };
 // Number of CPU validation failures, indexed by the 80-byte (first) algorithm.
 static int algo80_fails[HASH_FUNC_COUNT] = { 0 };
 /**
  * Scan a nonce range for X16R on the GPU mapped to thr_id.
  *
  * Each loop iteration hashes `throughput` nonces starting at work->data[19]:
  * the first algorithm of the hash order runs its 80-byte kernel, the remaining
  * 15 run their 64-byte kernels on d_hash[thr_id]. A candidate reported by
  * cuda_check_hash() is re-hashed on the CPU with x16r_hash() before it is
  * accepted.
  *
  * @param thr_id       miner thread index (selects the device and per-thread buffers)
  * @param work         job; data[19] is advanced as the nonce cursor, nonces[] and
  *                     valid_nonces receive the results
  * @param max_nonce    scanning stops once the cursor reaches this value
  * @param hashes_done  out: number of nonces covered by this call
  * @return the number of valid nonces found (1 or 2), 0 when the range is
  *         exhausted or a work restart is flagged, -1 when the first order digit
  *         maps to no known algorithm (or always, in _DEBUG builds).
  *         NOTE(review): CUDA_CALL_OR_RET_X presumably returns 0 on allocation
  *         failure -- confirm against its definition.
  */
 extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 {
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
 	const int dev_id = device_map[thr_id];
 	int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 19;
 	if (strstr(device_name[dev_id], "GTX 1080")) intensity = 20;
 	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
 	//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
 	// one-time per-thread setup: device selection, kernel init, output buffer
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
 		if (opt_cudaschedule == -1 && gpu_threads == 1) {
 			cudaDeviceReset();
 			// reduce cpu usage
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 		}
 		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 		cuda_get_arch(thr_id);
 		use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
 		if (use_compat_kernels[thr_id])
 			x11_echo512_cpu_init(thr_id, throughput);
 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_bmw512_cpu_init(thr_id, throughput);
 		quark_groestl512_cpu_init(thr_id, throughput);
 		quark_skein512_cpu_init(thr_id, throughput);
 		quark_jh512_cpu_init(thr_id, throughput);
 		quark_keccak512_cpu_init(thr_id, throughput);
 		qubit_luffa512_cpu_init(thr_id, throughput);
 		x11_luffa512_cpu_init(thr_id, throughput); // 64
 		x11_shavite512_cpu_init(thr_id, throughput);
 		x11_simd512_cpu_init(thr_id, throughput); // 64
 		x16_echo512_cuda_init(thr_id, throughput);
 		x13_hamsi512_cpu_init(thr_id, throughput);
 		x13_fugue512_cpu_init(thr_id, throughput);
 		x16_fugue512_cpu_init(thr_id, throughput);
 		x14_shabal512_cpu_init(thr_id, throughput);
 		x15_whirlpool_cpu_init(thr_id, throughput, 0);
 		x16_whirlpool512_init(thr_id, throughput);
 		x17_sha512_cpu_init(thr_id, throughput);
 		// 64 bytes (one 512-bit digest) per nonce
 		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
 		cuda_check_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
 	}
 	if (opt_benchmark) {
 		((uint32_t*)ptarget)[7] = 0x003f;
 		//((uint8_t*)pdata)[8] = 0x90; // hashOrder[0] = '9'; for simd 80 + blake512 64
 		((uint8_t*)pdata)[8] = 0xAA; // hashOrder[0] = 'A'; for echo 80 + 64
 		//((uint8_t*)pdata)[8] = 0xB0; // hashOrder[0] = 'B'; for hamsi 80 + blake512 64
 		//((uint8_t*)pdata)[8] = 0xC0; // hashOrder[0] = 'C'; for fugue 80 + blake512 64
 		//((uint8_t*)pdata)[8] = 0xE0; // hashOrder[0] = 'E'; for whirlpool 80 + blake512 64
 	}
 	// big-endian copy of header words 0..18; word 19 (nonce) is filled in when validating
 	uint32_t _ALIGN(64) endiandata[20];
 	for (int k=0; k < 19; k++)
 		be32enc(&endiandata[k], pdata[k]);
 	// the hash order is recomputed only when the header time field changes
 	uint32_t ntime = swab32(pdata[17]);
 	if (s_ntime != ntime) {
 		getAlgoString(&endiandata[1], hashOrder);
 		s_ntime = ntime;
 		if (opt_debug && !thr_id) applog(LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime);
 	}
 	cuda_check_cpu_setTarget(ptarget);
 	// first digit of the order selects the 80-byte kernel; upload the header for it
 	char elem = hashOrder[0];
 	const uint8_t algo80 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
 	switch (algo80) {
 		case BLAKE:
 			quark_blake512_cpu_setBlock_80(thr_id, endiandata);
 			break;
 		case BMW:
 			quark_bmw512_cpu_setBlock_80(endiandata);
 			break;
 		case GROESTL:
 			groestl512_setBlock_80(thr_id, endiandata);
 			break;
 		case JH:
 			jh512_setBlock_80(thr_id, endiandata);
 			break;
 		case KECCAK:
 			keccak512_setBlock_80(thr_id, endiandata);
 			break;
 		case SKEIN:
 			skein512_cpu_setBlock_80((void*)endiandata);
 			break;
 		case LUFFA:
 			qubit_luffa512_cpu_setBlock_80((void*)endiandata);
 			break;
 		case CUBEHASH:
 			cubehash512_setBlock_80(thr_id, endiandata);
 			break;
 		case SHAVITE:
 			x11_shavite512_setBlock_80((void*)endiandata);
 			break;
 		case SIMD:
 			x16_simd512_setBlock_80((void*)endiandata);
 			break;
 		case ECHO:
 			x16_echo512_setBlock_80((void*)endiandata);
 			break;
 		case HAMSI:
 			x16_hamsi512_setBlock_80((void*)endiandata);
 			break;
 		case FUGUE:
 			// NOTE(review): fugue alone is given pdata rather than endiandata -- kept as in the original
 			x16_fugue512_setBlock_80((void*)pdata);
 			break;
 		case SHABAL:
 			x16_shabal512_setBlock_80((void*)endiandata);
 			break;
 		case WHIRLPOOL:
 			x16_whirlpool512_setBlock_80((void*)endiandata);
 			break;
 		case SHA512:
 			x16_sha512_setBlock_80(endiandata);
 			break;
 		default: {
 			return -1;
 		}
 	}
 	// warn: 0 = no pending CPU validation failure, 1 = one failure seen, retrying
 	int warn = 0;
 	do {
 		int order = 0;
 		// Hash with CUDA
 		switch (algo80) {
 			case BLAKE:
 				quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("blake80:");
 				break;
 			case BMW:
 				quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 				TRACE("bmw80  :");
 				break;
 			case GROESTL:
 				groestl512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("grstl80:");
 				break;
 			case JH:
 				jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("jh51280:");
 				break;
 			case KECCAK:
 				keccak512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("kecck80:");
 				break;
 			case SKEIN:
 				skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++;
 				TRACE("skein80:");
 				break;
 			case LUFFA:
 				qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 				TRACE("luffa80:");
 				break;
 			case CUBEHASH:
 				cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("cube 80:");
 				break;
 			case SHAVITE:
 				x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 				TRACE("shavite:");
 				break;
 			case SIMD:
 				x16_simd512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("simd512:");
 				break;
 			case ECHO:
 				x16_echo512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("echo   :");
 				break;
 			case HAMSI:
 				x16_hamsi512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("hamsi  :");
 				break;
 			case FUGUE:
 				x16_fugue512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("fugue  :");
 				break;
 			case SHABAL:
 				x16_shabal512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("shabal :");
 				break;
 			case WHIRLPOOL:
 				x16_whirlpool512_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("whirl  :");
 				break;
 			case SHA512:
 				x16_sha512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("sha512 :");
 				break;
 		}
 		// remaining 15 rounds: 64-byte kernels working on d_hash
 		for (int i = 1; i < 16; i++)
 		{
 			const char elem = hashOrder[i];
 			const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
 			switch (algo64) {
 			case BLAKE:
 				quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("blake  :");
 				break;
 			case BMW:
 				quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("bmw    :");
 				break;
 			case GROESTL:
 				quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("groestl:");
 				break;
 			case JH:
 				quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("jh512  :");
 				break;
 			case KECCAK:
 				quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("keccak :");
 				break;
 			case SKEIN:
 				quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("skein  :");
 				break;
 			case LUFFA:
 				x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("luffa  :");
 				break;
 			case CUBEHASH:
 				x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("cube   :");
 				break;
 			case SHAVITE:
 				x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("shavite:");
 				break;
 			case SIMD:
 				x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("simd   :");
 				break;
 			case ECHO:
 				// braces keep the increment inside each branch so `order`
 				// advances exactly once whichever kernel is used
 				if (use_compat_kernels[thr_id]) {
 					x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				} else {
 					x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++;
 				}
 				TRACE("echo   :");
 				break;
 			case HAMSI:
 				x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("hamsi  :");
 				break;
 			case FUGUE:
 				x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("fugue  :");
 				break;
 			case SHABAL:
 				x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("shabal :");
 				break;
 			case WHIRLPOOL:
 				x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("whirl  :");
 				break;
 			case SHA512:
 				x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("sha512 :");
 				break;
 			}
 		}
 		*hashes_done = pdata[19] - first_nonce + throughput;
 		// UINT32_MAX means no candidate in this batch
 		work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 #ifdef _DEBUG
 		uint32_t _ALIGN(64) dhash[8];
 		be32enc(&endiandata[19], pdata[19]);
 		x16r_hash(dhash, endiandata);
 		applog_hash(dhash);
 		return -1;
 #endif
 		if (work->nonces[0] != UINT32_MAX)
 		{
 			// re-hash the candidate on the CPU before trusting the GPU result
 			const uint32_t Htarg = ptarget[7];
 			uint32_t _ALIGN(64) vhash[8];
 			be32enc(&endiandata[19], work->nonces[0]);
 			x16r_hash(vhash, endiandata);
 			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
 				work->valid_nonces = 1;
 				// look for a second candidate in the same batch
 				work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
 				work_set_target_ratio(work, vhash);
 				if (work->nonces[1] != 0) {
 					be32enc(&endiandata[19], work->nonces[1]);
 					x16r_hash(vhash, endiandata);
 					bn_set_target_ratio(work, vhash, 1);
 					work->valid_nonces++;
 					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
 				} else {
 					pdata[19] = work->nonces[0] + 1; // cursor
 				}
 #if 0
 				gpulog(LOG_INFO, thr_id, "hash found with %s 80!", algo_strings[algo80]);
 				algo80_tests[algo80] += work->valid_nonces;
 				char oks64[128] = { 0 };
 				char oks80[128] = { 0 };
 				char fails[128] = { 0 };
 				for (int a = 0; a < HASH_FUNC_COUNT; a++) {
 					const char elem = hashOrder[a];
 					const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
 					if (a > 0) algo64_tests[algo64] += work->valid_nonces;
 					sprintf(&oks64[strlen(oks64)], "|%X:%2d", a, algo64_tests[a] < 100 ? algo64_tests[a] : 99);
 					sprintf(&oks80[strlen(oks80)], "|%X:%2d", a, algo80_tests[a] < 100 ? algo80_tests[a] : 99);
 					sprintf(&fails[strlen(fails)], "|%X:%2d", a, algo80_fails[a] < 100 ? algo80_fails[a] : 99);
 				}
 				applog(LOG_INFO, "K64: %s", oks64);
 				applog(LOG_INFO, "K80: %s", oks80);
 				applog(LOG_ERR,  "F80: %s", fails);
 #endif
 				return work->valid_nonces;
 			}
 			else if (vhash[7] > Htarg) {
 				// x11+ coins could do some random error, but not on retry
 				gpu_increment_reject(thr_id);
 				algo80_fails[algo80]++;
 				if (!warn) {
 					// first failure: resume silently just after the bad nonce
 					warn++;
 					pdata[19] = work->nonces[0] + 1;
 					continue;
 				} else {
 					if (!opt_quiet)	gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU! %s %s",
 						work->nonces[0], algo_strings[algo80], hashOrder);
 					warn = 0;
 				}
 			}
 		}
 		// 64-bit sum avoids wrap-around when the cursor is near UINT32_MAX
 		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
 			pdata[19] = max_nonce;
 			break;
 		}
 		pdata[19] += throughput;
 	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
 	*hashes_done = pdata[19] - first_nonce;
 	return 0;
 }
 /**
  * cleanup: release the GPU resources held by scanhash_x16r for this thread.
  *
  * Does nothing when the thread was never initialised. Otherwise waits for the
  * device to go idle, frees the hash buffer and the per-algorithm resources
  * that have a free routine, and clears the init flag so that a later
  * scanhash_x16r() call sets everything up again.
  *
  * @param thr_id  miner thread index
  */
 extern "C" void free_x16r(int thr_id)
 {
 	if (!init[thr_id])
 		return;
 	// cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its replacement
 	cudaDeviceSynchronize();
 	cudaFree(d_hash[thr_id]);
 	quark_blake512_cpu_free(thr_id);
 	quark_groestl512_cpu_free(thr_id);
 	x11_simd512_cpu_free(thr_id);
 	x13_fugue512_cpu_free(thr_id);
 	x16_fugue512_cpu_free(thr_id); // to merge with x13_fugue512 ?
 	x15_whirlpool_cpu_free(thr_id);
 	cuda_check_cpu_free(thr_id);
 	cudaDeviceSynchronize();
 	init[thr_id] = false;
 }
--- a/x16/x16s.cu
+++ b/x16/x16s.cu
@ -0,0 +1,601 @@
 /**
 * X16S algorithm (X16 with Shuffled chain order)
 *
 * tpruvot 2018 - GPL code
 */
 #include <stdio.h>
 #include <memory.h>
 #include <unistd.h>
 extern "C" {
 #include "sph/sph_blake.h"
 #include "sph/sph_bmw.h"
 #include "sph/sph_groestl.h"
 #include "sph/sph_skein.h"
 #include "sph/sph_jh.h"
 #include "sph/sph_keccak.h"
 #include "sph/sph_luffa.h"
 #include "sph/sph_cubehash.h"
 #include "sph/sph_shavite.h"
 #include "sph/sph_simd.h"
 #include "sph/sph_echo.h"
 #include "sph/sph_hamsi.h"
 #include "sph/sph_fugue.h"
 #include "sph/sph_shabal.h"
 #include "sph/sph_whirlpool.h"
 #include "sph/sph_sha2.h"
 }
 #include "miner.h"
 #include "cuda_helper.h"
 #include "cuda_x16.h"
 // Per-thread device hash buffer, indexed by thr_id. scanhash_x16s allocates
 // 64 bytes per nonce (64 * throughput) and free_x16s releases it.
 static uint32_t *d_hash[MAX_GPUS];
 // Identifiers of the 16 chained hash functions. The numeric value of each
 // entry is the hex digit (0x0..0xF) that stands for it in the hash order
 // string built by getAlgoString(); the switch statements in this file decode
 // a digit and compare it against these values.
 enum Algo {
 	BLAKE = 0,
 	BMW,
 	GROESTL,
 	JH,
 	KECCAK,
 	SKEIN,
 	LUFFA,
 	CUBEHASH,
 	SHAVITE,
 	SIMD,
 	ECHO,
 	HAMSI,
 	FUGUE,
 	SHABAL,
 	WHIRLPOOL,
 	SHA512,
 	HASH_FUNC_COUNT // number of algorithms (16); also the length of the order string
 };
 // Printable names, indexed by enum Algo. Used in log messages (e.g. the
 // "does not validate on CPU" warning in scanhash_x16s). The trailing NULL
 // terminates the table.
 static const char* algo_strings[] = {
 	"blake",
 	"bmw512",
 	"groestl",
 	"jh512",
 	"keccak",
 	"skein",
 	"luffa",
 	"cube",
 	"shavite",
 	"simd",
 	"echo",
 	"hamsi",
 	"fugue",
 	"shabal",
 	"whirlpool",
 	"sha512",
 	NULL
 };
 // Thread-local cache of the current hash order.
 // s_ntime holds the byte-swapped data[17] value for which hashOrder was last
 // computed in scanhash_x16s (UINT32_MAX = not computed yet).
 static __thread uint32_t s_ntime = UINT32_MAX;
 // A permutation of the 16 hex digits plus the terminating NUL; also rewritten
 // by every call to x16s_hash().
 static __thread char hashOrder[HASH_FUNC_COUNT + 1] = { 0 };
 /**
  * Build the X16S hash order string from the block header.
  *
  * Starts from "0123456789ABCDEF" and, for each of the 16 nibbles of the first
  * 8 bytes at prevblock (bytes walked from index 7 down to 0, high nibble
  * first), moves the character currently at that nibble's position to the
  * front. The result is always a permutation of the 16 hex digits.
  *
  * @param prevblock  pointer to at least 8 readable bytes
  * @param output     receives 16 characters plus a terminating NUL
  */
 static void getAlgoString(const uint32_t* prevblock, char *output)
 {
 	const uint8_t *bytes = (const uint8_t*) prevblock;
 	strcpy(output, "0123456789ABCDEF");
 	for (uint8_t step = 0; step < HASH_FUNC_COUNT; step++) {
 		// 16 ascii hex chars, reversed
 		const uint8_t octet = bytes[(15 - step) >> 1];
 		const int pick = (step & 1) ? (octet & 0xF) : (octet >> 4);
 		const char chosen = output[pick];
 		// slide output[0..pick-1] one slot to the right, then put the pick in front
 		for (int k = pick; k > 0; k--)
 			output[k] = output[k - 1];
 		output[0] = chosen;
 	}
 }
 /**
  * X16S CPU hash, used to validate GPU results.
  *
  * Derives the hash order from input words 1..2 via getAlgoString() (this
  * overwrites the thread-local hashOrder), then chains 16 sph hash rounds:
  * the first round consumes the 80-byte input, every later round consumes the
  * 64-byte digest of the previous one.
  *
  * @param output  receives the first 32 bytes of the final digest
  * @param input   80-byte block header
  */
 extern "C" void x16s_hash(void *output, const void *input)
 {
 	// one context per primitive; each is (re)initialised right before it is used
 	sph_blake512_context ctx_blake;
 	sph_bmw512_context ctx_bmw;
 	sph_groestl512_context ctx_groestl;
 	sph_jh512_context ctx_jh;
 	sph_keccak512_context ctx_keccak;
 	sph_skein512_context ctx_skein;
 	sph_luffa512_context ctx_luffa;
 	sph_cubehash512_context ctx_cubehash;
 	sph_shavite512_context ctx_shavite;
 	sph_simd512_context ctx_simd;
 	sph_echo512_context ctx_echo;
 	sph_hamsi512_context ctx_hamsi;
 	sph_fugue512_context ctx_fugue;
 	sph_shabal512_context ctx_shabal;
 	sph_whirlpool_context ctx_whirlpool;
 	sph_sha512_context ctx_sha512;
 	unsigned char _ALIGN(64) hash[128];
 	uint32_t *header32 = (uint32_t*) input;
 	void *src = (void*) input;
 	int len = 80;
 	getAlgoString(&header32[1], hashOrder);
 // One round of the chain: init, absorb `len` bytes from `src`, write digest to `hash`.
 #define X16S_CPU_ROUND(ALGO, FN, CTX) \
 	case ALGO: \
 		FN##_init(&CTX); \
 		FN(&CTX, src, len); \
 		FN##_close(&CTX, hash); \
 		break
 	for (int round = 0; round < 16; round++)
 	{
 		const char digit = hashOrder[round];
 		// hex character -> algorithm index
 		const uint8_t algo = digit >= 'A' ? digit - 'A' + 10 : digit - '0';
 		switch (algo) {
 			X16S_CPU_ROUND(BLAKE,     sph_blake512,    ctx_blake);
 			X16S_CPU_ROUND(BMW,       sph_bmw512,      ctx_bmw);
 			X16S_CPU_ROUND(GROESTL,   sph_groestl512,  ctx_groestl);
 			X16S_CPU_ROUND(JH,        sph_jh512,       ctx_jh);
 			X16S_CPU_ROUND(KECCAK,    sph_keccak512,   ctx_keccak);
 			X16S_CPU_ROUND(SKEIN,     sph_skein512,    ctx_skein);
 			X16S_CPU_ROUND(LUFFA,     sph_luffa512,    ctx_luffa);
 			X16S_CPU_ROUND(CUBEHASH,  sph_cubehash512, ctx_cubehash);
 			X16S_CPU_ROUND(SHAVITE,   sph_shavite512,  ctx_shavite);
 			X16S_CPU_ROUND(SIMD,      sph_simd512,     ctx_simd);
 			X16S_CPU_ROUND(ECHO,      sph_echo512,     ctx_echo);
 			X16S_CPU_ROUND(HAMSI,     sph_hamsi512,    ctx_hamsi);
 			X16S_CPU_ROUND(FUGUE,     sph_fugue512,    ctx_fugue);
 			X16S_CPU_ROUND(SHABAL,    sph_shabal512,   ctx_shabal);
 			X16S_CPU_ROUND(WHIRLPOOL, sph_whirlpool,   ctx_whirlpool);
 			X16S_CPU_ROUND(SHA512,    sph_sha512,      ctx_sha512);
 		}
 		// every round after the first hashes the previous 64-byte digest
 		src = (void*) hash;
 		len = 64;
 	}
 #undef X16S_CPU_ROUND
 	memcpy(output, hash, 32);
 }
 // Compiled out: per the marker below, the live definition is in x16r.
 #if 0 /* in x16r */
 // Runs Whirlpool over 64 bytes of input and copies the context state out.
 void whirlpool_midstate(void *state, const void *input)
 {
 	sph_whirlpool_context ctx;
 	sph_whirlpool_init(&ctx);
 	sph_whirlpool(&ctx, input, 64);
 	memcpy(state, ctx.state, 64);
 }
 #endif
 // Per-thread "GPU resources are set up" flag: set at the end of the first-call
 // setup in scanhash_x16s, cleared by free_x16s.
 static bool init[MAX_GPUS] = { 0 };
 // True when cuda_arch[dev_id] < 500; selects the x11 echo 64-byte kernel
 // instead of the x16 one in scanhash_x16s.
 static bool use_compat_kernels[MAX_GPUS] = { 0 };
 // Uncomment to compile the debug block in scanhash_x16s, which hashes the
 // current nonce on the CPU, logs it and returns -1.
 //#define _DEBUG
 #define _DEBUG_PREFIX "x16s-"
 #include "cuda_debug.cuh"
 /**
  * Scan a nonce range for X16S on the GPU mapped to thr_id.
  *
  * Each loop iteration hashes `throughput` nonces starting at work->data[19]:
  * the first algorithm of the hash order runs its 80-byte kernel, the remaining
  * 15 run their 64-byte kernels on d_hash[thr_id]. A candidate reported by
  * cuda_check_hash() is re-hashed on the CPU with x16s_hash() before it is
  * accepted.
  *
  * @param thr_id       miner thread index (selects the device and per-thread buffers)
  * @param work         job; data[19] is advanced as the nonce cursor, nonces[] and
  *                     valid_nonces receive the results
  * @param max_nonce    scanning stops once the cursor reaches this value
  * @param hashes_done  out: number of nonces covered by this call
  * @return the number of valid nonces found (1 or 2), 0 when the range is
  *         exhausted or a work restart is flagged, -1 when the first order digit
  *         maps to no known algorithm (or always, in _DEBUG builds).
  *         NOTE(review): CUDA_CALL_OR_RET_X presumably returns 0 on allocation
  *         failure -- confirm against its definition.
  */
 extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
 {
 	uint32_t *pdata = work->data;
 	uint32_t *ptarget = work->target;
 	const uint32_t first_nonce = pdata[19];
 	const int dev_id = device_map[thr_id];
 	int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 19;
 	if (strstr(device_name[dev_id], "GTX 1080")) intensity = 20;
 	uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
 	//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
 	// one-time per-thread setup: device selection, kernel init, output buffer
 	if (!init[thr_id])
 	{
 		cudaSetDevice(device_map[thr_id]);
 		if (opt_cudaschedule == -1 && gpu_threads == 1) {
 			cudaDeviceReset();
 			// reduce cpu usage
 			cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
 		}
 		gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
 		cuda_get_arch(thr_id);
 		use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
 		if (use_compat_kernels[thr_id])
 			x11_echo512_cpu_init(thr_id, throughput);
 		quark_blake512_cpu_init(thr_id, throughput);
 		quark_bmw512_cpu_init(thr_id, throughput);
 		quark_groestl512_cpu_init(thr_id, throughput);
 		quark_skein512_cpu_init(thr_id, throughput);
 		quark_jh512_cpu_init(thr_id, throughput);
 		quark_keccak512_cpu_init(thr_id, throughput);
 		qubit_luffa512_cpu_init(thr_id, throughput);
 		x11_luffa512_cpu_init(thr_id, throughput); // 64
 		x11_shavite512_cpu_init(thr_id, throughput);
 		x11_simd512_cpu_init(thr_id, throughput); // 64
 		x16_echo512_cuda_init(thr_id, throughput);
 		x13_hamsi512_cpu_init(thr_id, throughput);
 		x13_fugue512_cpu_init(thr_id, throughput);
 		x16_fugue512_cpu_init(thr_id, throughput);
 		x14_shabal512_cpu_init(thr_id, throughput);
 		x15_whirlpool_cpu_init(thr_id, throughput, 0);
 		x16_whirlpool512_init(thr_id, throughput);
 		x17_sha512_cpu_init(thr_id, throughput);
 		// 64 bytes (one 512-bit digest) per nonce
 		CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), 0);
 		cuda_check_cpu_init(thr_id, throughput);
 		init[thr_id] = true;
 	}
 	if (opt_benchmark) {
 		((uint32_t*)ptarget)[7] = 0x003f;
 		//((uint8_t*)pdata)[8] = 0x90; // hashOrder[0] = '9'; for simd 80 + blake512 64
 		//((uint8_t*)pdata)[8] = 0x0A; // hashOrder[0] = 'A'; for echo 64
 		//((uint8_t*)pdata)[8] = 0xB0; // hashOrder[0] = 'B'; for hamsi 80 + blake512 64
 		//((uint8_t*)pdata)[8] = 0xC0; // hashOrder[0] = 'C'; for fugue 80 + blake512 64
 		//((uint8_t*)pdata)[8] = 0xE0; // hashOrder[0] = 'E'; for whirlpool 80 + blake512 64
 	}
 	// big-endian copy of header words 0..18; word 19 (nonce) is filled in when validating
 	uint32_t _ALIGN(64) endiandata[20];
 	for (int k=0; k < 19; k++)
 		be32enc(&endiandata[k], pdata[k]);
 	// the hash order is recomputed only when the header time field changes
 	uint32_t ntime = swab32(pdata[17]);
 	if (s_ntime != ntime) {
 		getAlgoString(&endiandata[1], hashOrder);
 		s_ntime = ntime;
 		if (opt_debug && !thr_id) applog(LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime);
 	}
 	cuda_check_cpu_setTarget(ptarget);
 	// first digit of the order selects the 80-byte kernel; upload the header for it
 	char elem = hashOrder[0];
 	const uint8_t algo80 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
 	switch (algo80) {
 		case BLAKE:
 			quark_blake512_cpu_setBlock_80(thr_id, endiandata);
 			break;
 		case BMW:
 			quark_bmw512_cpu_setBlock_80(endiandata);
 			break;
 		case GROESTL:
 			groestl512_setBlock_80(thr_id, endiandata);
 			break;
 		case JH:
 			jh512_setBlock_80(thr_id, endiandata);
 			break;
 		case KECCAK:
 			keccak512_setBlock_80(thr_id, endiandata);
 			break;
 		case SKEIN:
 			skein512_cpu_setBlock_80((void*)endiandata);
 			break;
 		case LUFFA:
 			qubit_luffa512_cpu_setBlock_80((void*)endiandata);
 			break;
 		case CUBEHASH:
 			cubehash512_setBlock_80(thr_id, endiandata);
 			break;
 		case SHAVITE:
 			x11_shavite512_setBlock_80((void*)endiandata);
 			break;
 		case SIMD:
 			x16_simd512_setBlock_80((void*)endiandata);
 			break;
 		case ECHO:
 			x16_echo512_setBlock_80((void*)endiandata);
 			break;
 		case HAMSI:
 			x16_hamsi512_setBlock_80((void*)endiandata);
 			break;
 		case FUGUE:
 			// NOTE(review): fugue alone is given pdata rather than endiandata -- kept as in the original
 			x16_fugue512_setBlock_80((void*)pdata);
 			break;
 		case SHABAL:
 			x16_shabal512_setBlock_80((void*)endiandata);
 			break;
 		case WHIRLPOOL:
 			x16_whirlpool512_setBlock_80((void*)endiandata);
 			break;
 		case SHA512:
 			x16_sha512_setBlock_80(endiandata);
 			break;
 		default: {
 			return -1;
 		}
 	}
 	// warn: 0 = no pending CPU validation failure, 1 = one failure seen, retrying
 	int warn = 0;
 	do {
 		int order = 0;
 		// Hash with CUDA
 		switch (algo80) {
 			case BLAKE:
 				quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("blake80:");
 				break;
 			case BMW:
 				quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 				TRACE("bmw80  :");
 				break;
 			case GROESTL:
 				groestl512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("grstl80:");
 				break;
 			case JH:
 				jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("jh51280:");
 				break;
 			case KECCAK:
 				keccak512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("kecck80:");
 				break;
 			case SKEIN:
 				skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++;
 				TRACE("skein80:");
 				break;
 			case LUFFA:
 				qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 				TRACE("luffa80:");
 				break;
 			case CUBEHASH:
 				cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("cube 80:");
 				break;
 			case SHAVITE:
 				x11_shavite512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
 				TRACE("shavite:");
 				break;
 			case SIMD:
 				x16_simd512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("simd512:");
 				break;
 			case ECHO:
 				x16_echo512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("echo   :");
 				break;
 			case HAMSI:
 				x16_hamsi512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("hamsi  :");
 				break;
 			case FUGUE:
 				x16_fugue512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("fugue  :");
 				break;
 			case SHABAL:
 				x16_shabal512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("shabal :");
 				break;
 			case WHIRLPOOL:
 				x16_whirlpool512_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("whirl  :");
 				break;
 			case SHA512:
 				x16_sha512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("sha512 :");
 				break;
 		}
 		// remaining 15 rounds: 64-byte kernels working on d_hash
 		for (int i = 1; i < 16; i++)
 		{
 			const char elem = hashOrder[i];
 			const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
 			switch (algo64) {
 			case BLAKE:
 				quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("blake  :");
 				break;
 			case BMW:
 				quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("bmw    :");
 				break;
 			case GROESTL:
 				quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("groestl:");
 				break;
 			case JH:
 				quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("jh512  :");
 				break;
 			case KECCAK:
 				quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("keccak :");
 				break;
 			case SKEIN:
 				quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("skein  :");
 				break;
 			case LUFFA:
 				x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("luffa  :");
 				break;
 			case CUBEHASH:
 				x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("cube   :");
 				break;
 			case SHAVITE:
 				x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("shavite:");
 				break;
 			case SIMD:
 				x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("simd   :");
 				break;
 			case ECHO:
 				// braces keep the increment inside each branch so `order`
 				// advances exactly once whichever kernel is used
 				if (use_compat_kernels[thr_id]) {
 					x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				} else {
 					x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++;
 				}
 				TRACE("echo   :");
 				break;
 			case HAMSI:
 				x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("hamsi  :");
 				break;
 			case FUGUE:
 				x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("fugue  :");
 				break;
 			case SHABAL:
 				x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("shabal :");
 				break;
 			case WHIRLPOOL:
 				x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
 				TRACE("whirl  :");
 				break;
 			case SHA512:
 				x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
 				TRACE("sha512 :");
 				break;
 			}
 		}
 		*hashes_done = pdata[19] - first_nonce + throughput;
 		// UINT32_MAX means no candidate in this batch
 		work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
 #ifdef _DEBUG
 		uint32_t _ALIGN(64) dhash[8];
 		be32enc(&endiandata[19], pdata[19]);
 		x16s_hash(dhash, endiandata);
 		applog_hash(dhash);
 		return -1;
 #endif
 		if (work->nonces[0] != UINT32_MAX)
 		{
 			// re-hash the candidate on the CPU before trusting the GPU result
 			const uint32_t Htarg = ptarget[7];
 			uint32_t _ALIGN(64) vhash[8];
 			be32enc(&endiandata[19], work->nonces[0]);
 			x16s_hash(vhash, endiandata);
 			if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
 				work->valid_nonces = 1;
 				// look for a second candidate in the same batch
 				work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
 				work_set_target_ratio(work, vhash);
 				if (work->nonces[1] != 0) {
 					be32enc(&endiandata[19], work->nonces[1]);
 					x16s_hash(vhash, endiandata);
 					bn_set_target_ratio(work, vhash, 1);
 					work->valid_nonces++;
 					pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
 				} else {
 					pdata[19] = work->nonces[0] + 1; // cursor
 				}
 				//gpulog(LOG_INFO, thr_id, "hash found with %s 80 (%s)!", algo_strings[algo80], hashOrder);
 				return work->valid_nonces;
 			}
 			else if (vhash[7] > Htarg) {
 				// x11+ coins could do some random error, but not on retry
 				gpu_increment_reject(thr_id);
 				if (!warn) {
 					// first failure: resume silently just after the bad nonce
 					warn++;
 					pdata[19] = work->nonces[0] + 1;
 					continue;
 				} else {
 					if (!opt_quiet)	gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU! %s %s",
 						work->nonces[0], algo_strings[algo80], hashOrder);
 					warn = 0;
 				}
 			}
 		}
 		// 64-bit sum avoids wrap-around when the cursor is near UINT32_MAX
 		if ((uint64_t)throughput + pdata[19] >= max_nonce) {
 			pdata[19] = max_nonce;
 			break;
 		}
 		pdata[19] += throughput;
 	} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
 	*hashes_done = pdata[19] - first_nonce;
 	return 0;
 }
 /**
  * cleanup: release the GPU resources held by scanhash_x16s for this thread.
  *
  * Does nothing when the thread was never initialised. Otherwise waits for the
  * device to go idle, frees the hash buffer and the per-algorithm resources
  * that have a free routine, and clears the init flag so that a later
  * scanhash_x16s() call sets everything up again.
  *
  * @param thr_id  miner thread index
  */
 extern "C" void free_x16s(int thr_id)
 {
 	if (!init[thr_id])
 		return;
 	// cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its replacement
 	cudaDeviceSynchronize();
 	cudaFree(d_hash[thr_id]);
 	quark_blake512_cpu_free(thr_id);
 	quark_groestl512_cpu_free(thr_id);
 	x11_simd512_cpu_free(thr_id);
 	x13_fugue512_cpu_free(thr_id);
 	x16_fugue512_cpu_free(thr_id); // to merge with x13_fugue512 ?
 	x15_whirlpool_cpu_free(thr_id);
 	cuda_check_cpu_free(thr_id);
 	cudaDeviceSynchronize();
 	init[thr_id] = false;
 }
--- a/x17/cuda_x17_sha512.cu
+++ b/x17/cuda_x17_sha512.cu
@ -169,3 +169,80 @@ void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce,
 	x17_sha512_gpu_hash_64 <<<grid, block>>> (threads, (uint64_t*)d_hash);
 }
 // 80-byte input message (ten 64-bit words) held in constant memory.
 // Written by x16_sha512_setBlock_80() and read by x16_sha512_gpu_hash_80().
 __constant__
 static uint64_t c_PaddedMessage80[10];
 // SHA-512 (going by the function name and the IV constants) of an 80-byte message,
 // one hash per thread.
 // Every thread whose index is below `threads` takes the ten 64-bit words of
 // c_PaddedMessage80, patches word 9 with cuda_swab32(startNonce + index) through
 // REPLACE_HIDWORD, and stores eight 64-bit output words at g_hash[index * 8 .. index * 8 + 7].
 __global__
 /*__launch_bounds__(256, 4)*/
 void x16_sha512_gpu_hash_80(const uint32_t threads, const uint32_t startNonce, uint64_t *g_hash)
 {
 	const uint32_t gid = (blockDim.x * blockIdx.x + threadIdx.x);
 	// indices at or beyond `threads` do nothing
 	if (gid >= threads)
 		return;
 	const uint32_t nonce = startNonce + gid;
 	uint64_t W[80];
 	// words 0..8 come straight from the constant message, byte-swapped
 	#pragma unroll
 	for (int i = 0; i < 9; i++) {
 		W[i] = SWAP64(c_PaddedMessage80[i]);
 	}
 	// word 9 carries the per-thread nonce
 	const uint64_t nonceWord = REPLACE_HIDWORD(c_PaddedMessage80[9], cuda_swab32(nonce));
 	W[9] = cuda_swab64(nonceWord);
 	// padding: top bit set, zero words, then 0x280 (= 640 bits = 80 bytes) as the length
 	W[10] = 0x8000000000000000;
 	W[11] = 0U;
 	W[12] = 0U;
 	W[13] = 0U;
 	W[14] = 0U;
 	W[15] = 0x0000000000000280;
 	// expand the 16 input words to the 80-word schedule
 	#pragma unroll 64
 	for (int i = 16; i < 80; i++) {
 		const uint64_t schedNear = SSG5_1(W[i-2]) + W[i-7];
 		const uint64_t schedFar = SSG5_0(W[i-15]) + W[i-16];
 		W[i] = schedNear + schedFar;
 	}
 	const uint64_t IV512[8] = {
 		0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
 		0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
 		0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
 		0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
 	};
 	// working state starts as a copy of the IV
 	uint64_t r[8];
 	#pragma unroll
 	for (int i = 0; i < 8; i++) {
 		r[i] = IV512[i];
 	}
 	// 80 rounds; the macro receives the round index and its position modulo 8
 	#pragma unroll
 	for (int i = 0; i < 80; i++) {
 		SHA3_STEP(c_WB, r, W, i&7, i);
 	}
 	// add the IV back in and store the byte-swapped result, 8 words per thread
 	uint64_t *pHash = g_hash + ((uint64_t)gid << 3);
 	#pragma unroll
 	for (int u = 0; u < 8; u++) {
 		pHash[u] = SWAP64(r[u] + IV512[u]);
 	}
 }
 // Host-side launcher for x16_sha512_gpu_hash_80.
 // Starts enough 256-thread blocks to cover `threads` work items and passes d_hash
 // to the kernel as a uint64_t pointer. thr_id is not used by this function.
 __host__
 void x16_sha512_cuda_hash_80(int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash)
 {
 	const uint32_t tpb = 256;
 	// round up so a partial last block is still launched
 	const uint32_t numBlocks = (threads + tpb - 1) / tpb;
 	const dim3 grid(numBlocks);
 	const dim3 block(tpb);
 	uint64_t *d_hash64 = (uint64_t*)d_hash;
 	x16_sha512_gpu_hash_80 <<<grid, block>>> (threads, startNounce, d_hash64);
 }
 // Copy the message to hash from host memory into the c_PaddedMessage80 constant.
 // pdata must point to at least sizeof(c_PaddedMessage80) (80) readable host bytes.
 // The status returned by cudaMemcpyToSymbol is not inspected.
 __host__
 void x16_sha512_setBlock_80(void *pdata)
 {
 	const size_t messageBytes = sizeof(c_PaddedMessage80);
 	cudaMemcpyToSymbol(c_PaddedMessage80, pdata, messageBytes, 0, cudaMemcpyHostToDevice);
 }