upstream 2.3.1

master
R4SAS 5 years ago
commit 7e513867b1
  1. Makefile.am (17)
  2. README.txt (44)
  3. algos.h (47)
  4. api.cpp (2)
  5. bench.cpp (25)
  6. blake2b.cu (273)
  7. ccminer.cpp (118)
  8. ccminer.vcxproj (17)
  9. ccminer.vcxproj.filters (50)
  10. compat/ccminer-config.h (2)
  11. configure.ac (2)
  12. crypto/cn_aes.cuh (1)
  13. crypto/cn_blake.cuh (2)
  14. crypto/cn_groestl.cuh (9)
  15. crypto/cn_jh.cuh (20)
  16. crypto/cn_keccak.cuh (2)
  17. crypto/cn_skein.cuh (22)
  18. crypto/cryptolight-core.cu (86)
  19. crypto/cryptolight-cpu.cpp (47)
  20. crypto/cryptolight.cu (30)
  21. crypto/cryptolight.h (13)
  22. crypto/cryptonight-core.cu (236)
  23. crypto/cryptonight-cpu.cpp (70)
  24. crypto/cryptonight-extra.cu (175)
  25. crypto/cryptonight.cu (37)
  26. crypto/cryptonight.h (13)
  27. crypto/xmr-rpc.cpp (10)
  28. equi/equi-stratum.cpp (2)
  29. equi/equihash.cpp (3)
  30. lyra2/Lyra2.c (173)
  31. lyra2/Lyra2.h (1)
  32. lyra2/allium.cu (217)
  33. lyra2/cuda_lyra2.cu (148)
  34. lyra2/cuda_lyra2_sm2.cuh (69)
  35. lyra2/cuda_lyra2_sm5.cuh (76)
  36. lyra2/cuda_lyra2v3.cu (481)
  37. lyra2/cuda_lyra2v3_sm3.cuh (348)
  38. lyra2/lyra2RE.cu (4)
  39. lyra2/lyra2REv3.cu (183)
  40. miner.h (38)
  41. neoscrypt/cuda_neoscrypt.cu (6)
  42. phi/cuda_phi2.cu (89)
  43. phi/cuda_phi2_cubehash512.cu (319)
  44. phi/phi.cu (8)
  45. phi/phi2.cu (268)
  46. res/ccminer.rc (10)
  47. scrypt.cpp (14)
  48. scrypt/test_kernel.cu (5)
  49. scrypt/titan_kernel.cu (4)
  50. sha256/cuda_sha256q.cu (507)
  51. sha256/sha256q.cu (136)
  52. sia/sia-rpc.cpp (8)
  53. sia/sia.cu (18)
  54. util.cpp (51)
  55. x11/cuda_streebog_maxwell.cu (21)
  56. x11/cuda_x11_cubehash512.cu (54)
  57. x11/exosis.cu (497)
  58. x11/timetravel.cu (67)
  59. x12/x12.cu (20)
  60. x16/cuda_x16_echo512_64.cu (26)
  61. x16/x16r.cu (5)
  62. x16/x16s.cu (5)
  63. x17/sonoa.cu (632)
  64. x17/x17.cu (18)

Makefile.am (17)

@@ -38,13 +38,16 @@ ccminer_SOURCES = elist.h miner.h compat.h \
lyra2/Lyra2.c lyra2/Sponge.c \
lyra2/lyra2RE.cu lyra2/cuda_lyra2.cu \
lyra2/lyra2REv2.cu lyra2/cuda_lyra2v2.cu \
lyra2/lyra2REv3.cu lyra2/cuda_lyra2v3.cu \
lyra2/Lyra2Z.c lyra2/lyra2Z.cu lyra2/cuda_lyra2Z.cu \
lyra2/allium.cu \
Algo256/cuda_bmw256.cu Algo256/cuda_cubehash256.cu \
Algo256/cuda_blake256.cu Algo256/cuda_groestl256.cu \
Algo256/cuda_keccak256_sm3.cu Algo256/cuda_keccak256.cu Algo256/cuda_skein256.cu \
Algo256/blake256.cu Algo256/decred.cu Algo256/vanilla.cu Algo256/keccak256.cu \
Algo256/blake2s.cu sph/blake2s.c \
Algo256/bmw.cu Algo256/cuda_bmw.cu \
blake2b.cu \
crypto/xmr-rpc.cpp crypto/wildkeccak-cpu.cpp crypto/wildkeccak.cu \
crypto/cryptolight.cu crypto/cryptolight-core.cu crypto/cryptolight-cpu.cpp \
crypto/cryptonight.cu crypto/cryptonight-core.cu crypto/cryptonight-extra.cu \
@@ -58,7 +61,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
neoscrypt/neoscrypt.cpp neoscrypt/neoscrypt-cpu.c neoscrypt/cuda_neoscrypt.cu \
pentablake.cu skein.cu cuda_skeincoin.cu skein2.cpp zr5.cu \
skunk/skunk.cu skunk/cuda_skunk.cu skunk/cuda_skunk_streebog.cu \
sha256/sha256d.cu sha256/cuda_sha256d.cu sha256/sha256t.cu sha256/cuda_sha256t.cu \
sha256/sha256d.cu sha256/cuda_sha256d.cu sha256/sha256t.cu sha256/cuda_sha256t.cu sha256/sha256q.cu sha256/cuda_sha256q.cu \
sia/sia.cu sia/sia-rpc.cpp sph/blake2b.c \
sph/bmw.c sph/blake.c sph/groestl.c sph/jh.c sph/keccak.c sph/skein.c \
sph/cubehash.c sph/echo.c sph/luffa.c sph/sha2.c sph/shavite.c sph/simd.c \
@@ -70,7 +73,7 @@ ccminer_SOURCES = elist.h miner.h compat.h \
qubit/qubit.cu qubit/qubit_luffa512.cu qubit/deep.cu qubit/luffa.cu \
tribus/tribus.cu tribus/cuda_echo512_final.cu \
x11/x11.cu x12/x12.cu x11/fresh.cu x11/cuda_x11_luffa512.cu x11/cuda_x11_cubehash512.cu \
x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu \
x11/cuda_x11_shavite512.cu x11/cuda_x11_simd512.cu x11/cuda_x11_echo.cu x11/exosis.cu \
x11/cuda_x11_luffa512_Cubehash.cu x11/x11evo.cu x11/timetravel.cu x11/bitcore.cu \
x13/x13.cu x13/cuda_x13_hamsi512.cu x13/cuda_x13_fugue512.cu \
x13/hsr.cu x13/cuda_hsr_sm3.cu x13/sm3.c \
@@ -79,8 +82,8 @@ ccminer_SOURCES = elist.h miner.h compat.h \
x16/x16r.cu x16/x16s.cu x16/cuda_x16_echo512.cu x16/cuda_x16_fugue512.cu \
x16/cuda_x16_shabal512.cu x16/cuda_x16_simd512_80.cu \
x16/cuda_x16_echo512_64.cu \
x17/x17.cu x17/hmq17.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \
x11/phi.cu x11/cuda_streebog_maxwell.cu \
x17/x17.cu x17/hmq17.cu x17/sonoa.cu x17/cuda_x17_haval256.cu x17/cuda_x17_sha512.cu \
phi/phi.cu phi/phi2.cu phi/cuda_phi2.cu phi/cuda_phi2_cubehash512.cu x11/cuda_streebog_maxwell.cu \
x11/c11.cu x11/s3.cu x11/sib.cu x11/veltor.cu x11/cuda_streebog.cu \
gost/gost.cu gost/cuda_gosthash.cu
@@ -116,9 +119,11 @@ endif
ccminer_LDADD += -lcuda
nvcc_ARCH :=
#nvcc_ARCH += -gencode=arch=compute_61,code=\"sm_61,compute_61\"
#nvcc_ARCH += -gencode=arch=compute_75,code=\"sm_75,compute_75\" # CUDA 10 req.
#nvcc_ARCH += -gencode=arch=compute_70,code=\"sm_70,compute_70\" # CUDA 9.1
#nvcc_ARCH += -gencode=arch=compute_61,code=\"sm_61,compute_61\" # CUDA 8
nvcc_ARCH += -gencode=arch=compute_52,code=\"sm_52,compute_52\"
nvcc_ARCH += -gencode=arch=compute_50,code=\"sm_50,compute_50\"
#nvcc_ARCH += -gencode=arch=compute_50,code=\"sm_50,compute_50\"
#nvcc_ARCH += -gencode=arch=compute_35,code=\"sm_35,compute_35\"
#nvcc_ARCH += -gencode=arch=compute_30,code=\"sm_30,compute_30\"

README.txt (44)

@@ -1,5 +1,5 @@
ccminer 2.2.5 (Apr 2018) "x12, x16r and x16s algos"
ccminer 2.3.1 "lyra2v3, exosis and sha256q"
---------------------------------------------------------------
***************************************************************
@@ -41,19 +41,20 @@ Keccak (Maxcoin)
Pentablake (Blake 512 x5)
1Coin Triple S
Neoscrypt (FeatherCoin)
Revolver (X11evo)
x11evo (Revolver)
phi2 (LUXCoin)
Scrypt and Scrypt:N
Scrypt-Jane (Chacha)
Sibcoin (sib)
sib (Sibcoin)
Skein (Skein + SHA)
Signatum (Skein cubehash fugue Streebog)
SonoA (Sono)
Tribus (JH, keccak, simd)
Woodcoin (Double Skein)
Vanilla (Blake256 8-rounds - double sha256)
Vertcoin Lyra2RE
Ziftrcoin (ZR5)
Vertcoin Lyra2REv3
Boolberry (Wild Keccak)
Monero (Cryptonight)
Monero (Cryptonight v7 with -a monero)
Aeon (Cryptonight-lite)
where some of these coins have a VERY NOTABLE nVidia advantage
@@ -73,19 +74,21 @@ This code is based on the pooler cpuminer and inherits
its command line interface and options.
-a, --algo=ALGO specify the algorithm to use
allium use to mine Garlic
bastion use to mine Joincoin
bitcore use to mine Bitcore's Timetravel10
blake use to mine Saffroncoin (Blake256)
blakecoin use to mine Old Blake 256
blake2s use to mine Nevacoin (Blake2-S 256)
bmw use to mine Midnight
cryptolight use to mine AEON cryptonight (MEM/2)
cryptonight use to mine XMR cryptonight, Bytecoin, Dash, DigitalNote, etc
cryptolight use to mine AEON cryptonight variant 1 (MEM/2)
cryptonight use to mine original cryptonight
c11/flax use to mine Chaincoin and Flax
decred use to mine Decred 180 bytes Blake256-14
deep use to mine Deepcoin
dmd-gr use to mine Diamond-Groestl
equihash use to mine ZEC, HUSH and KMD
exosis use to mine EXO
fresh use to mine Freshcoin
fugue256 use to mine Fuguecoin
groestl use to mine Groestlcoin
@@ -96,13 +99,16 @@ its command line interface and options.
lbry use to mine LBRY Credits
luffa use to mine Joincoin
lyra2 use to mine CryptoCoin
lyra2v2 use to mine Vertcoin
lyra2v2 use to mine Monacoin
lyra2v3 use to mine Vertcoin
lyra2z use to mine Zerocoin (XZC)
monero use to mine Monero (XMR)
myr-gr use to mine Myriad-Groest
neoscrypt use to mine FeatherCoin, Trezarcoin, Orbitcoin, etc
nist5 use to mine TalkCoin
penta use to mine Joincoin / Pentablake
phi use to mine LUXCoin
phi1612 use to mine Seraph
phi2 use to mine LUXCoin
polytimos use to mine Polytimos
quark use to mine Quarkcoin
qubit use to mine Qubit
@@ -111,11 +117,14 @@ its command line interface and options.
scrypt-jane use to mine Chacha coins like Cache and Ultracoin
s3 use to mine 1coin (ONE)
sha256t use to mine OneCoin (OC)
sha256q use to mine Pyrite
sia use to mine SIA
sib use to mine Sibcoin
skein use to mine Skeincoin
skein2 use to mine Woodcoin
skunk use to mine Signatum
sonoa use to mine Sono
stellite use to mine Stellite (a cryptonight variant)
timetravel use to mine MachineCoin
tribus use to mine Denarius
x11evo use to mine Revolver
@@ -281,6 +290,21 @@ so we can more efficiently implement new algorithms using the latest hardware
features.
>>> RELEASE HISTORY <<<
Jan. 30th 2019 v2.3.1
Handle Lyra2v3 algo
Handle sha256q algo
Handle exosis algo
Handle blake2b standard algo
June 23th 2018 v2.3
Handle phi2 header variation for smart contracts
Handle monero, stellite, graft and cryptolight variants
Handle SonoA algo
June 10th 2018 v2.2.6
New phi2 algo for LUX
New allium algo for Garlic
Apr. 02nd 2018 v2.2.5
New x16r algo for Raven
New x16s algo for Pigeon and Eden

algos.h (47)

@@ -7,7 +7,9 @@
enum sha_algos {
ALGO_BLAKECOIN = 0,
ALGO_BLAKE,
ALGO_BLAKE2B,
ALGO_BLAKE2S,
ALGO_ALLIUM,
ALGO_BMW,
ALGO_BASTION,
ALGO_C11,
@@ -17,6 +19,7 @@ enum sha_algos {
ALGO_DECRED,
ALGO_DMD_GR,
ALGO_EQUIHASH,
ALGO_EXOSIS,
ALGO_FRESH,
ALGO_FUGUE256, /* Fugue256 */
ALGO_GOSTD,
@@ -33,6 +36,7 @@ enum sha_algos {
ALGO_LUFFA,
ALGO_LYRA2,
ALGO_LYRA2v2,
ALGO_LYRA2v3,
ALGO_LYRA2Z,
ALGO_MJOLLNIR, /* Hefty hash */
ALGO_MYR_GR,
@@ -40,6 +44,7 @@ enum sha_algos {
ALGO_NIST5,
ALGO_PENTABLAKE,
ALGO_PHI,
ALGO_PHI2,
ALGO_POLYTIMOS,
ALGO_QUARK,
ALGO_QUBIT,
@@ -47,11 +52,13 @@ enum sha_algos {
ALGO_SCRYPT_JANE,
ALGO_SHA256D,
ALGO_SHA256T,
ALGO_SHA256Q,
ALGO_SIA,
ALGO_SIB,
ALGO_SKEIN,
ALGO_SKEIN2,
ALGO_SKUNK,
ALGO_SONOA,
ALGO_S3,
ALGO_TIMETRAVEL,
ALGO_TRIBUS,
@@ -72,6 +79,9 @@ enum sha_algos {
ALGO_WHIRLPOOLX,
ALGO_WILDKECCAK,
ALGO_ZR5,
ALGO_MONERO,
ALGO_GRAFT,
ALGO_STELLITE,
ALGO_AUTO,
ALGO_COUNT
};
@@ -81,7 +91,9 @@ extern volatile enum sha_algos opt_algo;
static const char *algo_names[] = {
"blakecoin",
"blake",
"blake2b",
"blake2s",
"allium",
"bmw",
"bastion",
"c11",
@@ -91,6 +103,7 @@ static const char *algo_names[] = {
"decred",
"dmd-gr",
"equihash",
"exosis",
"fresh",
"fugue256",
"gostd",
@@ -107,6 +120,7 @@ static const char *algo_names[] = {
"luffa",
"lyra2",
"lyra2v2",
"lyra2v3",
"lyra2z",
"mjollnir",
"myr-gr",
@@ -114,6 +128,7 @@ static const char *algo_names[] = {
"nist5",
"penta",
"phi",
"phi2",
"polytimos",
"quark",
"qubit",
@@ -121,11 +136,13 @@ static const char *algo_names[] = {
"scrypt-jane",
"sha256d",
"sha256t",
"sha256q",
"sia",
"sib",
"skein",
"skein2",
"skunk",
"sonoa",
"s3",
"timetravel",
"tribus",
@@ -146,6 +163,9 @@ static const char *algo_names[] = {
"whirlpoolx",
"wildkeccak",
"zr5",
"monero",
"graft",
"stellite",
"auto", /* reserved for multi algo */
""
};
@@ -185,6 +205,8 @@ static inline int algo_to_int(char* arg)
i = ALGO_LYRA2;
else if (!strcasecmp("lyra2rev2", arg))
i = ALGO_LYRA2v2;
else if (!strcasecmp("lyra2rev3", arg))
i = ALGO_LYRA2v3;
else if (!strcasecmp("phi1612", arg))
i = ALGO_PHI;
else if (!strcasecmp("bitcoin", arg))
@@ -210,4 +232,29 @@ static inline int algo_to_int(char* arg)
return i;
}
static inline int get_cryptonight_algo(int fork)
{
int algo = ALGO_COUNT;
switch (fork) {
case 8:
algo = ALGO_GRAFT;
break;
case 7:
algo = ALGO_MONERO;
break;
case 3:
algo = ALGO_STELLITE;
break;
default:
algo = ALGO_CRYPTONIGHT;
break;
}
return algo;
}
#endif
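The new get_cryptonight_algo() helper maps a hard-fork base version back to the matching variant enum, and get_currentalgo() in ccminer.cpp uses it to report the active variant name. A minimal sketch of the mapping, assuming algos.h from this commit resolves its own includes (the fork values 1/3/7/8 are the ones wired up later in bench.cpp and parse_arg):

#include <stdio.h>
#include "algos.h" // get_cryptonight_algo() and algo_names[]

int main(void)
{
	const int forks[] = { 1, 3, 7, 8 };
	for (int i = 0; i < 4; i++) {
		const int algo = get_cryptonight_algo(forks[i]);
		// expected names: cryptonight, stellite, monero, graft
		printf("fork %d -> %s\n", forks[i], algo_names[algo]);
	}
	return 0;
}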

api.cpp (2)

@@ -257,7 +257,7 @@ static char *getpoolnfo(char *params)
static void gpuhwinfos(int gpu_id)
{
char buf[256];
char buf[512];
char pstate[8];
char* card;
struct cgpu_info *cgpu = NULL;

bench.cpp (25)

@@ -49,9 +49,11 @@ void bench_free()
void algo_free_all(int thr_id)
{
// only initialized algos will be freed
free_allium(thr_id);
free_bastion(thr_id);
free_bitcore(thr_id);
free_blake256(thr_id);
free_blake2b(thr_id);
free_blake2s(thr_id);
free_bmw(thr_id);
free_c11(thr_id);
@@ -60,9 +62,11 @@ void algo_free_all(int thr_id)
free_decred(thr_id);
free_deep(thr_id);
free_equihash(thr_id);
free_exosis(thr_id);
free_keccak256(thr_id);
free_fresh(thr_id);
free_fugue256(thr_id);
free_gostd(thr_id);
free_groestlcoin(thr_id);
#ifdef WITH_HEAVY_ALGO
free_heavy(thr_id);
@@ -75,12 +79,14 @@ void algo_free_all(int thr_id)
free_luffa(thr_id);
free_lyra2(thr_id);
free_lyra2v2(thr_id);
free_lyra2v3(thr_id);
free_lyra2Z(thr_id);
free_myriad(thr_id);
free_neoscrypt(thr_id);
free_nist5(thr_id);
free_pentablake(thr_id);
free_phi(thr_id);
free_phi2(thr_id);
free_polytimos(thr_id);
free_quark(thr_id);
free_qubit(thr_id);
@@ -89,9 +95,10 @@ void algo_free_all(int thr_id)
free_skunk(thr_id);
free_sha256d(thr_id);
free_sha256t(thr_id);
free_gostd(thr_id);
free_sha256q(thr_id);
free_sia(thr_id);
free_sib(thr_id);
free_sonoa(thr_id);
free_s3(thr_id);
free_vanilla(thr_id);
free_veltor(thr_id);
@@ -154,6 +161,22 @@ bool bench_algo_switch_next(int thr_id)
if (algo == ALGO_SCRYPT) algo++;
if (algo == ALGO_SCRYPT_JANE) algo++;
// Set cryptonight variant
switch (algo) {
case ALGO_MONERO:
cryptonight_fork = 7;
break;
case ALGO_GRAFT:
cryptonight_fork = 8;
break;
case ALGO_STELLITE:
cryptonight_fork = 3;
break;
case ALGO_CRYPTONIGHT:
cryptonight_fork = 1;
break;
}
// free current algo memory and track mem usage
mused = cuda_available_memory(thr_id);
algo_free_all(thr_id);

blake2b.cu (273)

@@ -0,0 +1,273 @@
/**
* Blake2-B CUDA Implementation
*
* tpruvot@github July 2016
*
*/
#include <miner.h>
#include <string.h>
#include <stdint.h>
#include <sph/blake2b.h>
#include <cuda_helper.h>
#include <cuda_vector_uint2x4.h>
#define TPB 512
#define NBN 2
static uint32_t *d_resNonces[MAX_GPUS];
__device__ uint64_t d_data[10];
static __constant__ const int8_t blake2b_sigma[12][16] = {
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 } ,
{ 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4 } ,
{ 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8 } ,
{ 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13 } ,
{ 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9 } ,
{ 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11 } ,
{ 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10 } ,
{ 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5 } ,
{ 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0 } ,
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 } ,
{ 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }
};
// host mem align
#define A 64
extern "C" void blake2b_hash(void *output, const void *input)
{
uint8_t _ALIGN(A) hash[32];
blake2b_ctx ctx;
blake2b_init(&ctx, 32, NULL, 0);
blake2b_update(&ctx, input, 80);
blake2b_final(&ctx, hash);
memcpy(output, hash, 32);
}
// ----------------------------------------------------------------
__device__ __forceinline__
static void G(const int r, const int i, uint64_t &a, uint64_t &b, uint64_t &c, uint64_t &d, uint64_t const m[16])
{
a = a + b + m[ blake2b_sigma[r][2*i] ];
((uint2*)&d)[0] = SWAPUINT2( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] );
c = c + d;
((uint2*)&b)[0] = ROR24( ((uint2*)&b)[0] ^ ((uint2*)&c)[0] );
a = a + b + m[ blake2b_sigma[r][2*i+1] ];
((uint2*)&d)[0] = ROR16( ((uint2*)&d)[0] ^ ((uint2*)&a)[0] );
c = c + d;
((uint2*)&b)[0] = ROR2( ((uint2*)&b)[0] ^ ((uint2*)&c)[0], 63U);
}
#define ROUND(r) \
G(r, 0, v[0], v[4], v[ 8], v[12], m); \
G(r, 1, v[1], v[5], v[ 9], v[13], m); \
G(r, 2, v[2], v[6], v[10], v[14], m); \
G(r, 3, v[3], v[7], v[11], v[15], m); \
G(r, 4, v[0], v[5], v[10], v[15], m); \
G(r, 5, v[1], v[6], v[11], v[12], m); \
G(r, 6, v[2], v[7], v[ 8], v[13], m); \
G(r, 7, v[3], v[4], v[ 9], v[14], m);
__global__
//__launch_bounds__(128, 8) /* to force 64 regs */
void blake2b_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint2 target2)
{
const uint32_t nonce = (blockDim.x * blockIdx.x + threadIdx.x) + startNonce;
uint64_t m[16];
m[0] = d_data[0];
m[1] = d_data[1];
m[2] = d_data[2];
m[3] = d_data[3];
m[4] = d_data[4];
m[5] = d_data[5];
m[6] = d_data[6];
m[7] = d_data[7];
m[8] = d_data[8];
((uint32_t*)m)[18] = AS_U32(&d_data[9]);
((uint32_t*)m)[19] = nonce;
m[10] = m[11] = 0;
m[12] = m[13] = 0;
m[14] = m[15] = 0;
uint64_t v[16] = {
0x6a09e667f2bdc928, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
0x510e527fade682d1, 0x9b05688c2b3e6c1f, 0x1f83d9abfb41bd6b, 0x5be0cd19137e2179,
0x6a09e667f3bcc908, 0xbb67ae8584caa73b, 0x3c6ef372fe94f82b, 0xa54ff53a5f1d36f1,
0x510e527fade68281, 0x9b05688c2b3e6c1f, 0xe07c265404be4294, 0x5be0cd19137e2179
};
ROUND( 0);
ROUND( 1);
ROUND( 2);
ROUND( 3);
ROUND( 4);
ROUND( 5);
ROUND( 6);
ROUND( 7);
ROUND( 8);
ROUND( 9);
ROUND(10);
ROUND(11);
uint2 last = vectorize(v[3] ^ v[11] ^ 0xa54ff53a5f1d36f1);
if (last.y <= target2.y && last.x <= target2.x) {
resNonce[1] = resNonce[0];
resNonce[0] = nonce;
}
}
__host__
uint32_t blake2b_hash_cuda(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint2 target2, uint32_t &secNonce)
{
uint32_t resNonces[NBN] = { UINT32_MAX, UINT32_MAX };
uint32_t result = UINT32_MAX;
dim3 grid((threads + TPB-1)/TPB);
dim3 block(TPB);
/* Check error on Ctrl+C or kill to prevent segfaults on exit */
if (cudaMemset(d_resNonces[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess)
return result;
blake2b_gpu_hash <<<grid, block, 8>>> (threads, startNonce, d_resNonces[thr_id], target2);
cudaThreadSynchronize();
if (cudaSuccess == cudaMemcpy(resNonces, d_resNonces[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
result = resNonces[0];
secNonce = resNonces[1];
if (secNonce == result) secNonce = UINT32_MAX;
}
return result;
}
__host__
void blake2b_setBlock(uint32_t *data)
{
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_data, data, 80, 0, cudaMemcpyHostToDevice));
}
static bool init[MAX_GPUS] = { 0 };
int scanhash_blake2b(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t _ALIGN(A) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
int dev_id = device_map[thr_id];
int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 28 : 25;
if (device_sm[dev_id] >= 520 && is_windows()) intensity = 26;
if (device_sm[dev_id] < 350) intensity = 22;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
if (!init[thr_id])
{
cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage (linux)
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
CUDA_LOG_ERROR();
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
CUDA_CALL_OR_RET_X(cudaMalloc(&d_resNonces[thr_id], NBN * sizeof(uint32_t)), -1);
init[thr_id] = true;
}
for (int i=0; i < 20; i++)
be32enc(&endiandata[i], pdata[i]);
const uint2 target = make_uint2(ptarget[6], ptarget[7]);
blake2b_setBlock(endiandata);
do {
work->nonces[0] = blake2b_hash_cuda(thr_id, throughput, pdata[19], target, work->nonces[1]);
*hashes_done = pdata[19] - first_nonce + throughput;
if (work->nonces[0] != UINT32_MAX)
{
const uint32_t Htarg = ptarget[7];
uint32_t _ALIGN(A) vhash[8];
work->valid_nonces = 0;
endiandata[19] = work->nonces[0];
blake2b_hash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
work_set_target_ratio(work, vhash);
work->valid_nonces++;
pdata[19] = work->nonces[0] + 1;
} else {
gpu_increment_reject(thr_id);
}
if (work->nonces[1] != UINT32_MAX) {
endiandata[19] = work->nonces[1];
blake2b_hash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
if (bn_hash_target_ratio(vhash, ptarget) > work->shareratio[0]) {
work->sharediff[1] = work->sharediff[0];
work->shareratio[1] = work->shareratio[0];
xchg(work->nonces[1], work->nonces[0]);
work_set_target_ratio(work, vhash);
} else {
bn_set_target_ratio(work, vhash, 1);
}
work->valid_nonces++;
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1; // next scan start
} else {
gpu_increment_reject(thr_id);
}
}
if (work->valid_nonces) {
work->nonces[0] = cuda_swab32(work->nonces[0]);
work->nonces[1] = cuda_swab32(work->nonces[1]);
return work->valid_nonces;
}
}
if ((uint64_t) throughput + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}
pdata[19] += throughput;
} while (!work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce;
return 0;
}
// cleanup
extern "C" void free_blake2b(int thr_id)
{
if (!init[thr_id])
return;
//cudaThreadSynchronize();
cudaFree(d_resNonces[thr_id]);
init[thr_id] = false;
cudaDeviceSynchronize();
}
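For reference, the uint2-based G() in this new file is the standard BLAKE2b mixing function: SWAPUINT2, ROR24, ROR16 and ROR2(x, 63U) correspond to 64-bit rotations by 32, 24, 16 and 63 bits performed on the two 32-bit halves. A plain scalar sketch of the same step, where rotr64 is a hypothetical helper (not part of this file) and blake2b_sigma is the table defined at the top of blake2b.cu:

static inline uint64_t rotr64(const uint64_t x, const unsigned n)
{
	return (x >> n) | (x << (64 - n));
}

// scalar equivalent of the device G() in blake2b.cu
static void blake2b_g_ref(int r, int i, uint64_t *a, uint64_t *b, uint64_t *c, uint64_t *d, const uint64_t m[16])
{
	*a = *a + *b + m[blake2b_sigma[r][2*i]];
	*d = rotr64(*d ^ *a, 32); // SWAPUINT2
	*c = *c + *d;
	*b = rotr64(*b ^ *c, 24); // ROR24
	*a = *a + *b + m[blake2b_sigma[r][2*i+1]];
	*d = rotr64(*d ^ *a, 16); // ROR16
	*c = *c + *d;
	*b = rotr64(*b ^ *c, 63); // ROR2(x, 63U)
}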

ccminer.cpp (118)

@@ -103,6 +103,7 @@ bool submit_old = false;
bool use_syslog = false;
bool use_colors = true;
int use_pok = 0;
int use_roots = 0;
static bool opt_background = false;
bool opt_quiet = false;
int opt_maxlograte = 3;
@@ -232,27 +233,33 @@ int opt_api_mcast_port = 4068;
bool opt_stratum_stats = false;
int cryptonight_fork = 1;
static char const usage[] = "\
Usage: " PROGRAM_NAME " [OPTIONS]\n\
Options:\n\
-a, --algo=ALGO specify the hash algorithm to use\n\
allium Garlic double lyra2\n\
bastion Hefty bastion\n\
bitcore Timetravel-10\n\
blake Blake 256 (SFR)\n\
blake2b Blake2-B 512 (BCX)\n\
blake2s Blake2-S 256 (NEVA)\n\
blakecoin Fast Blake 256 (8 rounds)\n\
bmw BMW 256\n\
cryptolight AEON cryptonight (MEM/2)\n\
cryptonight XMR cryptonight\n\
cryptonight XMR cryptonight v1 (old)\n\
c11/flax X11 variant\n\
decred Decred Blake256\n\
deep Deepcoin\n\
equihash Zcash Equihash\n\
exosis Exosis timetravel\n\
dmd-gr Diamond-Groestl\n\
fresh Freshcoin (shavite 80)\n\
fugue256 Fuguecoin\n\
gostcoin GOSTcoin\n\
gostd Double GOST R 34.11\n\
graft Cryptonight v8\n\
groestl Groestlcoin\n"
#ifdef WITH_HEAVY_ALGO
" heavy Heavycoin\n"
@@ -264,18 +271,22 @@ Options:\n\
lbry LBRY Credits (Sha/Ripemd)\n\
luffa Joincoin\n\
lyra2 CryptoCoin\n\
lyra2v2 VertCoin\n\
lyra2v2 MonaCoin\n\
lyra2v3 Vertcoin\n\
lyra2z ZeroCoin (3rd impl)\n\
myr-gr Myriad-Groestl\n\
monero XMR cryptonight (v7)\n\
neoscrypt FeatherCoin, Phoenix, UFO...\n\
nist5 NIST5 (TalkCoin)\n\
penta Pentablake hash (5x Blake 512)\n\
phi BHCoin\n\
phi1612 LUX initial algo, for Seraph\n\
phi2 LUX v2 with lyra2\n\
polytimos Politimos\n\
quark Quark\n\
qubit Qubit\n\
sha256d SHA256d (bitcoin)\n\
sha256t SHA256 x3\n\
sha256q SHA256 x4\n\
sia SIA (Blake2B)\n\
sib Sibcoin (X11+Streebog)\n\
scrypt Scrypt\n\
@@ -283,6 +294,8 @@ Options:\n\
skein Skein SHA2 (Skeincoin)\n\
skein2 Double Skein (Woodcoin)\n\
skunk Skein Cube Fugue Streebog\n\
sonoa 97 hashes based on X17 ones (Sono)\n\
stellite Cryptonight v3\n\
s3 S3 (1Coin)\n\
timetravel Machinecoin permuted x8\n\
tribus Denarius\n\
@@ -572,7 +585,10 @@ static bool get_blocktemplate(CURL *curl, struct work *work);
void get_currentalgo(char* buf, int sz)
{
snprintf(buf, sz, "%s", algo_names[opt_algo]);
int algo = opt_algo;
if (algo == ALGO_CRYPTONIGHT)
algo = get_cryptonight_algo(cryptonight_fork);
snprintf(buf, sz, "%s", algo_names[algo]);
}
void format_hashrate(double hashrate, char *output)
@@ -698,6 +714,10 @@ static bool work_decode(const json_t *val, struct work *work)
data_size = 192;
adata_sz = 180/4;
break;
case ALGO_PHI2:
data_size = 144;
adata_sz = data_size / 4;
break;
case ALGO_NEOSCRYPT:
case ALGO_ZR5:
data_size = 80;
@@ -743,6 +763,12 @@ static bool work_decode(const json_t *val, struct work *work)
for (i = 0; i < atarget_sz; i++)
work->target[i] = le32dec(work->target + i);
if (opt_algo == ALGO_PHI2) {
for (i = 20; i < 36; i++) if (work->data[i]) {
use_roots = 1; break;
}
}
if ((opt_showdiff || opt_max_diff > 0.) && !allow_mininginfo)
calc_network_diff(work);
@@ -955,6 +981,7 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
case ALGO_BMW:
case ALGO_SHA256D:
case ALGO_SHA256T:
case ALGO_SHA256Q:
case ALGO_VANILLA:
// fast algos require that... (todo: regen hash)
check_dups = true;
@@ -1066,6 +1093,9 @@ static bool submit_upstream_work(CURL *curl, struct work *work)
else if (opt_algo == ALGO_DECRED) {
data_size = 192; adata_sz = 180/4;
}
else if (opt_algo == ALGO_PHI2 && use_roots) {
data_size = 144; adata_sz = 36;
}
else if (opt_algo == ALGO_SIA) {
return sia_submit(curl, pool, work);
}
@@ -1637,10 +1667,17 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
for (i = 0; i < 8; i++)
work->data[9 + i] = be32dec((uint32_t *)merkle_root + i);
for (i = 0; i < 8; i++)
work->data[17 + i] = ((uint32_t*)sctx->job.claim)[i];
work->data[17 + i] = ((uint32_t*)sctx->job.extra)[i];
work->data[25] = le32dec(sctx->job.ntime);
work->data[26] = le32dec(sctx->job.nbits);
work->data[28] = 0x80000000;
} else if (opt_algo == ALGO_PHI2) {
for (i = 0; i < 8; i++)
work->data[9 + i] = be32dec((uint32_t *)merkle_root + i);
work->data[17] = le32dec(sctx->job.ntime);
work->data[18] = le32dec(sctx->job.nbits);
for (i = 0; i < 16; i++)
work->data[20 + i] = ((uint32_t*)sctx->job.extra)[i];
} else if (opt_algo == ALGO_SIA) {
uint32_t extra = 0;
memcpy(&extra, &sctx->job.coinbase[32], 2);
@@ -1708,6 +1745,7 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
case ALGO_SCRYPT_JANE:
work_set_target(work, sctx->job.diff / (65536.0 * opt_difficulty));
break;
case ALGO_ALLIUM:
case ALGO_DMD_GR:
case ALGO_FRESH:
case ALGO_FUGUE256:
@@ -1715,9 +1753,12 @@ static bool stratum_gen_work(struct stratum_ctx *sctx, struct work *work)
case ALGO_KECCAKC:
case ALGO_LBRY:
case ALGO_LYRA2v2:
case ALGO_LYRA2v3:
case ALGO_LYRA2Z:
case ALGO_PHI2:
case ALGO_TIMETRAVEL:
case ALGO_BITCORE:
case ALGO_EXOSIS:
case ALGO_X16R:
case ALGO_X16S:
work_set_target(work, sctx->job.diff / (256.0 * opt_difficulty));
@@ -2231,9 +2272,11 @@ static void *miner_thread(void *userdata)
case ALGO_DECRED:
case ALGO_SHA256D:
case ALGO_SHA256T:
case ALGO_SHA256Q:
//case ALGO_WHIRLPOOLX:
minmax = 0x40000000U;
break;
case ALGO_BLAKE2B:
case ALGO_KECCAK:
case ALGO_KECCAKC:
case ALGO_LBRY:
@@ -2244,6 +2287,7 @@ static void *miner_thread(void *userdata)
case ALGO_TRIBUS:
minmax = 0x1000000;
break;
case ALGO_ALLIUM:
case ALGO_C11:
case ALGO_DEEP:
case ALGO_HEAVY:
@@ -2251,12 +2295,15 @@ static void *miner_thread(void *userdata)
case ALGO_JHA:
case ALGO_HSR:
case ALGO_LYRA2v2:
case ALGO_LYRA2v3:
case ALGO_PHI:
case ALGO_PHI2:
case ALGO_POLYTIMOS:
case ALGO_S3:
case ALGO_SKUNK:
case ALGO_TIMETRAVEL:
case ALGO_BITCORE:
case ALGO_EXOSIS:
case ALGO_X11EVO:
case ALGO_X11:
case ALGO_X12:
@@ -2276,6 +2323,7 @@ static void *miner_thread(void *userdata)
case ALGO_NEOSCRYPT:
case ALGO_SIB:
case ALGO_SCRYPT:
case ALGO_SONOA:
case ALGO_VELTOR:
minmax = 0x80000;
break;
@@ -2335,6 +2383,9 @@ static void *miner_thread(void *userdata)
/* scan nonces for a proof-of-work hash */
switch (opt_algo) {
case ALGO_ALLIUM:
rc = scanhash_allium(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_BASTION:
rc = scanhash_bastion(thr_id, &work, max_nonce, &hashes_done);
break;
@@ -2344,6 +2395,9 @@ static void *miner_thread(void *userdata)
case ALGO_BLAKE:
rc = scanhash_blake256(thr_id, &work, max_nonce, &hashes_done, 14);
break;
case ALGO_BLAKE2B:
rc = scanhash_blake2b(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_BLAKE2S:
rc = scanhash_blake2s(thr_id, &work, max_nonce, &hashes_done);
break;
@@ -2354,11 +2408,19 @@ static void *miner_thread(void *userdata)
rc = scanhash_c11(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_CRYPTOLIGHT:
rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done);
rc = scanhash_cryptolight(thr_id, &work, max_nonce, &hashes_done, 1);
break;
case ALGO_MONERO:
case ALGO_STELLITE:
case ALGO_GRAFT:
case ALGO_CRYPTONIGHT:
rc = scanhash_cryptonight(thr_id, &work, max_nonce, &hashes_done);
{
int cn_variant = 0;
if (cryptonight_fork > 1 && ((unsigned char*)work.data)[0] >= cryptonight_fork)
cn_variant = ((unsigned char*)work.data)[0] - cryptonight_fork + 1;
rc = scanhash_cryptonight(thr_id, &work, max_nonce, &hashes_done, cn_variant);
break;
}
case ALGO_DECRED:
rc = scanhash_decred(thr_id, &work, max_nonce, &hashes_done);
break;
@@ -2375,6 +2437,11 @@ static void *miner_thread(void *userdata)
rc = scanhash_fugue256(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_GOSTCOIN:
case ALGO_GOSTD:
rc = scanhash_gostd(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_GROESTL:
case ALGO_DMD_GR:
rc = scanhash_groestlcoin(thr_id, &work, max_nonce, &hashes_done);
@@ -2427,6 +2494,9 @@ static void *miner_thread(void *userdata)
case ALGO_LYRA2v2:
rc = scanhash_lyra2v2(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_LYRA2v3:
rc = scanhash_lyra2v3(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_LYRA2Z:
rc = scanhash_lyra2Z(thr_id, &work, max_nonce, &hashes_done);
break;
@@ -2442,6 +2512,9 @@ static void *miner_thread(void *userdata)
case ALGO_PHI:
rc = scanhash_phi(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_PHI2:
rc = scanhash_phi2(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_POLYTIMOS:
rc = scanhash_polytimos(thr_id, &work, max_nonce, &hashes_done);
break;
@@ -2468,15 +2541,17 @@ static void *miner_thread(void *userdata)
case ALGO_SHA256T:
rc = scanhash_sha256t(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_SHA256Q:
rc = scanhash_sha256q(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_SIA:
rc = scanhash_sia(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_SIB:
rc = scanhash_sib(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_GOSTCOIN:
case ALGO_GOSTD:
rc = scanhash_gostd(thr_id, &work, max_nonce, &hashes_done);
case ALGO_SONOA:
rc = scanhash_sonoa(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_S3:
rc = scanhash_s3(thr_id, &work, max_nonce, &hashes_done);
@@ -2506,6 +2581,9 @@ static void *miner_thread(void *userdata)
case ALGO_BITCORE:
rc = scanhash_bitcore(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_EXOSIS:
rc = scanhash_exosis(thr_id, &work, max_nonce, &hashes_done);
break;
case ALGO_X11EVO:
rc = scanhash_x11evo(thr_id, &work, max_nonce, &hashes_done);
break;
@@ -3121,6 +3199,26 @@ void parse_arg(int key, char *arg)
case ALGO_SCRYPT_JANE: opt_nfactor = 14; break;
}
}
// cryptonight variants
switch (opt_algo) {
case ALGO_MONERO:
opt_algo = ALGO_CRYPTONIGHT;
cryptonight_fork = 7;
break;
case ALGO_GRAFT:
opt_algo = ALGO_CRYPTONIGHT;
cryptonight_fork = 8;
break;
case ALGO_STELLITE:
opt_algo = ALGO_CRYPTONIGHT;
cryptonight_fork = 3;
break;
case ALGO_CRYPTONIGHT:
cryptonight_fork = 1;
break;
}
break;
case 'b':
p = strstr(arg, ":");
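Note how the cryptonight variant is chosen at runtime rather than fixed by -a: parse_arg above only records the fork base version (monero = 7, graft = 8, stellite = 3, plain cryptonight = 1), and miner_thread derives the variant from the block major version, the first byte of the work data. A minimal restatement of that logic, using a hypothetical helper name:

// sketch of the cn_variant computation done in miner_thread()
static int cn_variant_for(const int cryptonight_fork, const unsigned char block_major_version)
{
	if (cryptonight_fork > 1 && block_major_version >= cryptonight_fork)
		return block_major_version - cryptonight_fork + 1;
	return 0; // pre-fork blocks run the original algorithm
}

// e.g. with -a monero (fork 7): a v7 block gives variant 1, a v8 block gives variant 2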

ccminer.vcxproj (17)

@@ -435,6 +435,8 @@
<CudaCompile Include="sha256\sha256d.cu" />
<CudaCompile Include="sha256\cuda_sha256t.cu" />
<CudaCompile Include="sha256\sha256t.cu" />
<CudaCompile Include="sha256\cuda_sha256q.cu" />
<CudaCompile Include="sha256\sha256q.cu" />
<CudaCompile Include="zr5.cu" />
<CudaCompile Include="heavy\cuda_blake512.cu">
</CudaCompile>
@@ -460,6 +462,7 @@
</CudaCompile>
<CudaCompile Include="JHA\cuda_jha_keccak512.cu">
</CudaCompile>
<CudaCompile Include="blake2b.cu" />
<CudaCompile Include="Algo256\blake256.cu">
<MaxRegCount>64</MaxRegCount>
<AdditionalOptions Condition="'$(Configuration)'=='Release'">--ptxas-options="-dlcm=cg" %(AdditionalOptions)</AdditionalOptions>
@@ -519,12 +522,17 @@
<CudaCompile Include="qubit\luffa.cu" />
<CudaCompile Include="qubit\qubit.cu" />
<CudaCompile Include="qubit\qubit_luffa512.cu" />
<CudaCompile Include="lyra2\allium.cu" />
<CudaCompile Include="lyra2\lyra2RE.cu" />
<CudaCompile Include="lyra2\cuda_lyra2.cu" />
<CudaCompile Include="lyra2\lyra2REv2.cu" />
<CudaCompile Include="lyra2\cuda_lyra2v2.cu" />
<ClInclude Include="lyra2\cuda_lyra2_sm2.cuh" />
<ClInclude Include="lyra2\cuda_lyra2_sm5.cuh" />
<ClInclude Include="lyra2\cuda_lyra2v2_sm3.cuh" />
<CudaCompile Include="lyra2\lyra2REv3.cu" />
<CudaCompile Include="lyra2\cuda_lyra2v3.cu" />
<ClInclude Include="lyra2\cuda_lyra2v3_sm3.cuh" />
<CudaCompile Include="lyra2\lyra2Z.cu" />
<CudaCompile Include="lyra2\cuda_lyra2Z.cu" />
<ClInclude Include="lyra2\cuda_lyra2Z_sm5.cuh" />
@@ -536,6 +544,10 @@
<CudaCompile Include="cuda_skeincoin.cu">
<MaxRegCount>48</MaxRegCount>
</CudaCompile>
<CudaCompile Include="phi\phi.cu" />
<CudaCompile Include="phi\phi2.cu" />
<CudaCompile Include="phi\cuda_phi2.cu" />
<CudaCompile Include="phi\cuda_phi2_cubehash512.cu" />
<CudaCompile Include="skunk\skunk.cu" />
<CudaCompile Include="skunk\cuda_skunk.cu">
<CodeGeneration>compute_50,sm_50;compute_52,sm_52</CodeGeneration>
@@ -566,11 +578,11 @@
<CudaCompile Include="x11\cuda_streebog_maxwell.cu" />
<CudaCompile Include="x11\c11.cu" />
<CudaCompile Include="x11\fresh.cu" />
<CudaCompile Include="x11\phi.cu" />
<CudaCompile Include="x11\sib.cu" />
<CudaCompile Include="x11\s3.cu" />
<CudaCompile Include="x11\timetravel.cu" />
<CudaCompile Include="x11\bitcore.cu" />
<CudaCompile Include="x11\exosis.cu" />
<CudaCompile Include="x11\veltor.cu" />
<CudaCompile Include="x11\x11.cu" />
<CudaCompile Include="x11\x11evo.cu" />
@@ -586,7 +598,6 @@
<CudaCompile Include="x15\x14.cu" />
<CudaCompile Include="x15\cuda_x14_shabal512.cu" />
<CudaCompile Include="x15\cuda_x15_whirlpool.cu" />
<CudaCompile Include="x17\hmq17.cu" />
<CudaCompile Include="x15\x15.cu" />
<CudaCompile Include="x15\whirlpool.cu" />
<CudaCompile Include="x15\cuda_x15_whirlpool_sm3.cu" />
@@ -599,6 +610,8 @@
<CudaCompile Include="x16\cuda_x16_echo512_64.cu">
<CodeGeneration>compute_50,sm_50;compute_52,sm_52</CodeGeneration>
</CudaCompile>
<CudaCompile Include="x17\hmq17.cu" />
<CudaCompile Include="x17\sonoa.cu" />
<CudaCompile Include="x17\x17.cu" />
<CudaCompile Include="x17\cuda_x17_haval256.cu">
</CudaCompile>

ccminer.vcxproj.filters (50)

@@ -115,12 +115,15 @@
<Filter Include="Source Files\CUDA\tribus">
<UniqueIdentifier>{1e548d79-c217-4203-989a-a592fe2b2de3}</UniqueIdentifier>
</Filter>
<Filter Include="Source Files\CUDA\phi">
<UniqueIdentifier>{311e8d79-1612-4f0f-8591-23a592f2b2d3}</UniqueIdentifier>
</Filter>
<Filter Include="Source Files\CUDA\x12">
<UniqueIdentifier>{xde48d89-fx12-1323-129a-b592fe2b2de3}</UniqueIdentifier>
</Filter>
<Filter Include="Source Files\CUDA\gost">
<UniqueIdentifier>{6a99bc95-f402-465e-9e64-b042bd241bb7}</UniqueIdentifier>
</Filter>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="compat\jansson\dump.c">
@@ -548,6 +551,9 @@
<ClInclude Include="lyra2\cuda_lyra2_sm2.cuh">
<Filter>Source Files\CUDA\lyra2</Filter>
</ClInclude>
<ClInclude Include="lyra2\cuda_lyra2_sm5.cuh">
<Filter>Source Files\CUDA\lyra2</Filter>
</ClInclude>
<ClInclude Include="lyra2\cuda_lyra2Z_sm5.cuh">
<Filter>Source Files\CUDA\lyra2</Filter>
</ClInclude>
@@ -775,6 +781,9 @@
<CudaCompile Include="x17\hmq17.cu">
<Filter>Source Files\CUDA\x17</Filter>
</CudaCompile>
<CudaCompile Include="x17\sonoa.cu">
<Filter>Source Files\CUDA\x17</Filter>
</CudaCompile>
<CudaCompile Include="x17\x17.cu">
<Filter>Source Files\CUDA\x17</Filter>
</CudaCompile>
@@ -784,6 +793,18 @@
<CudaCompile Include="polytimos.cu">
<Filter>Source Files\CUDA</Filter>
</CudaCompile>
<CudaCompile Include="phi\phi.cu">
<Filter>Source Files\CUDA\phi</Filter>
</CudaCompile>
<CudaCompile Include="phi\phi2.cu">
<Filter>Source Files\CUDA\phi</Filter>
</CudaCompile>
<CudaCompile Include="phi\cuda_phi2.cu">
<Filter>Source Files\CUDA\phi</Filter>
</CudaCompile>
<CudaCompile Include="phi\cuda_phi2_cubehash512.cu">
<Filter>Source Files\CUDA\phi</Filter>
</CudaCompile>
<CudaCompile Include="skunk\skunk.cu">
<Filter>Source Files\CUDA\skunk</Filter>
</CudaCompile>
@@ -802,9 +823,6 @@
<ClInclude Include="tribus\cuda_echo512_aes.cuh">
<Filter>Source Files\CUDA\tribus</Filter>
</ClInclude>
<CudaCompile Include="x11\phi.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
<CudaCompile Include="x11\sib.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
@@ -826,6 +844,9 @@
<CudaCompile Include="x11\bitcore.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
<CudaCompile Include="x11\exosis.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
<CudaCompile Include="x11\veltor.cu">
<Filter>Source Files\CUDA\x11</Filter>
</CudaCompile>
@@ -913,6 +934,9 @@
<CudaCompile Include="Algo256\cuda_bmw.cu">
<Filter>Source Files\CUDA\Algo256</Filter>
</CudaCompile>
<CudaCompile Include="lyra2\allium.cu">
<Filter>Source Files\CUDA\lyra2</Filter>
</CudaCompile>
<CudaCompile Include="lyra2\cuda_lyra2.cu">
<Filter>Source Files\CUDA\lyra2</Filter>
</CudaCompile>
@@ -925,12 +949,24 @@
<CudaCompile Include="lyra2\lyra2REv2.cu">
<Filter>Source Files\CUDA\lyra2</Filter>
</CudaCompile>
<CudaCompile Include="lyra2\cuda_lyra2v3.cu">
<Filter>Source Files\CUDA\lyra2</Filter>
</CudaCompile>
<ClInclude Include="lyra2\cuda_lyra2v3_sm3.cuh">
<Filter>Source Files\CUDA\lyra2</Filter>
</ClInclude>
<CudaCompile Include="lyra2\lyra2REv3.cu">
<Filter>Source Files\CUDA\lyra2</Filter>
</CudaCompile>
<CudaCompile Include="lyra2\cuda_lyra2Z.cu">
<Filter>Source Files\CUDA\lyra2</Filter>
</CudaCompile>
<CudaCompile Include="lyra2\lyra2Z.cu">
<Filter>Source Files\CUDA\lyra2</Filter>
</CudaCompile>
<CudaCompile Include="blake2b.cu">
<Filter>Source Files\CUDA\</Filter>
</CudaCompile>
<CudaCompile Include="Algo256\blake2s.cu">
<Filter>Source Files\CUDA\Algo256</Filter>
</CudaCompile>
@@ -958,6 +994,12 @@
<CudaCompile Include="sha256\sha256t.cu">
<Filter>Source Files\CUDA\sha256</Filter>
</CudaCompile>
<CudaCompile Include="sha256\cuda_sha256q.cu">
<Filter>Source Files\CUDA\sha256</Filter>
</CudaCompile>
<CudaCompile Include="sha256\sha256q.cu">
<Filter>Source Files\CUDA\sha256</Filter>
</CudaCompile>
<CudaCompile Include="sia\sia.cu">
<Filter>Source Files\sia</Filter>
</CudaCompile>

compat/ccminer-config.h (2)

@@ -164,7 +164,7 @@
#define PACKAGE_URL "http://github.com/tpruvot/ccminer"
/* Define to the version of this package. */
#define PACKAGE_VERSION "2.2.5"
#define PACKAGE_VERSION "2.3.1"
/* If using the C implementation of alloca, define if you know the
direction of stack growth for your system; otherwise it will be

configure.ac (2)

@@ -1,4 +1,4 @@
AC_INIT([ccminer], [2.2.5], [], [ccminer], [http://github.com/tpruvot/ccminer])
AC_INIT([ccminer], [2.3.1], [], [ccminer], [http://github.com/tpruvot/ccminer])
AC_PREREQ([2.59c])
AC_CANONICAL_SYSTEM

crypto/cn_aes.cuh (1)

@@ -138,6 +138,7 @@ static const __device__ __align__(16) uint32_t d_t_fn[1024] = {
*/
#define AS_U32(addr) *((uint32_t*)(addr))
#define AS_U64(addr) *((uint64_t*)(addr))
#define AS_UINT2(addr) *((uint2*)(addr))
#define AS_UINT4(addr) *((uint4*)(addr))
#define AS_UL2(addr) *((ulonglong2*)(addr))

crypto/cn_blake.cuh (2)

@@ -164,7 +164,7 @@ void cn_blake_final(blake_state * __restrict__ S, uint8_t * __restrict__ digest)
}
__device__
void cn_blake(const uint8_t * __restrict__ in, uint64_t inlen, uint8_t * __restrict__ out)
void cn_blake(const uint8_t * __restrict__ in, uint64_t inlen, uint32_t * out)
{
blake_state bs;
blake_state *S = (blake_state *)&bs;

crypto/cn_groestl.cuh (9)

@@ -274,13 +274,14 @@ void cn_groestl_final(groestlHashState* __restrict__ ctx, BitSequence* __restri
for (i = GROESTL_SIZE512-hashbytelen; i < GROESTL_SIZE512; i++,j++) {
output[j] = s[i];
}
#if 0
for (i = 0; i < GROESTL_COLS512; i++) {
ctx->chaining[i] = 0;
}
for (i = 0; i < GROESTL_SIZE512; i++) {
ctx->buffer[i] = 0;
}
#endif
}
__device__
@@ -336,12 +337,12 @@ void cn_groestl_init(groestlHashState* ctx)
}
__device__
void cn_groestl(const BitSequence * __restrict__ data, DataLength len, BitSequence * __restrict__ hashval)
void cn_groestl(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval)
{
DataLength databitlen = len << 3;
groestlHashState context;
cn_groestl_init(&context);
cn_groestl_update(&context, data, databitlen);
cn_groestl_final(&context, hashval);
cn_groestl_update(&context, (BitSequence*) data, databitlen);
cn_groestl_final(&context, (BitSequence*) hashval);
}

crypto/cn_jh.cuh (20)

@@ -198,8 +198,9 @@ void cn_jh_update(jhHashState * __restrict__ state, const uint8_t * __restrict__
databitlen = 0;
}
if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) ) {
memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) ) ;
if ( (state->datasize_in_buffer > 0 ) && (( state->datasize_in_buffer + databitlen) >= 512) )
{
memcpy( state->buffer + (state->datasize_in_buffer >> 3), data, 64-(state->datasize_in_buffer >> 3) );
index = 64-(state->datasize_in_buffer >> 3);
databitlen = databitlen - (512 - state->datasize_in_buffer);
cn_jh_F8(state);
@@ -222,7 +223,7 @@ void cn_jh_update(jhHashState * __restrict__ state, const uint8_t * __restrict__
/* pad the message, process the padded block(s), truncate the hash value H to obtain the message digest */
__device__
void cn_jh_final(jhHashState * __restrict__ state, uint32_t * __restrict__ hashval)
void cn_jh_final(jhHashState * __restrict__ state, uint8_t * __restrict__ hashval)
{
unsigned int i;
//uint32_t *bufptr = (uint32_t *)state->buffer;
@@ -244,7 +245,7 @@ void cn_jh_final(jhHashState * __restrict__ state, uint32_t * __restrict__ hashv
} else {
/*set the rest of the bytes in the buffer to 0*/
/* set the rest of the bytes in the buffer to 0 */
if ( (state->datasize_in_buffer & 7) == 0) {
for (i = (state->databitlen & 0x1ff) >> 3; i < 64; i++) state->buffer[i] = 0;
} else {
@@ -268,7 +269,8 @@ void cn_jh_final(jhHashState * __restrict__ state, uint32_t * __restrict__ hashv
cn_jh_F8(state);
}
MEMCPY4(hashval, ((unsigned char*)state->x) + 64 + 32, 8);
memcpy(hashval, ((unsigned char*)state->x) + 64 + 32, 32);
//MEMCPY4(hashval, ((unsigned char*)state->x) + 64 + 32, 8);
}
__device__
@@ -277,12 +279,12 @@ void cn_jh_init(jhHashState *state, int hashbitlen)
state->databitlen = 0;
state->datasize_in_buffer = 0;
state->hashbitlen = hashbitlen;
//memcpy(state->x, d_JH256_H0, 128);
MEMCPY8(state->x, d_JH256_H0, 128 / 8);
memcpy(state->x, d_JH256_H0, 128);
//MEMCPY8(state->x, d_JH256_H0, 128 / 8);
}
__device__
void cn_jh256(const uint8_t * __restrict__ data, DataLength len, uint32_t * __restrict__ hashval)
void cn_jh(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval)
{
const int hashbitlen = 256;
DataLength databitlen = len << 3;
@@ -290,5 +292,5 @@ void cn_jh256(const uint8_t * __restrict__ data, DataLength len, uint32_t * __re
cn_jh_init(&state, hashbitlen);
cn_jh_update(&state, data, databitlen);
cn_jh_final(&state, hashval);
cn_jh_final(&state, (uint8_t*) hashval);
}

crypto/cn_keccak.cuh (2)

@@ -195,7 +195,7 @@ void cn_keccakf(uint64_t *s)
}
__device__ __forceinline__
void cn_keccak(const uint8_t * __restrict__ in, uint8_t * __restrict__ md)
void cn_keccak(const uint32_t * __restrict__ in, uint64_t * __restrict__ md)
{
uint64_t st[25];

crypto/cn_skein.cuh (22)

@@ -4,19 +4,15 @@ typedef unsigned int uint_t; /* native unsigned integer */
#define SKEIN_256_STATE_WORDS ( 4)
#define SKEIN_512_STATE_WORDS ( 8)
#define SKEIN1024_STATE_WORDS (16)
#define SKEIN_256_STATE_BYTES ( 8*SKEIN_256_STATE_WORDS)
#define SKEIN_512_STATE_BYTES ( 8*SKEIN_512_STATE_WORDS)
#define SKEIN1024_STATE_BYTES ( 8*SKEIN1024_STATE_WORDS)
#define SKEIN_256_STATE_BITS (64*SKEIN_256_STATE_WORDS)
#define SKEIN_512_STATE_BITS (64*SKEIN_512_STATE_WORDS)
#define SKEIN1024_STATE_BITS (64*SKEIN1024_STATE_WORDS)
#define SKEIN_256_BLOCK_BYTES ( 8*SKEIN_256_STATE_WORDS)
#define SKEIN_512_BLOCK_BYTES ( 8*SKEIN_512_STATE_WORDS)
#define SKEIN1024_BLOCK_BYTES ( 8*SKEIN1024_STATE_WORDS)
#define SKEIN_MK_64(hi32,lo32) ((lo32) + (((uint64_t) (hi32)) << 32))
#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA,0xA9FC1A22)
@@ -119,7 +115,7 @@ typedef struct {
} skeinHashState;
__device__
void cn_skein256_init(skeinHashState *state, size_t hashBitLen)
void cn_skein_init(skeinHashState *state, size_t hashBitLen)
{
const uint64_t SKEIN_512_IV_256[] =
{
@@ -258,14 +254,12 @@ void cn_skein_block(Skein_512_Ctxt_t * __restrict__ ctx, const uint8_t * __restr
}
__device__
void cn_skein256_update(skeinHashState * __restrict__ state, const uint8_t * __restrict__ data, DataLength databitlen)
void cn_skein_update(skeinHashState * __restrict__ state, const uint8_t * __restrict__ data, DataLength databitlen)
{
if ((databitlen & 7) == 0) {
cn_skein_block(&state->u.ctx_512, data, databitlen >> 3);
}
else {
size_t bCnt = (databitlen >> 3) + 1;
uint8_t b,mask;
@@ -280,7 +274,7 @@ void cn_skein256_update(skeinHashState * __restrict__ state, const uint8_t * __r
}
__device__
void cn_skein256_final(skeinHashState * __restrict__ state, uint32_t * __restrict__ hashVal)
void cn_skein_final(skeinHashState * __restrict__ state, uint8_t * __restrict__ hashVal)
{
uint64_t X[SKEIN_512_STATE_WORDS];
Skein_512_Ctxt_t *ctx = (Skein_512_Ctxt_t *)&state->u.ctx_512;
@@ -305,13 +299,13 @@ void cn_skein256_final(skeinHashState * __restrict__ state, uint32_t * __restric
((uint64_t *)ctx->b)[0] = (uint64_t)i;
Skein_Start_New_Type(ctx, OUT_FINAL);
cn_skein_processblock(ctx, ctx->b, 1, sizeof(uint64_t));
memcpy(hashVal + (i*SKEIN_512_BLOCK_BYTES/sizeof(uint32_t)), ctx->X, n);
memcpy(hashVal + (i*SKEIN_512_BLOCK_BYTES), ctx->X, n);
memcpy(ctx->X, X, sizeof(X)); // restore the counter mode key for next time
}
}
__device__
void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * __restrict__ hashval)
void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * hashval)
{
int hashbitlen = 256;
DataLength databitlen = len << 3;
@@ -319,7 +313,7 @@ void cn_skein(const uint8_t * __restrict__ data, DataLength len, uint32_t * __re
state.statebits = 64*SKEIN_512_STATE_WORDS;
cn_skein256_init(&state, hashbitlen);
cn_skein256_update(&state, data, databitlen);
cn_skein256_final(&state, hashval);
cn_skein_init(&state, hashbitlen);
cn_skein_update(&state, data, databitlen);
cn_skein_final(&state, (uint8_t*) hashval);
}

crypto/cryptolight-core.cu (86)

@@ -36,7 +36,7 @@ void cryptolight_core_gpu_phase1(int threads, uint32_t * long_state, uint32_t *
if(thread < threads)
{
const int oft = thread * 52 + sub + 16; // not aligned 16!
const int oft = thread * 50 + sub + 16; // not aligned 16!
const int long_oft = (thread << LONG_SHL_IDX) + sub;
uint32_t __align__(16) key[40];
uint32_t __align__(16) text[4];
@@ -57,8 +57,10 @@ void cryptolight_core_gpu_phase1(int threads, uint32_t * long_state, uint32_t *
}
}
// --------------------------------------------------------------------------------------------------------------
__global__
void cryptolight_core_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
void cryptolight_old_gpu_phase2(const int threads, const int bfactor, const int partidx, uint32_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
{
__shared__ uint32_t __align__(16) sharedMemory[1024];
@@ -209,6 +211,70 @@ void cryptolight_core_gpu_phase2(const int threads, const int bfactor, const int
#endif // __CUDA_ARCH__ >= 300
}
__device__ __forceinline__ void store_variant1(uint32_t* long_state)
{
uint4* Z = (uint4*) long_state;
const uint32_t tmp = (Z->z >> 24); // __byte_perm(src, 0, 0x7773);
const uint32_t index = (((tmp >> 3) & 6u) | (tmp & 1u)) << 1;
Z->z = (Z->z & 0x00ffffffu) | ((tmp ^ ((0x75310u >> index) & 0x30u)) << 24);
}
#define MUL_SUM_XOR_DST_1(a,c,dst,tweak) { \
uint64_t hi, lo = cuda_mul128(((uint64_t *)a)[0], ((uint64_t *)dst)[0], &hi) + ((uint64_t *)c)[1]; \
hi += ((uint64_t *)c)[0]; \
((uint64_t *)c)[0] = ((uint64_t *)dst)[0] ^ hi; \
((uint64_t *)c)[1] = ((uint64_t *)dst)[1] ^ lo; \
((uint64_t *)dst)[0] = hi; \
((uint64_t *)dst)[1] = lo ^ tweak; }
__global__
void cryptolight_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
uint32_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
uint64_t * __restrict__ d_tweak)
{
__shared__ __align__(16) uint32_t sharedMemory[1024];
cn_aes_gpu_init(sharedMemory);
__syncthreads();
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
if (thread < threads)
{
const uint32_t batchsize = ITER >> (2 + bfactor);
const uint32_t start = partidx * batchsize;
const uint32_t end = start + batchsize;
const uint32_t longptr = thread << LONG_SHL_IDX;
uint32_t * long_state = &d_long_state[longptr];
uint64_t tweak = d_tweak[thread];
void * ctx_a = (void*)(&d_ctx_a[thread << 2]);
void * ctx_b = (void*)(&d_ctx_b[thread << 2]);
uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4
uint4 B = AS_UINT4(ctx_b);
uint32_t* a = (uint32_t*)&A;
uint32_t* b = (uint32_t*)&B;
for (int i = start; i < end; i++)
{
uint32_t c[4];
uint32_t j = (A.x >> 2) & E2I_MASK2;
cn_aes_single_round(sharedMemory, &long_state[j], c, a);
XOR_BLOCKS_DST(c, b, &long_state[j]);
store_variant1(&long_state[j]);
MUL_SUM_XOR_DST_1(c, a, &long_state[(c[0] >> 2) & E2I_MASK2], tweak);
j = (A.x >> 2) & E2I_MASK2;
cn_aes_single_round(sharedMemory, &long_state[j], b, a);
XOR_BLOCKS_DST(b, c, &long_state[j]);
store_variant1(&long_state[j]);
MUL_SUM_XOR_DST_1(b, a, &long_state[(b[0] >> 2) & E2I_MASK2], tweak);
}
if (bfactor) {
AS_UINT4(ctx_a) = A;
AS_UINT4(ctx_b) = B;
}
}
}
__global__
void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint32_t * ctx_state, uint32_t * ctx_key2)
{
@@ -222,7 +288,7 @@ void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint3
if(thread < threads)
{
const int long_oft = (thread << LONG_SHL_IDX) + sub;
const int oft = thread * 52 + sub + 16;
const int oft = thread * 50 + sub + 16;
uint32_t __align__(16) key[40];
uint32_t __align__(16) text[4];
@@ -251,8 +317,8 @@ void cryptolight_core_gpu_phase3(int threads, const uint32_t * long_state, uint3
extern int device_bfactor[MAX_GPUS];
__host__
void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint64_t *d_ctx_state,
uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state,
uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak)
{
dim3 grid(blocks);
dim3 block(threads);
@@ -265,17 +331,21 @@ void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_
int i, partcount = 1 << bfactor;
int dev_id = device_map[thr_id];
cryptolight_core_gpu_phase1 <<<grid, block8 >>>(blocks*threads, d_long_state, (uint32_t*)d_ctx_state, d_ctx_key1);
cryptolight_core_gpu_phase1 <<<grid, block8 >>>(blocks*threads, d_long_state, d_ctx_state, d_ctx_key1);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
if(partcount > 1) usleep(bsleep);
for(i = 0; i < partcount; i++)
{
cryptolight_core_gpu_phase2 <<<grid, (device_sm[dev_id] >= 300 ? block4 : block)>>>(blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
dim3 b = device_sm[dev_id] >= 300 ? block4 : block;
if (variant == 0)
cryptolight_old_gpu_phase2 <<<grid, b>>> (blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
else
cryptolight_gpu_phase2 <<<grid, b>>> (blocks*threads, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
if(partcount > 1) usleep(bsleep);
}
cryptolight_core_gpu_phase3 <<<grid, block8 >>>(blocks*threads, d_long_state, (uint32_t*)d_ctx_state, d_ctx_key2);
cryptolight_core_gpu_phase3 <<<grid, block8 >>>(blocks*threads, d_long_state, d_ctx_state, d_ctx_key2);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
}

47
crypto/cryptolight-cpu.cpp

@@ -132,14 +132,13 @@ static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, ui
((uint64_t*) dst)[0] += ((uint64_t*) c)[0];
}
static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak) {
uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
hi += ((uint64_t*) c)[0];
((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
((uint64_t*) dst)[0] = hi;
((uint64_t*) dst)[1] = lo;
((uint64_t*) dst)[1] = variant ? lo ^ tweak : lo;
}
static void copy_block(uint8_t* dst, const uint8_t* src) {
@@ -157,17 +156,29 @@ static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1];
}
static void cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx)
static void cryptolight_store_variant(void* state, int variant) {
if (variant == 1) {
// use variant 1 like monero since june 2018
const uint8_t tmp = ((const uint8_t*)(state))[11];
const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1;
((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30);
}
}
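A minimal host-side sketch (not part of the tree) of the variant-1 tweak applied above: only bits 4-5 of byte 11 of each stored 16-byte block can change, and the four possible XOR masks are packed into the constant 0x75310.

/* standalone illustration of cryptolight_store_variant() for variant 1 */
#include <stdint.h>
#include <stdio.h>

static uint8_t variant1_tweak_byte11(uint8_t tmp)
{
	const uint32_t table = 0x75310;                          /* same constant as above */
	const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1;
	return tmp ^ (uint8_t)((table >> index) & 0x30);         /* mask 0x30 -> bits 4-5 only */
}

int main(void)
{
	for (uint8_t b = 0; b < 8; b++)                          /* a few sample byte values */
		printf("0x%02x -> 0x%02x\n", b, variant1_tweak_byte11(b));
	return 0;
}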
static void cryptolight_hash_ctx(void* output, const void* input, const int len, struct cryptonight_ctx* ctx, const int variant)
{
size_t i, j;
keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len);
ctx->aes_ctx = (oaes_ctx*) oaes_alloc();
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
const uint64_t tweak = variant ? *((uint64_t*) (((uint8_t*)input) + 35)) ^ ctx->state.hs.w[24] : 0;
oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE);
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
#undef RND
#define RND(p) aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * p], ctx->aes_ctx->key->exp_data);
#undef RND
#define RND(p) aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * p], ctx->aes_ctx->key->exp_data);
RND(0);
RND(1);
RND(2);
@@ -186,22 +197,22 @@ static void cryptolight_hash_ctx(void* output, const void* input, const int len,
j = e2i(ctx->a);
aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a);
xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]);
mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)]);
cryptolight_store_variant(&ctx->long_state[j], variant);
mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c)], variant, tweak);
j = e2i(ctx->a);
aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a);
xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]);
mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)]);
cryptolight_store_variant(&ctx->long_state[j], variant);
mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b)], variant, tweak);
}
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
oaes_key_import_data(ctx->aes_ctx, &ctx->state.hs.b[32], AES_KEY_SIZE);
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
#undef RND
#define RND(p) xor_blocks(&ctx->text[p * AES_BLOCK_SIZE], &ctx->long_state[i + p * AES_BLOCK_SIZE]); \
aesb_pseudo_round_mut(&ctx->text[p * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
#undef RND
#define RND(p) xor_blocks(&ctx->text[p * AES_BLOCK_SIZE], &ctx->long_state[i + p * AES_BLOCK_SIZE]); \
aesb_pseudo_round_mut(&ctx->text[p * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
RND(0);
RND(1);
RND(2);
@@ -221,9 +232,15 @@ static void cryptolight_hash_ctx(void* output, const void* input, const int len,
oaes_free((OAES_CTX **) &ctx->aes_ctx);
}
void cryptolight_hash(void* output, const void* input, int len)
void cryptolight_hash_variant(void* output, const void* input, int len, int variant)
{
struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
cryptolight_hash_ctx(output, input, len, ctx);
cryptolight_hash_ctx(output, input, len, ctx, variant);
free(ctx);
}
void cryptolight_hash(void* output, const void* input)
{
cryptolight_hash_variant(output, input, 76, 1);
}

30
crypto/cryptolight.cu

@@ -7,16 +7,17 @@ static __thread uint32_t cn_blocks = 32;
static __thread uint32_t cn_threads = 16;
static uint32_t *d_long_state[MAX_GPUS];
static uint64_t *d_ctx_state[MAX_GPUS];
static uint32_t *d_ctx_state[MAX_GPUS];
static uint32_t *d_ctx_key1[MAX_GPUS];
static uint32_t *d_ctx_key2[MAX_GPUS];
static uint32_t *d_ctx_text[MAX_GPUS];
static uint64_t *d_ctx_tweak[MAX_GPUS];
static uint32_t *d_ctx_a[MAX_GPUS];
static uint32_t *d_ctx_b[MAX_GPUS];
static bool init[MAX_GPUS] = { 0 };
extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant)
{
int res = 0;
uint32_t throughput = 0;
@@ -26,6 +27,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
uint32_t *nonceptr = (uint32_t*) (&pdata[39]);
const uint32_t first_nonce = *nonceptr;
uint32_t nonce = first_nonce;
int dev_id = device_map[thr_id];
if(opt_benchmark) {
ptarget[7] = 0x00ff;
@@ -33,6 +35,10 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
if(!init[thr_id])
{
if (!device_config[thr_id] && strcmp(device_name[dev_id], "TITAN V") == 0) {
device_config[thr_id] = strdup("80x32");
}
if (device_config[thr_id]) {
sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads);
throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads);
@@ -63,11 +69,11 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
}
const size_t alloc = MEMORY * throughput;
cryptonight_extra_cpu_init(thr_id, throughput);
cryptonight_extra_init(thr_id);
cudaMalloc(&d_long_state[thr_id], alloc);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_state[thr_id], 26 * sizeof(uint64_t) * throughput);
cudaMalloc(&d_ctx_state[thr_id], 25 * sizeof(uint64_t) * throughput);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_key1[thr_id], 40 * sizeof(uint32_t) * throughput);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
@@ -79,6 +85,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_b[thr_id], 4 * sizeof(uint32_t) * throughput);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_tweak[thr_id], sizeof(uint64_t) * throughput);
init[thr_id] = true;
}
@@ -90,10 +97,10 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
const uint32_t Htarg = ptarget[7];
uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
cryptonight_extra_cpu_setData(thr_id, pdata, ptarget);
cryptonight_extra_cpu_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
cryptolight_core_cpu_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
cryptonight_extra_cpu_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);
cryptonight_extra_setData(thr_id, pdata, ptarget);
cryptonight_extra_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
cryptolight_core_hash(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
cryptonight_extra_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);
*hashes_done = nonce - first_nonce + throughput;
@@ -104,7 +111,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39);
memcpy(tempdata, pdata, 76);
*tempnonceptr = resNonces[0];
cryptolight_hash(vhash, tempdata, 76);
cryptolight_hash_variant(vhash, tempdata, 76, variant);
if(vhash[7] <= Htarg && fulltest(vhash, ptarget))
{
res = 1;
@@ -114,7 +121,7 @@ extern "C" int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_
if(resNonces[1] != UINT32_MAX)
{
*tempnonceptr = resNonces[1];
cryptolight_hash(vhash, tempdata, 76);
cryptolight_hash_variant(vhash, tempdata, 76, variant);
if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
res++;
work->nonces[1] = resNonces[1];
@@ -157,10 +164,11 @@ void free_cryptolight(int thr_id)
cudaFree(d_ctx_key1[thr_id]);
cudaFree(d_ctx_key2[thr_id]);
cudaFree(d_ctx_text[thr_id]);
cudaFree(d_ctx_tweak[thr_id]);
cudaFree(d_ctx_a[thr_id]);
cudaFree(d_ctx_b[thr_id]);
cryptonight_extra_cpu_free(thr_id);
cryptonight_extra_free(thr_id);
cudaDeviceSynchronize();

13
crypto/cryptolight.h

@@ -134,10 +134,11 @@ static inline void exit_if_cudaerror(int thr_id, const char *src, int line)
exit(1);
}
}
void cryptolight_core_cpu_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn);
void cryptonight_extra_cpu_init(int thr_id, uint32_t threads);
void cryptonight_extra_cpu_free(int thr_id);
void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *nonce, uint64_t *d_ctx_state);
void cryptolight_core_hash(int thr_id, int blocks, int threads, uint32_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);
void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget);
void cryptonight_extra_init(int thr_id/*, uint32_t threads*/);
void cryptonight_extra_free(int thr_id);
void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);
void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces, uint32_t *d_ctx_state);

236
crypto/cryptonight-core.cu

@@ -2,47 +2,55 @@
#include <stdint.h>
#include <string.h>
#include <sys/time.h>
#ifndef _WIN32
#include <unistd.h>
#endif
#include <cuda.h>
#include <cuda_runtime.h>
#if CUDA_VERSION >= 9000 && __CUDA_ARCH__ >= 300
#undef __shfl
#define __shfl(var, srcLane, width) __shfl_sync(0xFFFFFFFFu, var, srcLane, width)
#endif
#include "cryptonight.h"
#define LONG_SHL32 19 // 1<<19
#define LONG_SHL32 19 // 1<<19 (uint32_t* index)
#define LONG_SHL64 18 // 1<<18 (uint64_t* index)
#define LONG_LOOPS32 0x80000U
#define LONG_LOOPS64 0x40000U
#include "cn_aes.cuh"
__global__
//__launch_bounds__(128, 9) // 56 registers
void cryptonight_core_gpu_phase1(const uint32_t threads, uint64_t * long_state, uint64_t * const ctx_state, uint32_t * ctx_key1)
void cryptonight_gpu_phase1(const uint32_t threads, uint32_t * __restrict__ d_long_state,
uint32_t * __restrict__ ctx_state, uint32_t * __restrict__ ctx_key1)
{
__shared__ __align__(16) uint32_t sharedMemory[1024];
cn_aes_gpu_init(sharedMemory);
__syncthreads();
__shared__ uint32_t sharedMemory[1024];
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3;
const uint32_t sub = (threadIdx.x & 7) << 1; // 0 2 .. 14
if(thread < threads)
{
const uint32_t long_oft = (thread << LONG_SHL64) + sub;
const uint32_t* ctx_key = &ctx_key1[thread * 40U];
uint4 keys[10];
#pragma unroll 10 // load 160 bytes
for (int i = 0; i < 10; i ++)
keys[i] = AS_UINT4(&ctx_key[i*4]);
cn_aes_gpu_init(sharedMemory);
__syncthreads();
uint4 text = AS_UINT4(&ctx_state[thread * 26U + sub + 8U]);
const uint32_t sub = (threadIdx.x & 0x7U) << 2;
uint32_t *longstate = &d_long_state[(thread << LONG_SHL32) + sub];
uint32_t __align__(8) key[40];
MEMCPY8(key, &ctx_key1[thread * 40U], 20);
uint32_t __align__(8) text[4];
MEMCPY8(text, &ctx_state[thread * 50U + sub + 16U], 2);
for (uint32_t i = 0; i < LONG_LOOPS64; i += 16U) {
cn_aes_pseudo_round_mut_uint4(sharedMemory, text, keys);
AS_UINT4(&long_state[long_oft + i]) = text;
for(int i = 0; i < LONG_LOOPS32; i += 32)
{
cn_aes_pseudo_round_mut(sharedMemory, text, key);
MEMCPY8(&longstate[i], text, 2);
}
}
}
// --------------------------------------------------------------------------------------------------------------
__device__ __forceinline__ ulonglong2 cuda_mul128(const uint64_t multiplier, const uint64_t multiplicand)
{
ulonglong2 product;
@@ -59,8 +67,7 @@ static __forceinline__ __device__ ulonglong2 operator ^ (const ulonglong2 &a, co
return make_ulonglong2(a.x ^ b.x, a.y ^ b.y);
}
#undef MUL_SUM_XOR_DST
__device__ __forceinline__ void MUL_SUM_XOR_DST(const uint64_t m, uint4 &a, void* far_dst)
__device__ __forceinline__ void MUL_SUM_XOR_DST_0(const uint64_t m, uint4 &a, void* far_dst)
{
ulonglong2 d = AS_UL2(far_dst);
ulonglong2 p = cuda_mul128(m, d.x);
@@ -73,8 +80,8 @@ __global__
#if __CUDA_ARCH__ >= 500
//__launch_bounds__(128,12) /* force 40 regs to allow -l ...x32 */
#endif
void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor, const uint32_t partidx,
uint64_t * d_long_state, uint32_t * d_ctx_a, uint32_t * d_ctx_b)
void cryptonight_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b)
{
__shared__ __align__(16) uint32_t sharedMemory[1024];
cn_aes_gpu_init(sharedMemory);
@@ -84,7 +91,7 @@ void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor,
if (thread < threads)
{
const uint32_t batchsize = ITER >> (2U + bfactor);
const uint32_t batchsize = ITER >> (2 + bfactor);
const uint32_t start = partidx * batchsize;
const uint32_t end = start + batchsize;
@@ -101,12 +108,12 @@ void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor,
uint32_t j = (A.x & E2I_MASK) >> 3;
cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C);
AS_UINT4(&long_state[j]) = C ^ B; // st.global.u32.v4
MUL_SUM_XOR_DST((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3]);
MUL_SUM_XOR_DST_0((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3]);
j = (A.x & E2I_MASK) >> 3;
cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B);
AS_UINT4(&long_state[j]) = C ^ B;
MUL_SUM_XOR_DST((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3]);
MUL_SUM_XOR_DST_0((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3]);
}
if (bfactor) {
@@ -116,71 +123,194 @@ void cryptonight_core_gpu_phase2(const uint32_t threads, const uint32_t bfactor,
}
}
// --------------------------------------------------------------------------------------------------------------
__device__ __forceinline__ void store_variant1(uint64_t* long_state, uint4 Z)
{
const uint32_t tmp = (Z.z >> 24); // __byte_perm(src, 0, 0x7773);
const uint32_t index = (((tmp >> 3) & 6u) | (tmp & 1u)) << 1;
Z.z = (Z.z & 0x00ffffffu) | ((tmp ^ ((0x75310u >> index) & 0x30u)) << 24);
AS_UINT4(long_state) = Z;
}
__device__ __forceinline__ void store_variant2(uint64_t* long_state, uint4 Z)
{
const uint32_t tmp = (Z.z >> 24); // __byte_perm(src, 0, 0x7773);
const uint32_t index = (((tmp >> 4) & 6u) | (tmp & 1u)) << 1;
Z.z = (Z.z & 0x00ffffffu) | ((tmp ^ ((0x75312u >> index) & 0x30u)) << 24);
AS_UINT4(long_state) = Z;
}
__device__ __forceinline__ void MUL_SUM_XOR_DST_1(const uint64_t m, uint4 &a, void* far_dst, uint64_t tweak)
{
ulonglong2 d = AS_UL2(far_dst);
ulonglong2 p = cuda_mul128(m, d.x);
p += AS_UL2(&a);
AS_UL2(&a) = p ^ d;
p.y = p.y ^ tweak;
AS_UL2(far_dst) = p;
}
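A hedged host-side sketch of MUL_SUM_XOR_DST_1 above (assuming a compiler with unsigned __int128, e.g. gcc or clang): 64x64->128 multiply against the scratchpad word, add the running block, XOR-swap with the scratchpad entry, and fold the tweak into the low word written back.

#include <stdint.h>

static void mul_sum_xor_dst_1(uint64_t m, uint64_t a[2], uint64_t dst[2], uint64_t tweak)
{
	unsigned __int128 prod = (unsigned __int128) m * dst[0];
	uint64_t hi = (uint64_t)(prod >> 64) + a[0];   /* p.x += a[0] */
	uint64_t lo = (uint64_t) prod + a[1];          /* p.y += a[1] */
	a[0] = dst[0] ^ hi;                            /* a = p ^ d   */
	a[1] = dst[1] ^ lo;
	dst[0] = hi;
	dst[1] = lo ^ tweak;                           /* variant-1 tweak on the stored low word */
}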
__global__
void cryptonight_core_gpu_phase3(const uint32_t threads, const uint64_t * long_state, uint64_t * ctx_state, uint32_t * __restrict__ ctx_key2)
void monero_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
uint64_t * __restrict__ d_tweak)
{
__shared__ __align__(16) uint32_t sharedMemory[1024];
cn_aes_gpu_init(sharedMemory);
__syncthreads();
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3U;
const uint32_t sub = (threadIdx.x & 7U) << 1U;
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
if (thread < threads)
{
const uint32_t batchsize = ITER >> (2 + bfactor);
const uint32_t start = partidx * batchsize;
const uint32_t end = start + batchsize;
uint64_t tweak = d_tweak[thread];
void * ctx_a = (void*)(&d_ctx_a[thread << 2]);
void * ctx_b = (void*)(&d_ctx_b[thread << 2]);
uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4
uint4 B = AS_UINT4(ctx_b);
if(thread < threads)
uint64_t * long_state = &d_long_state[thread << LONG_SHL64];
for (int i = start; i < end; i++) // end = 262144
{
uint4 C;
uint32_t j = (A.x & E2I_MASK) >> 3;
cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C);
store_variant1(&long_state[j], C ^ B); // st.global
MUL_SUM_XOR_DST_1((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3], tweak);
j = (A.x & E2I_MASK) >> 3;
cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B);
store_variant1(&long_state[j], C ^ B);
MUL_SUM_XOR_DST_1((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3], tweak);
}
if (bfactor) {
AS_UINT4(ctx_a) = A;
AS_UINT4(ctx_b) = B;
}
}
}
// --------------------------------------------------------------------------------------------------------------
__global__
void stellite_gpu_phase2(const uint32_t threads, const uint16_t bfactor, const uint32_t partidx,
uint64_t * __restrict__ d_long_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
uint64_t * __restrict__ d_tweak)
{
__shared__ __align__(16) uint32_t sharedMemory[1024];
cn_aes_gpu_init(sharedMemory);
__syncthreads();
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
if (thread < threads)
{
const uint32_t long_oft = (thread << LONG_SHL64) + sub;
const uint32_t st_oft = (thread * 26U) + sub + 8U;
const uint32_t batchsize = ITER >> (2 + bfactor);
const uint32_t start = partidx * batchsize;
const uint32_t end = start + batchsize;
uint64_t tweak = d_tweak[thread];
void * ctx_a = (void*)(&d_ctx_a[thread << 2]);
void * ctx_b = (void*)(&d_ctx_b[thread << 2]);
uint4 A = AS_UINT4(ctx_a); // ld.global.u32.v4
uint4 B = AS_UINT4(ctx_b);
uint64_t * long_state = &d_long_state[thread << LONG_SHL64];
for (int i = start; i < end; i++) // end = 262144
{
uint4 C;
uint32_t j = (A.x & E2I_MASK) >> 3;
cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &C);
store_variant2(&long_state[j], C ^ B); // st.global
MUL_SUM_XOR_DST_1((AS_UL2(&C)).x, A, &long_state[(C.x & E2I_MASK) >> 3], tweak);
j = (A.x & E2I_MASK) >> 3;
cn_aes_single_round_b((uint8_t*)sharedMemory, &long_state[j], A, &B);
store_variant2(&long_state[j], C ^ B);
MUL_SUM_XOR_DST_1((AS_UL2(&B)).x, A, &long_state[(B.x & E2I_MASK) >> 3], tweak);
}
if (bfactor) {
AS_UINT4(ctx_a) = A;
AS_UINT4(ctx_b) = B;
}
}
}
// --------------------------------------------------------------------------------------------------------------
uint4 key[10];
const uint32_t* ctx_key = &ctx_key2[thread * 40U];
#pragma unroll 10 // 160 bytes
for (int i = 0; i < 10; i++)
key[i] = AS_UINT4(&ctx_key[i*4U]);
__global__
void cryptonight_gpu_phase3(const uint32_t threads, const uint32_t * __restrict__ d_long_state,
uint32_t * __restrict__ d_ctx_state, const uint32_t * __restrict__ d_ctx_key2)
{
__shared__ uint32_t sharedMemory[1024];
cn_aes_gpu_init(sharedMemory);
__syncthreads();
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x) >> 3;
uint4 text = AS_UINT4(&ctx_state[st_oft]);
if(thread < threads)
{
const int sub = (threadIdx.x & 7) << 2;
const uint32_t *longstate = &d_long_state[(thread << LONG_SHL32) + sub];
uint32_t key[40], text[4];
MEMCPY8(key, d_ctx_key2 + thread * 40, 20);
MEMCPY8(text, d_ctx_state + thread * 50 + sub + 16, 2);
for(uint32_t i = 0; i < LONG_LOOPS64; i += 16U)
for(int i = 0; i < LONG_LOOPS32; i += 32)
{
uint4 st = AS_UINT4(&long_state[long_oft + i]);
text = text ^ st;
cn_aes_pseudo_round_mut_uint4(sharedMemory, text, key);
#pragma unroll
for(int j = 0; j < 4; ++j)
text[j] ^= longstate[i + j];
cn_aes_pseudo_round_mut(sharedMemory, text, key);
}
AS_UINT4(&ctx_state[st_oft]) = text;
MEMCPY8(d_ctx_state + thread * 50 + sub + 16, text, 2);
}
}
// --------------------------------------------------------------------------------------------------------------
extern int device_bfactor[MAX_GPUS];
__host__
void cryptonight_core_cuda(int thr_id, int blocks, int threads, uint64_t *d_long_state, uint64_t *d_ctx_state,
uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
void cryptonight_core_cuda(int thr_id, uint32_t blocks, uint32_t threads, uint64_t *d_long_state, uint32_t *d_ctx_state,
uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak)
{
dim3 grid(blocks);
dim3 block(threads);
//dim3 block2(threads << 1);
dim3 block4(threads << 2);
dim3 block8(threads << 3);
const uint32_t bfactor = (uint32_t) device_bfactor[thr_id];
const uint32_t partcount = 1 << bfactor;
const uint16_t bfactor = (uint16_t) device_bfactor[thr_id];
const uint32_t partcount = 1U << bfactor;
const uint32_t throughput = (uint32_t) (blocks*threads);
const int bsleep = bfactor ? 100 : 0;
const int dev_id = device_map[thr_id];
cryptonight_core_gpu_phase1 <<<grid, block8>>> (throughput, d_long_state, d_ctx_state, d_ctx_key1);
cryptonight_gpu_phase1 <<<grid, block8>>> (throughput, (uint32_t*) d_long_state, d_ctx_state, d_ctx_key1);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
if(partcount > 1) usleep(bsleep);
for (uint32_t i = 0; i < partcount; i++)
{
dim3 b = device_sm[dev_id] >= 300 ? block4 : block;
cryptonight_core_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
if (variant == 0)
cryptonight_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b);
else if (variant == 1 || cryptonight_fork == 8)
monero_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak);
else if (variant == 2 && cryptonight_fork == 3)
stellite_gpu_phase2 <<<grid, b>>> (throughput, bfactor, i, d_long_state, d_ctx_a, d_ctx_b, d_ctx_tweak);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
if(partcount > 1) usleep(bsleep);
}
cryptonight_core_gpu_phase3 <<<grid, block8>>> (throughput, d_long_state, d_ctx_state, d_ctx_key2);
//cudaDeviceSynchronize();
//exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cryptonight_gpu_phase3 <<<grid, block8>>> (throughput, (uint32_t*) d_long_state, d_ctx_state, d_ctx_key2);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
}

70
crypto/cryptonight-cpu.cpp

@@ -130,14 +130,14 @@ static void mul_sum_dst(const uint8_t* a, const uint8_t* b, const uint8_t* c, ui
((uint64_t*) dst)[0] += ((uint64_t*) c)[0];
}
static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst) {
static void mul_sum_xor_dst(const uint8_t* a, uint8_t* c, uint8_t* dst, const int variant, const uint64_t tweak) {
uint64_t hi, lo = mul128(((uint64_t*) a)[0], ((uint64_t*) dst)[0], &hi) + ((uint64_t*) c)[1];
hi += ((uint64_t*) c)[0];
((uint64_t*) c)[0] = ((uint64_t*) dst)[0] ^ hi;
((uint64_t*) c)[1] = ((uint64_t*) dst)[1] ^ lo;
((uint64_t*) dst)[0] = hi;
((uint64_t*) dst)[1] = lo;
((uint64_t*) dst)[1] = variant ? lo ^ tweak : lo;
}
static void copy_block(uint8_t* dst, const uint8_t* src) {
@@ -155,17 +155,34 @@ static void xor_blocks_dst(const uint8_t* a, const uint8_t* b, uint8_t* dst) {
((uint64_t*) dst)[1] = ((uint64_t*) a)[1] ^ ((uint64_t*) b)[1];
}
static void cryptonight_hash_ctx(void* output, const void* input, size_t len, struct cryptonight_ctx* ctx)
static void cryptonight_store_variant(void* state, int variant) {
if (variant == 1 || cryptonight_fork == 8) {
// monero and graft
const uint8_t tmp = ((const uint8_t*)(state))[11];
const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1;
((uint8_t*)(state))[11] = tmp ^ ((0x75310 >> index) & 0x30);
} else if (variant == 2 && cryptonight_fork == 3) {
// stellite
const uint8_t tmp = ((const uint8_t*)(state))[11];
const uint8_t index = (((tmp >> 4) & 6) | (tmp & 1)) << 1;
((uint8_t*)(state))[11] = tmp ^ ((0x75312 >> index) & 0x30);
}
}
static void cryptonight_hash_ctx(void* output, const void* input, const size_t len, struct cryptonight_ctx* ctx, const int variant)
{
size_t i, j;
keccak_hash_process(&ctx->state.hs, (const uint8_t*) input, len);
ctx->aes_ctx = (oaes_ctx*) oaes_alloc();
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
const uint64_t tweak = variant ? *((uint64_t*) (((uint8_t*)input) + 35)) ^ ctx->state.hs.w[24] : 0;
oaes_key_import_data(ctx->aes_ctx, ctx->state.hs.b, AES_KEY_SIZE);
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
#undef RND
#define RND(p) aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * p], ctx->aes_ctx->key->exp_data);
#undef RND
#define RND(p) aesb_pseudo_round_mut(&ctx->text[AES_BLOCK_SIZE * p], ctx->aes_ctx->key->exp_data);
RND(0);
RND(1);
RND(2);
@@ -184,22 +201,22 @@ static void cryptonight_hash_ctx(void* output, const void* input, size_t len, st
j = e2i(ctx->a) * AES_BLOCK_SIZE;
aesb_single_round(&ctx->long_state[j], ctx->c, ctx->a);
xor_blocks_dst(ctx->c, ctx->b, &ctx->long_state[j]);
mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c) * AES_BLOCK_SIZE]);
cryptonight_store_variant(&ctx->long_state[j], variant);
mul_sum_xor_dst(ctx->c, ctx->a, &ctx->long_state[e2i(ctx->c) * AES_BLOCK_SIZE], variant, tweak);
j = e2i(ctx->a) * AES_BLOCK_SIZE;
aesb_single_round(&ctx->long_state[j], ctx->b, ctx->a);
xor_blocks_dst(ctx->b, ctx->c, &ctx->long_state[j]);
mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b) * AES_BLOCK_SIZE]);
cryptonight_store_variant(&ctx->long_state[j], variant);
mul_sum_xor_dst(ctx->b, ctx->a, &ctx->long_state[e2i(ctx->b) * AES_BLOCK_SIZE], variant, tweak);
}
memcpy(ctx->text, ctx->state.init, INIT_SIZE_BYTE);
oaes_key_import_data(ctx->aes_ctx, &ctx->state.hs.b[32], AES_KEY_SIZE);
for (i = 0; likely(i < MEMORY); i += INIT_SIZE_BYTE) {
#undef RND
#define RND(p) xor_blocks(&ctx->text[p * AES_BLOCK_SIZE], &ctx->long_state[i + p * AES_BLOCK_SIZE]); \
aesb_pseudo_round_mut(&ctx->text[p * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
#undef RND
#define RND(p) xor_blocks(&ctx->text[p * AES_BLOCK_SIZE], &ctx->long_state[i + p * AES_BLOCK_SIZE]); \
aesb_pseudo_round_mut(&ctx->text[p * AES_BLOCK_SIZE], ctx->aes_ctx->key->exp_data);
RND(0);
RND(1);
RND(2);
@@ -219,9 +236,34 @@ static void cryptonight_hash_ctx(void* output, const void* input, size_t len, st
oaes_free((OAES_CTX **) &ctx->aes_ctx);
}
void cryptonight_hash(void* output, const void* input, size_t len)
void cryptonight_hash_variant(void* output, const void* input, size_t len, int variant)
{
struct cryptonight_ctx *ctx = (struct cryptonight_ctx*)malloc(sizeof(struct cryptonight_ctx));
cryptonight_hash_ctx(output, input, len, ctx);
cryptonight_hash_ctx(output, input, len, ctx, variant);
free(ctx);
}
void cryptonight_hash(void* output, const void* input)
{
cryptonight_fork = 1;
cryptonight_hash_variant(output, input, 76, 0);
}
void graft_hash(void* output, const void* input)
{
cryptonight_fork = 8;
cryptonight_hash_variant(output, input, 76, 1);
}
void monero_hash(void* output, const void* input)
{
cryptonight_fork = 7;
cryptonight_hash_variant(output, input, 76, 1);
}
void stellite_hash(void* output, const void* input)
{
cryptonight_fork = 3;
cryptonight_hash_variant(output, input, 76, 2);
}
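A hedged usage sketch (illustration only, assuming the miner.h declarations are in scope): the wrappers above simply preset the global cryptonight_fork and the variant before delegating to cryptonight_hash_variant(), so a CPU self-test reduces to:

void cn_selftest_example(void)
{
	unsigned char blob[76] = { 0 };   /* hypothetical 76-byte hashing blob */
	uint32_t hash[8];
	monero_hash(hash, blob);          /* cryptonight_fork = 7, variant 1 */
	graft_hash(hash, blob);           /* cryptonight_fork = 8, variant 1 */
	stellite_hash(hash, blob);        /* cryptonight_fork = 3, variant 2 */
}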

175
crypto/cryptonight-extra.cu

@@ -7,15 +7,15 @@
#include <miner.h>
#include <cuda_helper.h>
#include "cryptonight.h"
typedef uint8_t BitSequence;
typedef uint64_t DataLength;
#include "cryptonight.h"
static uint32_t *d_input[MAX_GPUS] = { 0 };
static uint32_t *d_input[MAX_GPUS];
static uint32_t *d_target[MAX_GPUS];
static uint32_t *d_result[MAX_GPUS];
typedef uint8_t BitSequence;
typedef uint32_t DataLength;
#include "cn_keccak.cuh"
#include "cn_blake.cuh"
#include "cn_groestl.cuh"
@@ -44,13 +44,11 @@ __constant__ uint8_t d_sub_byte[16][16] = {
__device__ __forceinline__
void cryptonight_aes_set_key(uint32_t * __restrict__ key, const uint32_t * __restrict__ data)
{
const uint32_t aes_gf[] = {
const uint32_t aes_gf[10] = {
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36
};
MEMSET4(key, 0, 40);
MEMCPY4(key, data, 8);
#pragma unroll
for(int i = 8; i < 40; i++)
{
@@ -74,15 +72,14 @@ void cryptonight_aes_set_key(uint32_t * __restrict__ key, const uint32_t * __res
}
__global__
void cryptonight_extra_gpu_prepare(const uint32_t threads, uint32_t * __restrict__ d_input, uint32_t startNonce,
uint64_t * d_ctx_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2)
void cryptonight_extra_gpu_prepare(const uint32_t threads, const uint32_t * __restrict__ d_input, uint32_t startNonce,
uint32_t * __restrict__ d_ctx_state, uint32_t * __restrict__ d_ctx_a, uint32_t * __restrict__ d_ctx_b,
uint32_t * __restrict__ d_ctx_key1, uint32_t * __restrict__ d_ctx_key2, int variant, uint64_t * d_ctx_tweak)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if(thread < threads)
{
uint32_t ctx_state[50];
uint64_t ctx_state[25];
uint32_t ctx_a[4];
uint32_t ctx_b[4];
uint32_t ctx_key1[40];
@@ -90,92 +87,62 @@ void cryptonight_extra_gpu_prepare(const uint32_t threads, uint32_t * __restrict
uint32_t input[19];
MEMCPY4(input, d_input, 19);
*((uint32_t *)(((char *)input) + 39)) = startNonce + thread;
cn_keccak((uint8_t *)input, (uint8_t *)ctx_state);
cryptonight_aes_set_key(ctx_key1, ctx_state);
cryptonight_aes_set_key(ctx_key2, ctx_state + 8);
XOR_BLOCKS_DST(ctx_state, ctx_state + 8, ctx_a);
XOR_BLOCKS_DST(ctx_state + 4, ctx_state + 12, ctx_b);
MEMCPY8(&d_ctx_state[thread * 26], ctx_state, 25);
MEMCPY4(d_ctx_a + thread * 4, ctx_a, 4);
MEMCPY4(d_ctx_b + thread * 4, ctx_b, 4);
MEMCPY4(d_ctx_key1 + thread * 40, ctx_key1, 40);
MEMCPY4(d_ctx_key2 + thread * 40, ctx_key2, 40);
}
}
__global__
void cryptonight_extra_gpu_keccak(uint32_t threads, uint32_t * d_ctx_state)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if(thread < threads)
{
uint64_t* ctx_state = (uint64_t*) (&d_ctx_state[thread * 52U]);
uint64_t state[25];
#pragma unroll
for(int i = 0; i < 25; i++)
state[i] = ctx_state[i];
cn_keccakf2(state);
// to reduce the final kernel stack frame, cut algos in 2 kernels
// ps: these 2 final kernels are not important for the overall xmr hashrate (< 1%)
switch (((uint8_t*)state)[0] & 0x03)
{
case 0: {
uint32_t hash[8];
cn_blake((uint8_t*)state, 200, (uint8_t*)hash);
((uint32_t*)ctx_state)[0] = 0;
((uint32_t*)ctx_state)[6] = hash[6];
((uint32_t*)ctx_state)[7] = hash[7];
break;
}
case 1: {
uint32_t hash[8];
cn_groestl((BitSequence*)state, 200, (BitSequence*)hash);
((uint32_t*)ctx_state)[0] = 0;
((uint32_t*)ctx_state)[6] = hash[6];
((uint32_t*)ctx_state)[7] = hash[7];
break;
}
default: {
#pragma unroll
for(int i = 0; i < 25; i++)
ctx_state[i] = state[i];
}
uint32_t nonce = startNonce + thread;
*(((uint8_t *)input) + 39) = nonce & 0xff;
*(((uint8_t *)input) + 40) = (nonce >> 8) & 0xff;
*(((uint8_t *)input) + 41) = (nonce >> 16) & 0xff;
*(((uint8_t *)input) + 42) = (nonce >> 24) & 0xff;
cn_keccak(input, ctx_state);
MEMCPY4(&d_ctx_state[thread * 50U], ctx_state, 50);
cryptonight_aes_set_key(ctx_key1, (uint32_t*)(&ctx_state[0]));
cryptonight_aes_set_key(ctx_key2, (uint32_t*)(&ctx_state[4]));
MEMCPY4(&d_ctx_key1[thread * 40U], ctx_key1, 40);
MEMCPY4(&d_ctx_key2[thread * 40U], ctx_key2, 40);
XOR_BLOCKS_DST(&ctx_state[0], &ctx_state[4], ctx_a);
XOR_BLOCKS_DST(&ctx_state[2], &ctx_state[6], ctx_b);
MEMCPY4(&d_ctx_a[thread * 4U], ctx_a, 4);
MEMCPY4(&d_ctx_b[thread * 4U], ctx_b, 4);
if (variant) {
uint2 tweak = AS_UINT2(&ctx_state[24]);
//tweak.x ^= (input[8] >> 24) | (input[9] << 8);
tweak.x ^= __byte_perm(input[8], input[ 9], 0x6543);
tweak.y ^= __byte_perm(input[9], input[10], 0x6543);
MEMCPY4(&d_ctx_tweak[thread], &tweak, 2);
}
}
}
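A hedged host-side sketch of the tweak setup above (little-endian host assumed): the __byte_perm() selectors 0x6543 just reassemble the 8 bytes at blob offset 35, which the CPU path in cryptonight-cpu.cpp reads directly and XORs with the last keccak state word.

#include <stdint.h>
#include <string.h>

static uint64_t tweak1_2(const uint8_t *blob, uint64_t keccak_state_w24)
{
	uint64_t t;
	memcpy(&t, blob + 35, sizeof(t));   /* bytes 35..42, unaligned-safe */
	return t ^ keccak_state_w24;        /* same XOR as ctx_state[24] above */
}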
__global__
void cryptonight_extra_gpu_final(uint32_t threads, const uint32_t startNonce, uint64_t * __restrict__ d_ctx_state,
const uint32_t* d_target, uint32_t * resNonces)
void cryptonight_extra_gpu_final(const uint32_t threads, uint32_t startNonce, const uint32_t * __restrict__ d_target,
uint32_t * __restrict__ resNonces, uint32_t * __restrict__ d_ctx_state)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
if(thread < threads)
{
uint64_t* const state = &d_ctx_state[thread * 26U];
uint32_t *ctx_state = &d_ctx_state[thread * 50U];
uint32_t hash[8];
switch(((uint8_t *)state)[0] & 0x03)
{
case 0: {
uint32_t* h32 = (uint32_t*)state;
hash[6] = h32[6];
hash[7] = h32[7];
break;
}
case 2: {
cn_jh256((uint8_t*)state, 200, hash);
break;
}
case 3: {
cn_skein((uint8_t*)state, 200, hash);
break;
}
}
uint32_t state[50];
#pragma unroll 25
for(int i = 0; i < 50; i+=2)
AS_UINT2(&state[i]) = AS_UINT2(&ctx_state[i]);
cn_keccakf2((uint64_t *)state);
int branch = ((uint8_t *)state)[0] & 0x03;
if(branch == 0)
cn_blake((const uint8_t *)state, 200, hash);
if(branch == 1)
cn_groestl((const uint8_t *)state, 200, hash);
if(branch == 2)
cn_jh((const uint8_t *)state, 200, hash);
if(branch == 3)
cn_skein((const uint8_t *)state, 200, hash);
if(hash[7] <= d_target[1] && hash[6] <= d_target[0])
{
@@ -188,55 +155,53 @@ void cryptonight_extra_gpu_final(uint32_t threads, const uint32_t startNonce, ui
}
__host__
void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *ptarget)
void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget)
{
uint32_t *pTargetIn = (uint32_t*) ptarget;
cudaMemcpy(d_input[thr_id], data, 19 * sizeof(uint32_t), cudaMemcpyHostToDevice);
cudaMemcpy(d_target[thr_id], &pTargetIn[6], 2*sizeof(uint32_t), cudaMemcpyHostToDevice);
cudaMemcpy(d_input[thr_id], data, 20 * sizeof(uint32_t), cudaMemcpyHostToDevice);
cudaMemcpy(d_target[thr_id], &pTargetIn[6], 2 * sizeof(uint32_t), cudaMemcpyHostToDevice);
cudaMemset(d_result[thr_id], 0xFF, 2 * sizeof(uint32_t));
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
}
__host__
void cryptonight_extra_cpu_init(int thr_id, uint32_t threads)
void cryptonight_extra_init(int thr_id)
{
cudaMalloc(&d_input[thr_id], 19 * sizeof(uint32_t));
cudaMalloc(&d_target[thr_id], 2*sizeof(uint32_t));
cudaMalloc(&d_result[thr_id], 2*sizeof(uint32_t));
cudaMalloc(&d_input[thr_id], 20 * sizeof(uint32_t));
cudaMalloc(&d_target[thr_id], 2 * sizeof(uint32_t));
cudaMalloc(&d_result[thr_id], 2 * sizeof(uint32_t));
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
}
__host__
void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2)
void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak)
{
uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
cryptonight_extra_gpu_prepare <<<grid, block >>> (threads, d_input[thr_id], startNonce, d_ctx_state, d_ctx_a, d_ctx_b, d_ctx_key1, d_ctx_key2);
cryptonight_extra_gpu_prepare <<<grid, block>>> (threads, d_input[thr_id], startNonce, d_ctx_state, d_ctx_a, d_ctx_b, d_ctx_key1, d_ctx_key2, variant, d_ctx_tweak);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
}
__host__
void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resnonce, uint64_t *d_ctx_state)
void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces, uint32_t *d_ctx_state)
{
uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
cudaMemset(d_result[thr_id], 0xFF, 2*sizeof(uint32_t));
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cryptonight_extra_gpu_keccak <<<grid, block >>> (threads, (uint32_t*)d_ctx_state);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cryptonight_extra_gpu_final <<<grid, block >>> (threads, startNonce, d_ctx_state, d_target[thr_id], d_result[thr_id]);
cryptonight_extra_gpu_final <<<grid, block>>> (threads, startNonce, d_target[thr_id], d_result[thr_id], d_ctx_state);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMemcpy(resnonce, d_result[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
cudaMemcpy(resNonces, d_result[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
}
__host__
void cryptonight_extra_cpu_free(int thr_id)
void cryptonight_extra_free(int thr_id)
{
if (d_input[thr_id]) {
cudaFree(d_input[thr_id]);
@@ -244,4 +209,4 @@ void cryptonight_extra_cpu_free(int thr_id)
cudaFree(d_result[thr_id]);
d_input[thr_id] = NULL;
}
}
}

37
crypto/cryptonight.cu

@@ -12,16 +12,17 @@ static __thread bool gpu_init_shown = false;
gpulog(p, thr, fmt, ##__VA_ARGS__)
static uint64_t *d_long_state[MAX_GPUS];
static uint64_t *d_ctx_state[MAX_GPUS];
static uint32_t *d_ctx_state[MAX_GPUS];
static uint32_t *d_ctx_key1[MAX_GPUS];
static uint32_t *d_ctx_key2[MAX_GPUS];
static uint32_t *d_ctx_text[MAX_GPUS];
static uint64_t *d_ctx_tweak[MAX_GPUS];
static uint32_t *d_ctx_a[MAX_GPUS];
static uint32_t *d_ctx_b[MAX_GPUS];
static bool init[MAX_GPUS] = { 0 };
extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant)
{
int res = 0;
uint32_t throughput = 0;
@@ -49,6 +50,13 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
gpulog_init(LOG_INFO, thr_id, "%s, %d MB available, %hd SMX", device_name[dev_id],
mem, device_mpcount[dev_id]);
if (!device_config[thr_id]) {
if(strcmp(device_name[dev_id], "TITAN V") == 0)
device_config[thr_id] = strdup("80x24");
if(strstr(device_name[dev_id], "V100"))
device_config[thr_id] = strdup("80x24");
}
if (device_config[thr_id]) {
int res = sscanf(device_config[thr_id], "%ux%u", &cn_blocks, &cn_threads);
throughput = cuda_default_throughput(thr_id, cn_blocks*cn_threads);
@@ -70,7 +78,7 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
exit(1);
}
cudaSetDevice(device_map[thr_id]);
cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
@@ -78,12 +86,12 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
CUDA_LOG_ERROR();
}
const size_t alloc = MEMORY * throughput;
cryptonight_extra_cpu_init(thr_id, throughput);
const size_t alloc = MEMORY * size_t(throughput);
cryptonight_extra_init(thr_id);
cudaMalloc(&d_long_state[thr_id], alloc);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_state[thr_id], 208 * throughput); // 52*4 (200 is not aligned 16)
cudaMalloc(&d_ctx_state[thr_id], 50 * sizeof(uint32_t) * throughput);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_key1[thr_id], 40 * sizeof(uint32_t) * throughput);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
@@ -95,6 +103,8 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_b[thr_id], 4 * sizeof(uint32_t) * throughput);
exit_if_cudaerror(thr_id, __FUNCTION__, __LINE__);
cudaMalloc(&d_ctx_tweak[thr_id], sizeof(uint64_t) * throughput);
exit_if_cudaerror(thr_id, __FILE__, __LINE__);
gpu_init_shown = true;
init[thr_id] = true;
@@ -107,10 +117,10 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
const uint32_t Htarg = ptarget[7];
uint32_t resNonces[2] = { UINT32_MAX, UINT32_MAX };
cryptonight_extra_cpu_setData(thr_id, pdata, ptarget);
cryptonight_extra_cpu_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
cryptonight_core_cuda(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id]);
cryptonight_extra_cpu_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);
cryptonight_extra_setData(thr_id, pdata, ptarget);
cryptonight_extra_prepare(thr_id, throughput, nonce, d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
cryptonight_core_cuda(thr_id, cn_blocks, cn_threads, d_long_state[thr_id], d_ctx_state[thr_id], d_ctx_a[thr_id], d_ctx_b[thr_id], d_ctx_key1[thr_id], d_ctx_key2[thr_id], variant, d_ctx_tweak[thr_id]);
cryptonight_extra_final(thr_id, throughput, nonce, resNonces, d_ctx_state[thr_id]);
*hashes_done = nonce - first_nonce + throughput;
@@ -121,7 +131,7 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
uint32_t *tempnonceptr = (uint32_t*)(((char*)tempdata) + 39);
memcpy(tempdata, pdata, 76);
*tempnonceptr = resNonces[0];
cryptonight_hash(vhash, tempdata, 76);
cryptonight_hash_variant(vhash, tempdata, 76, variant);
if(vhash[7] <= Htarg && fulltest(vhash, ptarget))
{
res = 1;
@@ -131,7 +141,7 @@ extern "C" int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_
if(resNonces[1] != UINT32_MAX)
{
*tempnonceptr = resNonces[1];
cryptonight_hash(vhash, tempdata, 76);
cryptonight_hash_variant(vhash, tempdata, 76, variant);
if(vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
res++;
work->nonces[1] = resNonces[1];
@@ -174,10 +184,11 @@ void free_cryptonight(int thr_id)
cudaFree(d_ctx_key1[thr_id]);
cudaFree(d_ctx_key2[thr_id]);
cudaFree(d_ctx_text[thr_id]);
cudaFree(d_ctx_tweak[thr_id]);
cudaFree(d_ctx_a[thr_id]);
cudaFree(d_ctx_b[thr_id]);
cryptonight_extra_cpu_free(thr_id);
cryptonight_extra_free(thr_id);
cudaDeviceSynchronize();

13
crypto/cryptonight.h

@@ -20,7 +20,6 @@ struct uint3 blockDim;
#define __umul64hi(a,b) a*b
#endif
#define MEMORY (1U << 21) // 2 MiB / 2097152 B
#define ITER (1U << 20) // 1048576
#define E2I_MASK 0x1FFFF0u
@@ -136,10 +135,10 @@ static inline void exit_if_cudaerror(int thr_id, const char *src, int line)
exit(1);
}
}
void cryptonight_core_cuda(int thr_id, int blocks, int threads, uint64_t *d_long_state, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
void cryptonight_core_cuda(int thr_id, uint32_t blocks, uint32_t threads, uint64_t *d_long_state, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);
void cryptonight_extra_cpu_setData(int thr_id, const void *data, const void *pTargetIn);
void cryptonight_extra_cpu_init(int thr_id, uint32_t threads);
void cryptonight_extra_cpu_free(int thr_id);
void cryptonight_extra_cpu_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2);
void cryptonight_extra_cpu_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *nonce, uint64_t *d_ctx_state);
void cryptonight_extra_setData(int thr_id, const void *data, const void *ptarget);
void cryptonight_extra_init(int thr_id);
void cryptonight_extra_free(int thr_id);
void cryptonight_extra_prepare(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_ctx_state, uint32_t *d_ctx_a, uint32_t *d_ctx_b, uint32_t *d_ctx_key1, uint32_t *d_ctx_key2, int variant, uint64_t *d_ctx_tweak);
void cryptonight_extra_final(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resnonce, uint32_t *d_ctx_state);

10
crypto/xmr-rpc.cpp

@@ -550,18 +550,24 @@ bool rpc2_stratum_submit(struct pool_infos *pool, struct work *work)
}
else if (opt_algo == ALGO_CRYPTOLIGHT) {
int variant = 1;
uint32_t nonce = work->nonces[idnonce];
noncestr = bin2hex((unsigned char*) &nonce, 4);
last_found_nonce = nonce;
cryptolight_hash(hash, data, 76);
//if (cryptonight_fork > 1 && ((unsigned char*)work->data)[0] >= cryptonight_fork)
// variant = ((unsigned char*)work->data)[0] - cryptonight_fork + 1;
cryptolight_hash_variant(hash, data, 76, variant);
work_set_target_ratio(work, (uint32_t*) hash);
}
else if (opt_algo == ALGO_CRYPTONIGHT) {
int variant = 0;
uint32_t nonce = work->nonces[idnonce];
noncestr = bin2hex((unsigned char*) &nonce, 4);
last_found_nonce = nonce;
cryptonight_hash(hash, data, 76);
if (cryptonight_fork > 1 && ((unsigned char*)work->data)[0] >= cryptonight_fork)
variant = ((unsigned char*)work->data)[0] - cryptonight_fork + 1;
cryptonight_hash_variant(hash, data, 76, variant);
work_set_target_ratio(work, (uint32_t*) hash);
}

2
equi/equi-stratum.cpp

@@ -101,7 +101,7 @@ bool equi_stratum_set_target(struct stratum_ctx *sctx, json_t *params)
target_be[31-i] = target_bin[i];
if (target_bin[i]) filled++;
}
memcpy(sctx->job.claim, target_be, 32); // hack, unused struct field
memcpy(sctx->job.extra, target_be, 32);
pthread_mutex_lock(&stratum_work_lock);
sctx->next_diff = target_to_diff_equi((uint32_t*) &target_be);

3
equi/equihash.cpp

@@ -183,8 +183,7 @@ extern "C" int scanhash_equihash(int thr_id, struct work *work, uint32_t max_non
return -1;
}
size_t memSz = solvers[thr_id]->equi_mem_sz / (1024*1024);
gpus_intensity[thr_id] = (uint32_t) solvers[thr_id]->throughput;
api_set_throughput(thr_id, gpus_intensity[thr_id]);
api_set_throughput(thr_id, (uint32_t) solvers[thr_id]->throughput);
gpulog(LOG_DEBUG, thr_id, "Allocated %u MB of context memory", (u32) memSz);
cuda_get_arch(thr_id);
init[thr_id] = true;

173
lyra2/Lyra2.c

@@ -212,3 +212,176 @@ int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *sa
return 0;
}
int LYRA2_3(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols)
{
//============================= Basic variables ============================//
int64_t row = 2; //index of row to be processed
int64_t prev = 1; //index of prev (last row ever computed/modified)
int64_t rowa = 0; //index of row* (a previous row, deterministically picked during Setup and randomly picked while Wandering)
int64_t tau; //Time Loop iterator
int64_t step = 1; //Visitation step (used during Setup and Wandering phases)
int64_t window = 2; //Visitation window (used to define which rows can be revisited during Setup)
int64_t gap = 1; //Modifier to the step, assuming the values 1 or -1
int64_t i; //auxiliary iteration counter
int64_t v64; // 64bit var for memcpy
uint64_t instance = 0;
//==========================================================================/
//========== Initializing the Memory Matrix and pointers to it =============//
//Tries to allocate enough space for the whole memory matrix
const int64_t ROW_LEN_INT64 = BLOCK_LEN_INT64 * nCols;
const int64_t ROW_LEN_BYTES = ROW_LEN_INT64 * 8;
// for Lyra2REv2, nCols = 4, v1 was using 8
const int64_t BLOCK_LEN = (nCols == 4) ? BLOCK_LEN_BLAKE2_SAFE_INT64 : BLOCK_LEN_BLAKE2_SAFE_BYTES;
size_t sz = (size_t)ROW_LEN_BYTES * nRows;
uint64_t *wholeMatrix = malloc(sz);
if (wholeMatrix == NULL) {
return -1;
}
memset(wholeMatrix, 0, sz);
//Allocates pointers to each row of the matrix
uint64_t **memMatrix = malloc(sizeof(uint64_t*) * nRows);
if (memMatrix == NULL) {
return -1;
}
//Places the pointers in the correct positions
uint64_t *ptrWord = wholeMatrix;
for (i = 0; i < nRows; i++) {
memMatrix[i] = ptrWord;
ptrWord += ROW_LEN_INT64;
}
//==========================================================================/
//============= Getting the password + salt + basil padded with 10*1 ===============//
//OBS.:The memory matrix will temporarily hold the password: not for saving memory,
//but this ensures that the password copied locally will be overwritten as soon as possible
//First, we clean enough blocks for the password, salt, basil and padding
int64_t nBlocksInput = ((saltlen + pwdlen + 6 * sizeof(uint64_t)) / BLOCK_LEN_BLAKE2_SAFE_BYTES) + 1;
byte *ptrByte = (byte*) wholeMatrix;
//Prepends the password
memcpy(ptrByte, pwd, pwdlen);
ptrByte += pwdlen;
//Concatenates the salt
memcpy(ptrByte, salt, saltlen);
ptrByte += saltlen;
memset(ptrByte, 0, (size_t) (nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - (saltlen + pwdlen)));
//Concatenates the basil: every integer passed as parameter, in the order they are provided by the interface
memcpy(ptrByte, &kLen, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = pwdlen;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = saltlen;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = timeCost;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = nRows;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
v64 = nCols;
memcpy(ptrByte, &v64, sizeof(int64_t));
ptrByte += sizeof(uint64_t);
//Now comes the padding
*ptrByte = 0x80; //first byte of padding: right after the password
ptrByte = (byte*) wholeMatrix; //resets the pointer to the start of the memory matrix
ptrByte += nBlocksInput * BLOCK_LEN_BLAKE2_SAFE_BYTES - 1; //sets the pointer to the correct position: end of incomplete block
*ptrByte ^= 0x01; //last byte of padding: at the end of the last incomplete block
//==========================================================================/
//======================= Initializing the Sponge State ====================//
//Sponge state: 16 uint64_t, BLOCK_LEN_INT64 words of them for the bitrate (b) and the remainder for the capacity (c)
uint64_t state[16];
initState(state);
//==========================================================================/
//================================ Setup Phase =============================//
//Absorbing salt, password and basil: this is the only place in which the block length is hard-coded to 512 bits
ptrWord = wholeMatrix;
for (i = 0; i < nBlocksInput; i++) {
absorbBlockBlake2Safe(state, ptrWord); //absorbs each block of pad(pwd || salt || basil)
ptrWord += BLOCK_LEN; //goes to next block of pad(pwd || salt || basil)
}
//Initializes M[0] and M[1]
reducedSqueezeRow0(state, memMatrix[0], nCols); //The locally copied password is most likely overwritten here
reducedDuplexRow1(state, memMatrix[0], memMatrix[1], nCols);
do {
//M[row] = rand; //M[row*] = M[row*] XOR rotW(rand)
reducedDuplexRowSetup(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
//updates the value of row* (deterministically picked during Setup))
rowa = (rowa + step) & (window - 1);
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
row++;
//Checks if all rows in the window were visited.
if (rowa == 0) {
step = window + gap; //changes the step: approximately doubles its value
window *= 2; //doubles the size of the re-visitation window
gap = -gap; //inverts the modifier to the step
}
} while (row < nRows);
//==========================================================================/
//============================ Wandering Phase =============================//
row = 0; //Resets the visitation to the first row of the memory matrix
for (tau = 1; tau <= timeCost; tau++) {
//Step is approximately half the number of all rows of the memory matrix for an odd tau; otherwise, it is -1
step = ((tau & 1) == 0) ? -1 : (nRows >> 1) - 1;
do {
//Selects a pseudorandom index row* (the only change in REv3)
//------------------------------------------------------------------------------------------
instance = state[instance & 0xF];
rowa = state[instance & 0xF] & (unsigned int)(nRows-1);
//rowa = state[0] & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
//rowa = state[0] % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//------------------------------------------------------------------------------------------
//Performs a reduced-round duplexing operation over M[row*] XOR M[prev], updating both M[row*] and M[row]
reducedDuplexRow(state, memMatrix[prev], memMatrix[rowa], memMatrix[row], nCols);
//update prev: it now points to the last row ever computed
prev = row;
//updates row: goes to the next row to be computed
//------------------------------------------------------------------------------------------
row = (row + step) & (unsigned int)(nRows-1); //(USE THIS IF nRows IS A POWER OF 2)
//row = (row + step) % nRows; //(USE THIS FOR THE "GENERIC" CASE)
//------------------------------------------------------------------------------------------
} while (row != 0);
}
//============================ Wrap-up Phase ===============================//
//Absorbs the last block of the memory matrix
absorbBlock(state, memMatrix[rowa]);
//Squeezes the key
squeeze(state, K, (unsigned int) kLen);
//========================= Freeing the memory =============================//
free(memMatrix);
free(wholeMatrix);
return 0;
}
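
The only functional change in the wandering phase above is how the pseudorandom row* is picked: instead of reading a fixed word of the sponge state, the v3 variant chains two state-dependent lookups through an 'instance' value. A minimal C sketch of both selection rules for a power-of-two nRows (the pick_rowa_* helpers are illustrative, not part of the tree):

#include <stdint.h>

/* Original Lyra2 rule: row* is taken straight from the first sponge word. */
static unsigned int pick_rowa_v1(const uint64_t state[16], unsigned int nRows)
{
	return (unsigned int)(state[0] & (uint64_t)(nRows - 1)); /* nRows must be a power of 2 */
}

/* Lyra2REv3 rule: one state word selects another state word, which selects row*.
 * 'instance' persists across iterations of the wandering loop. */
static unsigned int pick_rowa_v3(const uint64_t state[16], uint64_t *instance, unsigned int nRows)
{
	*instance = state[*instance & 0xF];
	return (unsigned int)(state[*instance & 0xF] & (uint64_t)(nRows - 1));
}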

1
lyra2/Lyra2.h

@@ -38,5 +38,6 @@ typedef unsigned char byte;
#endif
int LYRA2(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
int LYRA2_3(void *K, int64_t kLen, const void *pwd, int32_t pwdlen, const void *salt, int32_t saltlen, int64_t timeCost, const int16_t nRows, const int16_t nCols);
#endif /* LYRA2_H_ */
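
Both entry points derive kLen bytes of key material from a password/salt pair; the CUDA chains in this commit call them with a 32-byte digest as both password and salt. A minimal usage sketch mirroring those parameter sets (timeCost=1 with an 8x8 matrix for allium/lyra2RE, 4x4 for lyra2REv3; the wrapper function is illustrative):

#include <stdint.h>
#include "lyra2/Lyra2.h"

static void kdf_example(const uint8_t in32[32], uint8_t out32[32])
{
	/* allium / lyra2RE style call: timeCost = 1, 8x8 row/column matrix */
	LYRA2(out32, 32, in32, 32, in32, 32, 1, 8, 8);

	/* lyra2REv3 style call: timeCost = 1, 4x4 row/column matrix */
	LYRA2_3(out32, 32, in32, 32, in32, 32, 1, 4, 4);
}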

217
lyra2/allium.cu

@@ -0,0 +1,217 @@
extern "C" {
#include "sph/sph_blake.h"
#include "sph/sph_keccak.h"
#include "sph/sph_cubehash.h"
#include "sph/sph_skein.h"
#include "sph/sph_groestl.h"
#include "lyra2/Lyra2.h"
}
#include <miner.h>
#include <cuda_helper.h>
static uint64_t* d_hash[MAX_GPUS];
static uint64_t* d_matrix[MAX_GPUS];
extern void blake256_cpu_init(int thr_id, uint32_t threads);
extern void blake256_cpu_setBlock_80(uint32_t *pdata);
//extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
//extern void keccak256_sm3_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
//extern void keccak256_sm3_init(int thr_id, uint32_t threads);
//extern void keccak256_sm3_free(int thr_id);
extern void blakeKeccak256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order);
extern void skein256_cpu_init(int thr_id, uint32_t threads);
extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix);
extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_outputHash, bool gtx750ti);
extern void groestl256_cpu_init(int thr_id, uint32_t threads);
extern void groestl256_cpu_free(int thr_id);
extern void groestl256_setTarget(const void *ptarget);
extern uint32_t groestl256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_outputHash, int order);
extern uint32_t groestl256_getSecNonce(int thr_id, int num);
extern "C" void allium_hash(void *state, const void *input)
{
uint32_t hashA[8], hashB[8];
sph_blake256_context ctx_blake;
sph_keccak256_context ctx_keccak;
sph_cubehash256_context ctx_cube;
sph_skein256_context ctx_skein;
sph_groestl256_context ctx_groestl;
sph_blake256_set_rounds(14);
sph_blake256_init(&ctx_blake);
sph_blake256(&ctx_blake, input, 80);
sph_blake256_close(&ctx_blake, hashA);
sph_keccak256_init(&ctx_keccak);
sph_keccak256(&ctx_keccak, hashA, 32);
sph_keccak256_close(&ctx_keccak, hashB);
LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);
sph_cubehash256_init(&ctx_cube);
sph_cubehash256(&ctx_cube, hashA, 32);
sph_cubehash256_close(&ctx_cube, hashB);
LYRA2(hashA, 32, hashB, 32, hashB, 32, 1, 8, 8);
sph_skein256_init(&ctx_skein);
sph_skein256(&ctx_skein, hashA, 32);
sph_skein256_close(&ctx_skein, hashB);
sph_groestl256_init(&ctx_groestl);
sph_groestl256(&ctx_groestl, hashB, 32);
sph_groestl256_close(&ctx_groestl, hashA);
memcpy(state, hashA, 32);
}
static bool init[MAX_GPUS] = { 0 };
static __thread uint32_t throughput = 0;
extern "C" int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
if (opt_benchmark)
ptarget[7] = 0x00ff;
static __thread bool gtx750ti;
if (!init[thr_id])
{
int dev_id = device_map[thr_id];
cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
CUDA_LOG_ERROR();
}
int intensity = (device_sm[dev_id] >= 500 && !is_windows()) ? 17 : 16;
if (device_sm[device_map[thr_id]] == 500) intensity = 15;
throughput = cuda_default_throughput(thr_id, 1U << intensity); // 18=256*256*4;
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
cudaDeviceProp props;
cudaGetDeviceProperties(&props, dev_id);
if (strstr(props.name, "750 Ti")) gtx750ti = true;
else gtx750ti = false;
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
blake256_cpu_init(thr_id, throughput);
//keccak256_sm3_init(thr_id, throughput);
skein256_cpu_init(thr_id, throughput);
groestl256_cpu_init(thr_id, throughput);
//cuda_get_arch(thr_id);
if (device_sm[dev_id] >= 500)
{
size_t matrix_sz = device_sm[dev_id] > 500 ? sizeof(uint64_t) * 4 * 4 : sizeof(uint64_t) * 8 * 8 * 3 * 4;
CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput));
lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]);
}
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));
init[thr_id] = true;
}
uint32_t _ALIGN(128) endiandata[20];
for (int k=0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
blake256_cpu_setBlock_80(pdata);
groestl256_setTarget(ptarget);
do {
int order = 0;
//blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
//keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
lyra2_cpu_hash_32(thr_id, throughput, d_hash[thr_id], gtx750ti);
cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
lyra2_cpu_hash_32(thr_id, throughput, d_hash[thr_id], gtx750ti);
skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
*hashes_done = pdata[19] - first_nonce + throughput;
work->nonces[0] = groestl256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
if (work->nonces[0] != UINT32_MAX)
{
const uint32_t Htarg = ptarget[7];
uint32_t _ALIGN(64) vhash[8];
be32enc(&endiandata[19], work->nonces[0]);
allium_hash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
work->valid_nonces = 1;
work_set_target_ratio(work, vhash);
work->nonces[1] = groestl256_getSecNonce(thr_id, 1);
if (work->nonces[1] != UINT32_MAX) {
be32enc(&endiandata[19], work->nonces[1]);
allium_hash(vhash, endiandata);
bn_set_target_ratio(work, vhash, 1);
work->valid_nonces++;
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
} else {
pdata[19] = work->nonces[0] + 1; // cursor
}
return work->valid_nonces;
}
else if (vhash[7] > Htarg) {
gpu_increment_reject(thr_id);
if (!opt_quiet)
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
pdata[19] = work->nonces[0] + 1;
continue;
}
}
if ((uint64_t)throughput + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}
pdata[19] += throughput;
} while (!work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce;
return 0;
}
// cleanup
extern "C" void free_allium(int thr_id)
{
if (!init[thr_id])
return;
cudaThreadSynchronize();
cudaFree(d_hash[thr_id]);
cudaFree(d_matrix[thr_id]);
//keccak256_sm3_free(thr_id);
groestl256_cpu_free(thr_id);
init[thr_id] = false;
cudaDeviceSynchronize();
}
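
The device buffers allocated above scale linearly with the launch throughput (1U << intensity nonces per kernel pass). A rough footprint sketch for the default intensity 17, covering only the allocations visible in this file:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint32_t throughput = 1U << 17;             /* default allium intensity on SM >= 5.0, non-Windows */
	const size_t hash_buf    = (size_t)32 * throughput;                       /* d_hash: 32 bytes per nonce */
	const size_t matrix_sm52 = sizeof(uint64_t) * 4 * 4 * throughput;         /* d_matrix, SM > 5.0 path    */
	const size_t matrix_sm50 = sizeof(uint64_t) * 8 * 8 * 3 * 4 * throughput; /* d_matrix, SM == 5.0 path   */

	printf("d_hash   : %zu bytes\n", hash_buf);
	printf("d_matrix : %zu (sm>5.0) / %zu (sm5.0) bytes\n", matrix_sm52, matrix_sm50);
	return 0;
}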

148
lyra2/cuda_lyra2.cu

@@ -1,6 +1,7 @@
/**
* Lyra2 (v1) cuda implementation based on djm34 work
* tpruvot@github 2015, Nanashi 08/2016 (from 1.8-r2)
* tpruvot@github 2018 for phi2 double lyra2-32 support
*/
#include <stdio.h>
@@ -228,9 +229,7 @@ void reduceDuplex(uint2 state[4], uint32_t thread, const uint32_t threads)
{
uint2 state1[3];
#if __CUDA_ARCH__ > 500
#pragma unroll
#endif
#pragma unroll
for (int i = 0; i < Nrow; i++)
{
ST4S(0, Ncol - i - 1, state, thread, threads);
@@ -305,7 +304,7 @@ void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uin
LD4S(state1, rowIn, i, thread, threads);
LD4S(state2, rowInOut, i, thread, threads);
#pragma unroll
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
@@ -334,7 +333,7 @@ void reduceDuplexRowt(const int rowIn, const int rowInOut, const int rowOut, uin
LD4S(state1, rowOut, i, thread, threads);
#pragma unroll
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] ^= state[j];
@@ -409,14 +408,12 @@ __constant__ uint2x4 blake2b_IV[2] = {
};
__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint2x4 state[4];
state[0].x = state[1].x = __ldg(&g_hash[thread + threads * 0]);
state[0].y = state[1].y = __ldg(&g_hash[thread + threads * 1]);
state[0].z = state[1].z = __ldg(&g_hash[thread + threads * 2]);
@@ -436,10 +433,9 @@ void lyra2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
__global__
__launch_bounds__(TPB52, 1)
void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
void lyra2_gpu_hash_32_2(const uint32_t threads, uint64_t *g_hash)
{
const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y;
if (thread < threads)
{
uint2 state[4];
@@ -481,14 +477,12 @@ void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_has
}
__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash)
{
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
uint28 state[4];
if (thread < threads)
{
uint2x4 state[4];
state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]);
state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]);
state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]);
@@ -501,8 +495,58 @@ void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
g_hash[thread + threads * 1] = state[0].y;
g_hash[thread + threads * 2] = state[0].z;
g_hash[thread + threads * 3] = state[0].w;
}
}
__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_64_1(uint32_t threads, uint2* const d_hash_512, const uint32_t round)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint2x4 state[4];
const size_t offset = (size_t)8 * thread + (round * 4U);
uint2 *psrc = (uint2*)(&d_hash_512[offset]);
state[0].x = state[1].x = __ldg(&psrc[0]);
state[0].y = state[1].y = __ldg(&psrc[1]);
state[0].z = state[1].z = __ldg(&psrc[2]);
state[0].w = state[1].w = __ldg(&psrc[3]);
state[2] = blake2b_IV[0];
state[3] = blake2b_IV[1];
for (int i = 0; i<24; i++)
round_lyra(state);
((uint2x4*)DMatrix)[threads * 0 + thread] = state[0];
((uint2x4*)DMatrix)[threads * 1 + thread] = state[1];
((uint2x4*)DMatrix)[threads * 2 + thread] = state[2];
((uint2x4*)DMatrix)[threads * 3 + thread] = state[3];
}
}
__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_64_3(uint32_t threads, uint2 *d_hash_512, const uint32_t round)
{
// This kernel outputs 2x 256-bit hashes into 512-bit chain offsets in 2 rounds
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
if (thread < threads)
{
uint2x4 state[4];
state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]);
state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]);
state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]);
state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]);
for (int i = 0; i < 12; i++)
round_lyra(state);
const size_t offset = (size_t)8 * thread + (round * 4U);
uint2 *pdst = (uint2*)(&d_hash_512[offset]);
pdst[0] = state[0].x;
pdst[1] = state[0].y;
pdst[2] = state[0].z;
pdst[3] = state[0].w;
} //thread
}
#else
#if __CUDA_ARCH__ < 500
@@ -510,9 +554,11 @@ void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
/* for unsupported SM arch */
__device__ void* DMatrix;
#endif
__global__ void lyra2_gpu_hash_32_1(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_32_2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {}
__global__ void lyra2_gpu_hash_32_3(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_32_1(uint32_t threads, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_32_2(uint32_t threads, uint64_t *g_hash) {}
__global__ void lyra2_gpu_hash_32_3(uint32_t threads, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_64_1(uint32_t threads, uint2* const d_hash_512, const uint32_t round) {}
__global__ void lyra2_gpu_hash_64_3(uint32_t threads, uint2 *d_hash_512, const uint32_t round) {}
#endif
__host__
@@ -523,7 +569,7 @@ void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix)
}
__host__
void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, bool gtx750ti)
void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_hash, bool gtx750ti)
{
int dev_id = device_map[thr_id % MAX_GPUS];
@@ -544,11 +590,9 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint6
if (cuda_arch[dev_id] >= 520)
{
lyra2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash);
lyra2_gpu_hash_32_2 <<< grid1, block1, 24 * (8 - 0) * sizeof(uint2) * tpb >>> (threads, startNounce, d_hash);
lyra2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash);
lyra2_gpu_hash_32_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash);
lyra2_gpu_hash_32_2 <<< grid1, block1, 24 * (8 - 0) * sizeof(uint2) * tpb >>> (threads, d_hash);
lyra2_gpu_hash_32_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash);
}
else if (cuda_arch[dev_id] >= 500)
{
@@ -561,12 +605,58 @@ void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint6
// suitable amount to adjust for 10warp
shared_mem = 6144;
lyra2_gpu_hash_32_1_sm5 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash);
lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, startNounce, (uint2*)d_hash);
lyra2_gpu_hash_32_3_sm5 <<< grid2, block2 >>> (threads, startNounce, (uint2*)d_hash);
lyra2_gpu_hash_32_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash);
lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash);
lyra2_gpu_hash_32_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash);
}
else
lyra2_gpu_hash_32_sm2 <<< grid3, block3 >>> (threads, startNounce, d_hash);
lyra2_gpu_hash_32_sm2 <<< grid3, block3 >>> (threads, d_hash);
}
__host__
void lyra2_cuda_hash_64(int thr_id, const uint32_t threads, uint64_t* d_hash_256, uint32_t* d_hash_512, bool gtx750ti)
{
int dev_id = device_map[thr_id % MAX_GPUS];
uint32_t tpb = TPB52;
if (cuda_arch[dev_id] >= 520) tpb = TPB52;
else if (cuda_arch[dev_id] >= 500) tpb = TPB50;
else if (cuda_arch[dev_id] >= 200) tpb = TPB20;
dim3 grid1((size_t(threads) * 4 + tpb - 1) / tpb);
dim3 block1(4, tpb >> 2);
dim3 grid2((threads + 64 - 1) / 64);
dim3 block2(64);
if (cuda_arch[dev_id] >= 520)
{
const size_t shared_mem = sizeof(uint2) * tpb * 192; // 49152;
lyra2_gpu_hash_64_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0);
lyra2_gpu_hash_32_2 <<< grid1, block1, shared_mem >>> (threads, d_hash_256);
lyra2_gpu_hash_64_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0);
lyra2_gpu_hash_64_1 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1);
lyra2_gpu_hash_32_2 <<< grid1, block1, shared_mem >>> (threads, d_hash_256);
lyra2_gpu_hash_64_3 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1);
}
else if (cuda_arch[dev_id] >= 500)
{
size_t shared_mem = gtx750ti ? 8192 : 6144; // 8 or 10 warps
lyra2_gpu_hash_64_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0);
lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash_256);
lyra2_gpu_hash_64_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 0);
lyra2_gpu_hash_64_1_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1);
lyra2_gpu_hash_32_2_sm5 <<< grid1, block1, shared_mem >>> (threads, (uint2*)d_hash_256);
lyra2_gpu_hash_64_3_sm5 <<< grid2, block2 >>> (threads, (uint2*)d_hash_512, 1);
}
else {
// alternative method for SM 3.x
hash64_to_lyra32(thr_id, threads, d_hash_512, d_hash_256, 0);
lyra2_cpu_hash_32(thr_id, threads, d_hash_256, gtx750ti);
hash64_from_lyra32(thr_id, threads, d_hash_512, d_hash_256, 0);
hash64_to_lyra32(thr_id, threads, d_hash_512, d_hash_256, 1);
lyra2_cpu_hash_32(thr_id, threads, d_hash_256, gtx750ti);
hash64_from_lyra32(thr_id, threads, d_hash_512, d_hash_256, 1);
}
}
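
In the new 64-byte (phi2) path each nonce owns a 512-bit chain hash, i.e. 8 uint2 words, and the round argument selects its lower or upper 256-bit half, which is where the offset = 8*thread + round*4 expressions in the kernels above come from. A small host-side sketch of that indexing (the helper name is illustrative):

#include <stdint.h>
#include <stddef.h>

/* Index of the first uint2 (8-byte) word of the 256-bit half processed in
 * 'round' (0 = lower half, 1 = upper half) for a given nonce slot. */
static size_t lyra32_offset_in_hash512(uint32_t thread, uint32_t round)
{
	return (size_t)8 * thread + (size_t)round * 4U; /* 8 uint2 per 512-bit hash, 4 per half */
}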

69
lyra2/cuda_lyra2_sm2.cuh

@@ -3,7 +3,7 @@
#ifdef __INTELLISENSE__
/* just for vstudio code colors, only uncomment that temporarily, don't commit it */
//#undef __CUDA_ARCH__
//#define __CUDA_ARCH__ 500
//#define __CUDA_ARCH__ 300
#endif
#include "cuda_helper.h"
@@ -131,7 +131,7 @@ void reduceDuplexRowSetup(const int rowIn, const int rowInOut, const int rowOut,
}
__global__ __launch_bounds__(TPB30, 1)
void lyra2_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash)
void lyra2_gpu_hash_32_sm2(uint32_t threads, uint64_t *g_hash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
@@ -224,5 +224,68 @@ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_h
#else
/* if __CUDA_ARCH__ < 200 .. host */
__global__ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint32_t startNounce, uint64_t *g_hash) {}
__global__ void lyra2_gpu_hash_32_sm2(uint32_t threads, uint64_t *g_hash) {}
#endif
// -------------------------------------------------------------------------------------------------------------------------
// lyra2 can't be used as-is in 512-bit hash chains, thanks to djm for these weird offsets since the first lyra2 algo...
#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ <= 350
__global__ __launch_bounds__(128, 8)
void hash64_to_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const size_t offset = (size_t) 16 * thread + (round * 8U);
uint2 *psrc = (uint2*) (&d_hash64[offset]);
uint2 *pdst = (uint2*) (&d_hash_lyra[thread]);
pdst[threads*0] = __ldg(&psrc[0]);
pdst[threads*1] = __ldg(&psrc[1]);
pdst[threads*2] = __ldg(&psrc[2]);
pdst[threads*3] = __ldg(&psrc[3]);
}
}
__global__ __launch_bounds__(128, 8)
void hash64_from_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const size_t offset = (size_t) 16 * thread + (round * 8U);
uint2 *psrc = (uint2*) (&d_hash_lyra[thread]);
uint2 *pdst = (uint2*) (&d_hash64[offset]);
pdst[0] = psrc[0];
pdst[1] = psrc[threads*1];
pdst[2] = psrc[threads*2];
pdst[3] = psrc[threads*3];
}
}
#else
/* if __CUDA_ARCH__ < 200 .. host */
__global__ void hash64_to_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round) {}
__global__ void hash64_from_lyra32_gpu(const uint32_t threads, const uint32_t* d_hash64, uint2* d_hash_lyra, const uint32_t round) {}
#endif
__host__
void hash64_to_lyra32(int thr_id, const uint32_t threads, uint32_t* d_hash64, uint64_t* d_hash_lyra, const uint32_t round)
{
const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
hash64_to_lyra32_gpu <<<grid, block>>> (threads, d_hash64, (uint2*) d_hash_lyra, round);
}
__host__
void hash64_from_lyra32(int thr_id, const uint32_t threads, uint32_t* d_hash64, uint64_t* d_hash_lyra, const uint32_t round)
{
const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
hash64_from_lyra32_gpu <<<grid, block>>> (threads, d_hash64, (uint2*) d_hash_lyra, round);
}
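
These wrappers convert between the two buffer layouts used throughout the lyra2 code: the 256-bit lyra2 buffer is strided by the thread count (word j of nonce t at t + j*threads), while the 512-bit chain buffer is contiguous (16 uint32 per nonce, with round selecting a 256-bit half). A small sketch of both index calculations (helper names are illustrative):

#include <stdint.h>
#include <stddef.h>

/* 256-bit lyra2 buffer, "column major": uint2 word j of nonce t sits at t + j*threads. */
static size_t lyra32_word_index(uint32_t thread, uint32_t threads, uint32_t j)
{
	return (size_t)thread + (size_t)j * threads;
}

/* 512-bit chain buffer, contiguous: 16 uint32 per nonce, round selects the 256-bit half.
 * Returns the index of the low uint32 of uint2 word j. */
static size_t hash512_word_index(uint32_t thread, uint32_t round, uint32_t j)
{
	return (size_t)16 * thread + (size_t)round * 8U + 2U * j;
}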

76
lyra2/cuda_lyra2_sm5.cuh

@@ -589,15 +589,14 @@ void reduceDuplexRowV50_8(const int rowInOut, uint2 state[4], const uint32_t thr
}
__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint2 *g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
const uint2x4 blake2b_IV[2] = {
{ { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } },
{ { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } }
};
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint2x4 state[4];
@@ -622,14 +621,13 @@ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_ha
}
__global__ __launch_bounds__(TPB50, 1)
void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint2 *g_hash)
{
const uint32_t thread = (blockDim.y * blockIdx.x + threadIdx.y);
if (thread < threads)
{
uint2 state[4];
state[0] = __ldg(&DMatrix[(0 * threads + thread)*blockDim.x + threadIdx.x]);
state[1] = __ldg(&DMatrix[(1 * threads + thread)*blockDim.x + threadIdx.x]);
state[2] = __ldg(&DMatrix[(2 * threads + thread)*blockDim.x + threadIdx.x]);
@@ -662,14 +660,13 @@ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_ha
}
__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash)
void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint2 *g_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint2x4 state[4];
state[0] = __ldg4(&((uint2x4*)DMatrix)[0 * threads + thread]);
state[1] = __ldg4(&((uint2x4*)DMatrix)[1 * threads + thread]);
state[2] = __ldg4(&((uint2x4*)DMatrix)[2 * threads + thread]);
@@ -685,9 +682,68 @@ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_ha
}
}
__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_64_1_sm5(uint32_t threads, uint2* const d_hash_512, const uint32_t round)
{
const uint2x4 blake2b_IV[2] = {
{ { 0xf3bcc908, 0x6a09e667 }, { 0x84caa73b, 0xbb67ae85 }, { 0xfe94f82b, 0x3c6ef372 }, { 0x5f1d36f1, 0xa54ff53a } },
{ { 0xade682d1, 0x510e527f }, { 0x2b3e6c1f, 0x9b05688c }, { 0xfb41bd6b, 0x1f83d9ab }, { 0x137e2179, 0x5be0cd19 } }
};
// This kernel loads 2x 256-bit hashes from 512-bit chain offsets in 2 steps
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
uint2x4 state[4];
const size_t offset = (size_t)8 * thread + (round * 4U);
uint2 *psrc = (uint2*)(&d_hash_512[offset]);
state[0].x = state[1].x = __ldg(&psrc[0]);
state[0].y = state[1].y = __ldg(&psrc[1]);
state[0].z = state[1].z = __ldg(&psrc[2]);
state[0].w = state[1].w = __ldg(&psrc[3]);
state[1] = state[0];
state[2] = blake2b_IV[0];
state[3] = blake2b_IV[1];
for (int i = 0; i<24; i++)
round_lyra(state);
((uint2x4*)DMatrix)[threads * 0 + thread] = state[0];
((uint2x4*)DMatrix)[threads * 1 + thread] = state[1];
((uint2x4*)DMatrix)[threads * 2 + thread] = state[2];
((uint2x4*)DMatrix)[threads * 3 + thread] = state[3];
}
}
__global__ __launch_bounds__(64, 1)
void lyra2_gpu_hash_64_3_sm5(uint32_t threads, uint2 *d_hash_512, const uint32_t round)
{
// This kernel outputs 2x 256-bit hashes into 512-bit chain offsets in 2 steps
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
if (thread < threads)
{
uint2x4 state[4];
state[0] = __ldg4(&((uint2x4*)DMatrix)[threads * 0 + thread]);
state[1] = __ldg4(&((uint2x4*)DMatrix)[threads * 1 + thread]);
state[2] = __ldg4(&((uint2x4*)DMatrix)[threads * 2 + thread]);
state[3] = __ldg4(&((uint2x4*)DMatrix)[threads * 3 + thread]);
for (int i = 0; i < 12; i++)
round_lyra(state);
const size_t offset = (size_t)8 * thread + (round * 4U);
uint2 *pdst = (uint2*)(&d_hash_512[offset]);
pdst[0] = state[0].x;
pdst[1] = state[0].y;
pdst[2] = state[0].z;
pdst[3] = state[0].w;
}
}
#else
/* if __CUDA_ARCH__ != 500 .. host */
__global__ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint32_t startNounce, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_32_1_sm5(uint32_t threads, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_32_2_sm5(uint32_t threads, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_32_3_sm5(uint32_t threads, uint2 *g_hash) {}
__global__ void lyra2_gpu_hash_64_1_sm5(uint32_t threads, uint2* const d_hash_512, const uint32_t round) {}
__global__ void lyra2_gpu_hash_64_3_sm5(uint32_t threads, uint2 *d_hash_512, const uint32_t round) {}
#endif

481
lyra2/cuda_lyra2v3.cu

@@ -0,0 +1,481 @@
/**
* Lyra2 (v3) CUDA Implementation
*
* Based on VTC sources
*/
#include <stdio.h>
#include <stdint.h>
#include <memory.h>
#include "cuda_helper.h"
#include "cuda_lyra2v3_sm3.cuh"
#ifdef __INTELLISENSE__
/* just for vstudio code colors */
#define __CUDA_ARCH__ 500
#endif
#define TPB 32
#if __CUDA_ARCH__ >= 500
#include "cuda_lyra2_vectors.h"
#define Nrow 4
#define Ncol 4
#define memshift 3
__device__ uint2x4 *DMatrix;
__device__ __forceinline__ uint2 LD4S(const int index)
{
extern __shared__ uint2 shared_mem[];
return shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x];
}
__device__ __forceinline__ void ST4S(const int index, const uint2 data)
{
extern __shared__ uint2 shared_mem[];
shared_mem[(index * blockDim.y + threadIdx.y) * blockDim.x + threadIdx.x] = data;
}
__device__ __forceinline__ uint2 shuffle2(uint2 a, uint32_t b, uint32_t c)
{
return make_uint2(__shfl(a.x, b, c), __shfl(a.y, b, c));
}
__device__ __forceinline__
void Gfunc_v5(uint2 &a, uint2 &b, uint2 &c, uint2 &d)
{
a += b; d ^= a; d = SWAPUINT2(d);
c += d; b ^= c; b = ROR2(b, 24);
a += b; d ^= a; d = ROR2(d, 16);
c += d; b ^= c; b = ROR2(b, 63);
}
__device__ __forceinline__
void round_lyra_v5(uint2x4 s[4])
{
Gfunc_v5(s[0].x, s[1].x, s[2].x, s[3].x);
Gfunc_v5(s[0].y, s[1].y, s[2].y, s[3].y);
Gfunc_v5(s[0].z, s[1].z, s[2].z, s[3].z);
Gfunc_v5(s[0].w, s[1].w, s[2].w, s[3].w);
Gfunc_v5(s[0].x, s[1].y, s[2].z, s[3].w);
Gfunc_v5(s[0].y, s[1].z, s[2].w, s[3].x);
Gfunc_v5(s[0].z, s[1].w, s[2].x, s[3].y);
Gfunc_v5(s[0].w, s[1].x, s[2].y, s[3].z);
}
__device__ __forceinline__
void round_lyra_v5(uint2 s[4])
{
Gfunc_v5(s[0], s[1], s[2], s[3]);
s[1] = shuffle2(s[1], threadIdx.x + 1, 4);
s[2] = shuffle2(s[2], threadIdx.x + 2, 4);
s[3] = shuffle2(s[3], threadIdx.x + 3, 4);
Gfunc_v5(s[0], s[1], s[2], s[3]);
s[1] = shuffle2(s[1], threadIdx.x + 3, 4);
s[2] = shuffle2(s[2], threadIdx.x + 2, 4);
s[3] = shuffle2(s[3], threadIdx.x + 1, 4);
}
__device__ __forceinline__
void reduceDuplexRowSetup2(uint2 state[4])
{
uint2 state1[Ncol][3], state0[Ncol][3], state2[3];
int i, j;
#pragma unroll
for (int i = 0; i < Ncol; i++)
{
#pragma unroll
for (j = 0; j < 3; j++)
state0[Ncol - i - 1][j] = state[j];
round_lyra_v5(state);
}
//#pragma unroll 4
for (i = 0; i < Ncol; i++)
{
#pragma unroll
for (j = 0; j < 3; j++)
state[j] ^= state0[i][j];
round_lyra_v5(state);
#pragma unroll
for (j = 0; j < 3; j++)
state1[Ncol - i - 1][j] = state0[i][j];
#pragma unroll
for (j = 0; j < 3; j++)
state1[Ncol - i - 1][j] ^= state[j];
}
for (i = 0; i < Ncol; i++)
{
const uint32_t s0 = memshift * Ncol * 0 + i * memshift;
const uint32_t s2 = memshift * Ncol * 2 + memshift * (Ncol - 1) - i*memshift;
#pragma unroll
for (j = 0; j < 3; j++)
state[j] ^= state1[i][j] + state0[i][j];
round_lyra_v5(state);
#pragma unroll
for (j = 0; j < 3; j++)
state2[j] = state1[i][j];
#pragma unroll
for (j = 0; j < 3; j++)
state2[j] ^= state[j];
#pragma unroll
for (j = 0; j < 3; j++)
ST4S(s2 + j, state2[j]);
uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4);
uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4);
uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4);
if (threadIdx.x == 0) {
state0[i][0] ^= Data2;
state0[i][1] ^= Data0;
state0[i][2] ^= Data1;
} else {
state0[i][0] ^= Data0;
state0[i][1] ^= Data1;
state0[i][2] ^= Data2;
}
#pragma unroll
for (j = 0; j < 3; j++)
ST4S(s0 + j, state0[i][j]);
#pragma unroll
for (j = 0; j < 3; j++)
state0[i][j] = state2[j];
}
for (i = 0; i < Ncol; i++)
{
const uint32_t s1 = memshift * Ncol * 1 + i*memshift;
const uint32_t s3 = memshift * Ncol * 3 + memshift * (Ncol - 1) - i*memshift;
#pragma unroll
for (j = 0; j < 3; j++)
state[j] ^= state1[i][j] + state0[Ncol - i - 1][j];
round_lyra_v5(state);
#pragma unroll
for (j = 0; j < 3; j++)
state0[Ncol - i - 1][j] ^= state[j];
#pragma unroll
for (j = 0; j < 3; j++)
ST4S(s3 + j, state0[Ncol - i - 1][j]);
uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4);
uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4);
uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4);
if (threadIdx.x == 0) {
state1[i][0] ^= Data2;
state1[i][1] ^= Data0;
state1[i][2] ^= Data1;
} else {
state1[i][0] ^= Data0;
state1[i][1] ^= Data1;
state1[i][2] ^= Data2;
}
#pragma unroll
for (j = 0; j < 3; j++)
ST4S(s1 + j, state1[i][j]);
}
}
__device__
void reduceDuplexRowt2(const int rowIn, const int rowInOut, const int rowOut, uint2 state[4])
{
uint2 state1[3], state2[3];
const uint32_t ps1 = memshift * Ncol * rowIn;
const uint32_t ps2 = memshift * Ncol * rowInOut;
const uint32_t ps3 = memshift * Ncol * rowOut;
for (int i = 0; i < Ncol; i++)
{
const uint32_t s1 = ps1 + i*memshift;
const uint32_t s2 = ps2 + i*memshift;
const uint32_t s3 = ps3 + i*memshift;
#pragma unroll
for (int j = 0; j < 3; j++)
state1[j] = LD4S(s1 + j);
#pragma unroll
for (int j = 0; j < 3; j++)
state2[j] = LD4S(s2 + j);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= state1[j] + state2[j];
round_lyra_v5(state);
uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4);
uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4);
uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4);
if (threadIdx.x == 0) {
state2[0] ^= Data2;
state2[1] ^= Data0;
state2[2] ^= Data1;
} else {
state2[0] ^= Data0;
state2[1] ^= Data1;
state2[2] ^= Data2;
}
#pragma unroll
for (int j = 0; j < 3; j++)
ST4S(s2 + j, state2[j]);
#pragma unroll
for (int j = 0; j < 3; j++)
ST4S(s3 + j, LD4S(s3 + j) ^ state[j]);
}
}
__device__
void reduceDuplexRowt2x4(const int rowInOut, uint2 state[4])
{
const int rowIn = 2;
const int rowOut = 3;
int i, j;
uint2 last[3];
const uint32_t ps1 = memshift * Ncol * rowIn;
const uint32_t ps2 = memshift * Ncol * rowInOut;
#pragma unroll
for (int j = 0; j < 3; j++)
last[j] = LD4S(ps2 + j);
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= LD4S(ps1 + j) + last[j];
round_lyra_v5(state);
uint2 Data0 = shuffle2(state[0], threadIdx.x - 1, 4);
uint2 Data1 = shuffle2(state[1], threadIdx.x - 1, 4);
uint2 Data2 = shuffle2(state[2], threadIdx.x - 1, 4);
if (threadIdx.x == 0) {
last[0] ^= Data2;
last[1] ^= Data0;
last[2] ^= Data1;
} else {
last[0] ^= Data0;
last[1] ^= Data1;
last[2] ^= Data2;
}
if (rowInOut == rowOut)
{
#pragma unroll
for (j = 0; j < 3; j++)
last[j] ^= state[j];
}
for (i = 1; i < Ncol; i++)
{
const uint32_t s1 = ps1 + i*memshift;
const uint32_t s2 = ps2 + i*memshift;
#pragma unroll
for (j = 0; j < 3; j++)
state[j] ^= LD4S(s1 + j) + LD4S(s2 + j);
round_lyra_v5(state);
}
#pragma unroll
for (int j = 0; j < 3; j++)
state[j] ^= last[j];
}
__global__
__launch_bounds__(TPB, 1)
void lyra2v3_gpu_hash_32_1(uint32_t threads, uint2 *inputHash)
{
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
const uint2x4 blake2b_IV[2] = {
0xf3bcc908UL, 0x6a09e667UL, 0x84caa73bUL, 0xbb67ae85UL,
0xfe94f82bUL, 0x3c6ef372UL, 0x5f1d36f1UL, 0xa54ff53aUL,
0xade682d1UL, 0x510e527fUL, 0x2b3e6c1fUL, 0x9b05688cUL,
0xfb41bd6bUL, 0x1f83d9abUL, 0x137e2179UL, 0x5be0cd19UL
};
const uint2x4 Mask[2] = {
0x00000020UL, 0x00000000UL, 0x00000020UL, 0x00000000UL,
0x00000020UL, 0x00000000UL, 0x00000001UL, 0x00000000UL,
0x00000004UL, 0x00000000UL, 0x00000004UL, 0x00000000UL,
0x00000080UL, 0x00000000UL, 0x00000000UL, 0x01000000UL
};
uint2x4 state[4];
if (thread < threads)
{
state[0].x = state[1].x = __ldg(&inputHash[thread + threads * 0]);
state[0].y = state[1].y = __ldg(&inputHash[thread + threads * 1]);
state[0].z = state[1].z = __ldg(&inputHash[thread + threads * 2]);
state[0].w = state[1].w = __ldg(&inputHash[thread + threads * 3]);
state[2] = blake2b_IV[0];
state[3] = blake2b_IV[1];
for (int i = 0; i<12; i++)
round_lyra_v5(state);
state[0] ^= Mask[0];
state[1] ^= Mask[1];
for (int i = 0; i<12; i++)
round_lyra_v5(state);
DMatrix[blockDim.x * gridDim.x * 0 + thread] = state[0];
DMatrix[blockDim.x * gridDim.x * 1 + thread] = state[1];
DMatrix[blockDim.x * gridDim.x * 2 + thread] = state[2];
DMatrix[blockDim.x * gridDim.x * 3 + thread] = state[3];
}
}
__global__
__launch_bounds__(TPB, 1)
void lyra2v3_gpu_hash_32_2(uint32_t threads)
{
const uint32_t thread = blockDim.y * blockIdx.x + threadIdx.y;
if (thread < threads)
{
uint2 state[4];
state[0] = ((uint2*)DMatrix)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x];
state[1] = ((uint2*)DMatrix)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x];
state[2] = ((uint2*)DMatrix)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x];
state[3] = ((uint2*)DMatrix)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x];
reduceDuplexRowSetup2(state);
uint32_t rowa;
int prev = 3;
unsigned int instance = 0;
for (int i = 0; i < 3; i++)
{
instance = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4);
rowa = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4) & 0x3;
//rowa = __shfl(state[0].x, 0, 4) & 3;
reduceDuplexRowt2(prev, rowa, i, state);
prev = i;
}
instance = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4);
rowa = __shfl(state[(instance >> 2) & 0x3].x, instance & 0x3, 4) & 0x3;
//rowa = __shfl(state[0].x, 0, 4) & 3;
reduceDuplexRowt2x4(rowa, state);
((uint2*)DMatrix)[(0 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[0];
((uint2*)DMatrix)[(1 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[1];
((uint2*)DMatrix)[(2 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[2];
((uint2*)DMatrix)[(3 * gridDim.x * blockDim.y + thread) * blockDim.x + threadIdx.x] = state[3];
}
}
__global__
__launch_bounds__(TPB, 1)
void lyra2v3_gpu_hash_32_3(uint32_t threads, uint2 *outputHash)
{
const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
uint2x4 state[4];
if (thread < threads)
{
state[0] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 0 + thread]);
state[1] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 1 + thread]);
state[2] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 2 + thread]);
state[3] = __ldg4(&DMatrix[blockDim.x * gridDim.x * 3 + thread]);
for (int i = 0; i < 12; i++)
round_lyra_v5(state);
outputHash[thread + threads * 0] = state[0].x;
outputHash[thread + threads * 1] = state[0].y;
outputHash[thread + threads * 2] = state[0].z;
outputHash[thread + threads * 3] = state[0].w;
}
}
#else
#include "cuda_helper.h"
#if __CUDA_ARCH__ < 200
__device__ void* DMatrix;
#endif
__global__ void lyra2v3_gpu_hash_32_1(uint32_t threads, uint2 *inputHash) {}
__global__ void lyra2v3_gpu_hash_32_2(uint32_t threads) {}
__global__ void lyra2v3_gpu_hash_32_3(uint32_t threads, uint2 *outputHash) {}
#endif
__host__
void lyra2v3_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix)
{
cuda_get_arch(thr_id);
// just assign the device pointer allocated in main loop
cudaMemcpyToSymbol(DMatrix, &d_matrix, sizeof(uint64_t*), 0, cudaMemcpyHostToDevice);
}
__host__
void lyra2v3_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, int order)
{
int dev_id = device_map[thr_id % MAX_GPUS];
if (device_sm[dev_id] >= 500) {
const uint32_t tpb = TPB;
dim3 grid2((threads + tpb - 1) / tpb);
dim3 block2(tpb);
dim3 grid4((threads * 4 + tpb - 1) / tpb);
dim3 block4(4, tpb / 4);
lyra2v3_gpu_hash_32_1 <<< grid2, block2 >>> (threads, (uint2*)g_hash);
lyra2v3_gpu_hash_32_2 <<< grid4, block4, 48 * sizeof(uint2) * tpb >>> (threads);
lyra2v3_gpu_hash_32_3 <<< grid2, block2 >>> (threads, (uint2*)g_hash);
} else {
uint32_t tpb = 16;
if (cuda_arch[dev_id] >= 350) tpb = TPB35;
else if (cuda_arch[dev_id] >= 300) tpb = TPB30;
else if (cuda_arch[dev_id] >= 200) tpb = TPB20;
dim3 grid((threads + tpb - 1) / tpb);
dim3 block(tpb);
lyra2v3_gpu_hash_32_v3 <<< grid, block >>> (threads, startNounce, (uint2*)g_hash);
}
}
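
The 48 * sizeof(uint2) * tpb dynamic shared-memory size used for the SM 5.x+ launch above is the LD4S/ST4S scratch matrix: Nrow x Ncol cells of memshift uint2 words per thread (4 x 4 x 3 = 48), with 4 threads cooperating per hash. A quick sanity-check sketch of those sizes (host-side arithmetic only):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const int Nrow = 4, Ncol = 4, memshift = 3;    /* as defined for the SM 5.x+ kernels */
	const int tpb = 32;                            /* TPB in cuda_lyra2v3.cu */
	const size_t uint2_sz = 2 * sizeof(uint32_t);  /* sizeof(uint2) on the device */

	size_t per_thread = (size_t)Nrow * Ncol * memshift;   /* 48 uint2 slots per thread */
	size_t smem_bytes = per_thread * uint2_sz * tpb;      /* 48 * 8 * 32 = 12 KiB per block */
	size_t per_hash   = per_thread * uint2_sz * 4;        /* 4 cooperating threads -> 1536 bytes,
	                                                         numerically equal to matrix_sz = 16*8*4*3 in lyra2REv3.cu */
	printf("shared mem per block: %zu bytes, scratch per hash: %zu bytes\n", smem_bytes, per_hash);
	return 0;
}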

348
lyra2/cuda_lyra2v3_sm3.cuh

@@ -0,0 +1,348 @@
/* SM 2/3/3.5 Variant for lyra2REv3 */
#ifdef __INTELLISENSE__
/* just for vstudio code colors, only uncomment that temporarily, don't commit it */
//#undef __CUDA_ARCH__
//#define __CUDA_ARCH__ 500
#endif
#define TPB20 64
#define TPB30 64
#define TPB35 64
#if __CUDA_ARCH__ >= 200 && __CUDA_ARCH__ < 500
#include "cuda_lyra2_vectors.h"
#define Nrow 4
#define Ncol 4
#define vectype ulonglong4
#define memshift 4
__device__ vectype *DMatrix;
static __device__ __forceinline__
void Gfunc_v35(unsigned long long &a, unsigned long long &b, unsigned long long &c, unsigned long long &d)
{
a += b; d ^= a; d = ROTR64(d, 32);
c += d; b ^= c; b = ROTR64(b, 24);
a += b; d ^= a; d = ROTR64(d, 16);
c += d; b ^= c; b = ROTR64(b, 63);
}
static __device__ __forceinline__
void round_lyra_v35(vectype* s)
{
Gfunc_v35(s[0].x, s[1].x, s[2].x, s[3].x);
Gfunc_v35(s[0].y, s[1].y, s[2].y, s[3].y);
Gfunc_v35(s[0].z, s[1].z, s[2].z, s[3].z);
Gfunc_v35(s[0].w, s[1].w, s[2].w, s[3].w);
Gfunc_v35(s[0].x, s[1].y, s[2].z, s[3].w);
Gfunc_v35(s[0].y, s[1].z, s[2].w, s[3].x);
Gfunc_v35(s[0].z, s[1].w, s[2].x, s[3].y);
Gfunc_v35(s[0].w, s[1].x, s[2].y, s[3].z);
}
static __device__ __forceinline__
void reduceDuplexV3(vectype state[4], uint32_t thread)
{
vectype state1[3];
uint32_t ps1 = (Nrow * Ncol * memshift * thread);
uint32_t ps2 = (memshift * (Ncol - 1) * Nrow + memshift * 1 + Nrow * Ncol * memshift * thread);
#pragma unroll 4
for (int i = 0; i < Ncol; i++)
{
uint32_t s1 = ps1 + Nrow * i *memshift;
uint32_t s2 = ps2 - Nrow * i *memshift;
for (int j = 0; j < 3; j++)
state1[j] = __ldg4(&(DMatrix + s1)[j]);
for (int j = 0; j < 3; j++)
state[j] ^= state1[j];
round_lyra_v35(state);
for (int j = 0; j < 3; j++)
state1[j] ^= state[j];
for (int j = 0; j < 3; j++)
(DMatrix + s2)[j] = state1[j];
}
}
static __device__ __forceinline__
void reduceDuplexRowSetupV3(const int rowIn, const int rowInOut, const int rowOut, vectype state[4], uint32_t thread)
{
vectype state2[3], state1[3];
uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread);
uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread);
uint32_t ps3 = (Nrow * memshift * (Ncol - 1) + memshift * rowOut + Nrow * Ncol * memshift * thread);
for (int i = 0; i < Ncol; i++)
{
uint32_t s1 = ps1 + Nrow*i*memshift;
uint32_t s2 = ps2 + Nrow*i*memshift;
uint32_t s3 = ps3 - Nrow*i*memshift;
for (int j = 0; j < 3; j++)
state1[j] = __ldg4(&(DMatrix + s1 )[j]);
for (int j = 0; j < 3; j++)
state2[j] = __ldg4(&(DMatrix + s2 )[j]);
for (int j = 0; j < 3; j++) {
vectype tmp = state1[j] + state2[j];
state[j] ^= tmp;
}
round_lyra_v35(state);
for (int j = 0; j < 3; j++) {
state1[j] ^= state[j];
(DMatrix + s3)[j] = state1[j];
}
((uint2*)state2)[0] ^= ((uint2*)state)[11];
for (int j = 0; j < 11; j++)
((uint2*)state2)[j + 1] ^= ((uint2*)state)[j];
for (int j = 0; j < 3; j++)
(DMatrix + s2)[j] = state2[j];
}
}
static __device__ __forceinline__
void reduceDuplexRowtV3(const int rowIn, const int rowInOut, const int rowOut, vectype* state, uint32_t thread)
{
vectype state1[3], state2[3];
uint32_t ps1 = (memshift * rowIn + Nrow * Ncol * memshift * thread);
uint32_t ps2 = (memshift * rowInOut + Nrow * Ncol * memshift * thread);
uint32_t ps3 = (memshift * rowOut + Nrow * Ncol * memshift * thread);
#pragma nounroll
for (int i = 0; i < Ncol; i++)
{
uint32_t s1 = ps1 + Nrow * i*memshift;
uint32_t s2 = ps2 + Nrow * i*memshift;
uint32_t s3 = ps3 + Nrow * i*memshift;
for (int j = 0; j < 3; j++)
state1[j] = __ldg4(&(DMatrix + s1)[j]);
for (int j = 0; j < 3; j++)
state2[j] = __ldg4(&(DMatrix + s2)[j]);
for (int j = 0; j < 3; j++)
state1[j] += state2[j];
for (int j = 0; j < 3; j++)
state[j] ^= state1[j];
round_lyra_v35(state);
((uint2*)state2)[0] ^= ((uint2*)state)[11];
for (int j = 0; j < 11; j++)
((uint2*)state2)[j + 1] ^= ((uint2*)state)[j];
if (rowInOut != rowOut) {
for (int j = 0; j < 3; j++)
(DMatrix + s2)[j] = state2[j];
for (int j = 0; j < 3; j++)
(DMatrix + s3)[j] ^= state[j];
} else {
for (int j = 0; j < 3; j++)
state2[j] ^= state[j];
for (int j = 0; j < 3; j++)
(DMatrix + s2)[j] = state2[j];
}
}
}
#if __CUDA_ARCH__ >= 300
__global__ __launch_bounds__(TPB35, 1)
void lyra2v3_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
vectype state[4];
vectype blake2b_IV[2];
vectype padding[2];
if (threadIdx.x == 0) {
((uint16*)blake2b_IV)[0] = make_uint16(
0xf3bcc908, 0x6a09e667 , 0x84caa73b, 0xbb67ae85,
0xfe94f82b, 0x3c6ef372 , 0x5f1d36f1, 0xa54ff53a,
0xade682d1, 0x510e527f , 0x2b3e6c1f, 0x9b05688c,
0xfb41bd6b, 0x1f83d9ab , 0x137e2179, 0x5be0cd19
);
((uint16*)padding)[0] = make_uint16(
0x20, 0x0 , 0x20, 0x0 , 0x20, 0x0 , 0x01, 0x0,
0x04, 0x0 , 0x04, 0x0 , 0x80, 0x0 , 0x0, 0x01000000
);
}
if (thread < threads)
{
((uint2*)state)[0] = __ldg(&outputHash[thread]);
((uint2*)state)[1] = __ldg(&outputHash[thread + threads]);
((uint2*)state)[2] = __ldg(&outputHash[thread + 2 * threads]);
((uint2*)state)[3] = __ldg(&outputHash[thread + 3 * threads]);
state[1] = state[0];
state[2] = shuffle4(((vectype*)blake2b_IV)[0], 0);
state[3] = shuffle4(((vectype*)blake2b_IV)[1], 0);
for (int i = 0; i<12; i++)
round_lyra_v35(state);
state[0] ^= shuffle4(((vectype*)padding)[0], 0);
state[1] ^= shuffle4(((vectype*)padding)[1], 0);
for (int i = 0; i<12; i++)
round_lyra_v35(state);
uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread);
//#pragma unroll 4
for (int i = 0; i < 4; i++)
{
uint32_t s1 = ps1 - 4 * memshift * i;
for (int j = 0; j < 3; j++)
(DMatrix + s1)[j] = (state)[j];
round_lyra_v35(state);
}
reduceDuplexV3(state, thread);
reduceDuplexRowSetupV3(1, 0, 2, state, thread);
reduceDuplexRowSetupV3(2, 1, 3, state, thread);
unsigned int instance = 0;
uint32_t rowa;
int prev = 3;
for (int i = 0; i < 4; i++)
{
//rowa = ((uint2*)state)[0].x & 3;
instance = ((uint2*)state)[instance & 0xf].x;
rowa = ((uint2*)state)[instance & 0xf].x & 0x3;
reduceDuplexRowtV3(prev, rowa, i, state, thread);
prev = i;
}
uint32_t shift = (memshift * rowa + 16 * memshift * thread);
for (int j = 0; j < 3; j++)
state[j] ^= __ldg4(&(DMatrix + shift)[j]);
for (int i = 0; i < 12; i++)
round_lyra_v35(state);
outputHash[thread] = ((uint2*)state)[0];
outputHash[thread + threads] = ((uint2*)state)[1];
outputHash[thread + 2 * threads] = ((uint2*)state)[2];
outputHash[thread + 3 * threads] = ((uint2*)state)[3];
} //thread
}
#elif __CUDA_ARCH__ >= 200
__global__ __launch_bounds__(TPB20, 1)
void lyra2v3_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash)
{
uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
vectype state[4];
vectype blake2b_IV[2];
vectype padding[2];
((uint16*)blake2b_IV)[0] = make_uint16(
0xf3bcc908, 0x6a09e667, 0x84caa73b, 0xbb67ae85,
0xfe94f82b, 0x3c6ef372, 0x5f1d36f1, 0xa54ff53a,
0xade682d1, 0x510e527f, 0x2b3e6c1f, 0x9b05688c,
0xfb41bd6b, 0x1f83d9ab, 0x137e2179, 0x5be0cd19
);
((uint16*)padding)[0] = make_uint16(
0x20, 0x0, 0x20, 0x0, 0x20, 0x0, 0x01, 0x0,
0x04, 0x0, 0x04, 0x0, 0x80, 0x0, 0x0, 0x01000000
);
if (thread < threads)
{
((uint2*)state)[0] = outputHash[thread];
((uint2*)state)[1] = outputHash[thread + threads];
((uint2*)state)[2] = outputHash[thread + 2 * threads];
((uint2*)state)[3] = outputHash[thread + 3 * threads];
state[1] = state[0];
state[2] = ((vectype*)blake2b_IV)[0];
state[3] = ((vectype*)blake2b_IV)[1];
for (int i = 0; i<12; i++)
round_lyra_v35(state);
state[0] ^= ((vectype*)padding)[0];
state[1] ^= ((vectype*)padding)[1];
for (int i = 0; i<12; i++)
round_lyra_v35(state);
uint32_t ps1 = (4 * memshift * 3 + 16 * memshift * thread);
//#pragma unroll 4
for (int i = 0; i < 4; i++)
{
uint32_t s1 = ps1 - 4 * memshift * i;
for (int j = 0; j < 3; j++)
(DMatrix + s1)[j] = (state)[j];
round_lyra_v35(state);
}
reduceDuplexV3(state, thread);
reduceDuplexRowSetupV3(1, 0, 2, state, thread);
reduceDuplexRowSetupV3(2, 1, 3, state, thread);
uint instance = 0;
uint32_t rowa;
int prev = 3;
for (int i = 0; i < 4; i++)
{
// rowa = ((uint2*)state)[0].x & 3;
instance = ((uint2*)state)[instance & 0xf].x;
rowa = ((uint2*)state)[instance & 0xf].x & 0x3;
reduceDuplexRowtV3(prev, rowa, i, state, thread);
prev = i;
}
uint32_t shift = (memshift * rowa + 16 * memshift * thread);
for (int j = 0; j < 3; j++)
state[j] ^= __ldg4(&(DMatrix + shift)[j]);
for (int i = 0; i < 12; i++)
round_lyra_v35(state);
outputHash[thread] = ((uint2*)state)[0];
outputHash[thread + threads] = ((uint2*)state)[1];
outputHash[thread + 2 * threads] = ((uint2*)state)[2];
outputHash[thread + 3 * threads] = ((uint2*)state)[3];
} //thread
}
#endif
#else
/* host & sm5+ */
__global__ void lyra2v3_gpu_hash_32_v3(uint32_t threads, uint32_t startNounce, uint2 *outputHash) {}
#endif

4
lyra2/lyra2RE.cu

@@ -26,7 +26,7 @@ extern void skein256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNon
extern void skein256_cpu_init(int thr_id, uint32_t threads);
extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix);
extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, bool gtx750ti);
extern void lyra2_cpu_hash_32(int thr_id, uint32_t threads, uint64_t *d_outputHash, bool gtx750ti);
extern void groestl256_cpu_init(int thr_id, uint32_t threads);
extern void groestl256_cpu_free(int thr_id);
@@ -130,7 +130,7 @@ extern "C" int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce,
//blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
//keccak256_sm3_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
blakeKeccak256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
lyra2_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], gtx750ti);
lyra2_cpu_hash_32(thr_id, throughput, d_hash[thr_id], gtx750ti);
skein256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
*hashes_done = pdata[19] - first_nonce + throughput;

183
lyra2/lyra2REv3.cu

@@ -0,0 +1,183 @@
extern "C" {
#include "sph/sph_blake.h"
#include "sph/sph_bmw.h"
#include "sph/sph_cubehash.h"
#include "lyra2/Lyra2.h"
}
#include <miner.h>
#include <cuda_helper.h>
static uint64_t *d_hash[MAX_GPUS];
static uint64_t* d_matrix[MAX_GPUS];
extern void blake256_cpu_init(int thr_id, uint32_t threads);
extern void blake256_cpu_setBlock_80(uint32_t *pdata);
extern void blake256_cpu_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNonce, uint64_t *Hash, int order);
extern void cubehash256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *d_hash, int order);
extern void lyra2v3_setTarget(const void *pTargetIn);
extern void lyra2v3_cpu_init(int thr_id, uint32_t threads, uint64_t* d_matrix);
extern void lyra2v3_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNonce, uint64_t *d_outputHash, int order);
extern void lyra2v3_cpu_hash_32_targ(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces);
extern void bmw256_setTarget(const void *ptarget);
extern void bmw256_cpu_init(int thr_id, uint32_t threads);
extern void bmw256_cpu_free(int thr_id);
extern void bmw256_cpu_hash_32(int thr_id, uint32_t threads, uint32_t startNounce, uint64_t *g_hash, uint32_t *resultnonces);
extern "C" void lyra2v3_hash(void *state, const void *input)
{
uint32_t hashA[8], hashB[8];
sph_blake256_context ctx_blake;
sph_cubehash256_context ctx_cube;
sph_bmw256_context ctx_bmw;
sph_blake256_set_rounds(14);
sph_blake256_init(&ctx_blake);
sph_blake256(&ctx_blake, input, 80);
sph_blake256_close(&ctx_blake, hashA);
LYRA2_3(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4);
sph_cubehash256_init(&ctx_cube);
sph_cubehash256(&ctx_cube, hashB, 32);
sph_cubehash256_close(&ctx_cube, hashA);
LYRA2_3(hashB, 32, hashA, 32, hashA, 32, 1, 4, 4);
sph_bmw256_init(&ctx_bmw);
sph_bmw256(&ctx_bmw, hashB, 32);
sph_bmw256_close(&ctx_bmw, hashA);
memcpy(state, hashA, 32);
}
static bool init[MAX_GPUS] = { 0 };
extern "C" int scanhash_lyra2v3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
int dev_id = device_map[thr_id];
int intensity = (device_sm[dev_id] < 500) ? 18 : is_windows() ? 19 : 20;
if (strstr(device_name[dev_id], "GTX 1")) intensity = 20;
if (strstr(device_name[dev_id], "RTX 20")) intensity = 20;
uint32_t throughput = cuda_default_throughput(dev_id, 1UL << intensity);
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
if (opt_benchmark)
ptarget[7] = 0x000f;
if (!init[thr_id])
{
size_t matrix_sz = 16 * sizeof(uint64_t) * 4 * 3;
cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
CUDA_LOG_ERROR();
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
blake256_cpu_init(thr_id, throughput);
bmw256_cpu_init(thr_id, throughput);
cuda_get_arch(thr_id); // cuda_arch[] also used in cubehash256
// SM 3 implementation requires a bit more memory
if (device_sm[dev_id] < 500 || cuda_arch[dev_id] < 500)
matrix_sz = 16 * sizeof(uint64_t) * 4 * 4;
CUDA_SAFE_CALL(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput));
lyra2v3_cpu_init(thr_id, throughput, d_matrix[thr_id]);
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], (size_t)32 * throughput));
api_set_throughput(thr_id, throughput);
init[thr_id] = true;
}
uint32_t endiandata[20];
for (int k=0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
blake256_cpu_setBlock_80(pdata);
bmw256_setTarget(ptarget);
do {
int order = 0;
blake256_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
lyra2v3_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
cubehash256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
lyra2v3_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
memset(work->nonces, 0, sizeof(work->nonces));
bmw256_cpu_hash_32(thr_id, throughput, pdata[19], d_hash[thr_id], work->nonces);
*hashes_done = pdata[19] - first_nonce + throughput;
if (work->nonces[0] != 0)
{
const uint32_t Htarg = ptarget[7];
uint32_t _ALIGN(64) vhash[8];
be32enc(&endiandata[19], work->nonces[0]);
lyra2v3_hash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
work->valid_nonces = 1;
work_set_target_ratio(work, vhash);
if (work->nonces[1] != 0) {
be32enc(&endiandata[19], work->nonces[1]);
lyra2v3_hash(vhash, endiandata);
bn_set_target_ratio(work, vhash, 1);
work->valid_nonces++;
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
} else {
pdata[19] = work->nonces[0] + 1; // cursor
}
return work->valid_nonces;
}
else if (vhash[7] > Htarg) {
gpu_increment_reject(thr_id);
if (!opt_quiet)
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
pdata[19] = work->nonces[0] + 1;
continue;
}
}
if ((uint64_t)throughput + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}
pdata[19] += throughput;
} while (!work_restart[thr_id].restart && !abort_flag);
*hashes_done = pdata[19] - first_nonce;
return 0;
}
// cleanup
extern "C" void free_lyra2v3(int thr_id)
{
if (!init[thr_id])
return;
cudaThreadSynchronize();
cudaFree(d_hash[thr_id]);
cudaFree(d_matrix[thr_id]);
init[thr_id] = false;
cudaDeviceSynchronize();
}
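
Device memory for lyra2v3 is dominated by d_matrix, which scales with the throughput picked above (intensity 20 on GTX 10xx / RTX 20 cards). A rough footprint sketch from the allocation sizes in this file (host-side arithmetic only; actual usage depends on the chosen intensity):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint32_t throughput = 1U << 20;                      /* intensity 20 */
	const size_t matrix_sz = 16 * sizeof(uint64_t) * 4 * 3;    /* 1536 bytes per thread on SM >= 5.0 */
	const size_t hash_buf  = (size_t)32 * throughput;          /* d_hash: 32 bytes per nonce */

	printf("d_matrix: %zu MiB, d_hash: %zu MiB\n",
	       (matrix_sz * throughput) >> 20, hash_buf >> 20);
	return 0;
}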

38
miner.h

@@ -274,13 +274,15 @@ void gostd(void *output, const void *input, size_t len);
struct work;
extern int scanhash_allium(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_bastion(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_blake256(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int8_t blakerounds);
extern int scanhash_blake2b(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_blake2s(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_bmw(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_c11(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_cryptolight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant);
extern int scanhash_cryptonight(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int variant);
extern int scanhash_decred(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_deep(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_equihash(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@@ -298,26 +300,31 @@ extern int scanhash_lbry(int thr_id, struct work *work, uint32_t max_nonce, unsi
extern int scanhash_luffa(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_lyra2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_lyra2v2(int thr_id,struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_lyra2v3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_lyra2Z(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_myriad(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_neoscrypt(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_nist5(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_pentablake(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_phi2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_polytimos(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_quark(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_qubit(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_sha256d(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_sha256t(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_sha256q(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_sib(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_skeincoin(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_skein2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_skunk(int thr_id, struct work *work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_s3(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_sonoa(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_timetravel(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_tribus(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_bitcore(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_exosis(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_vanilla(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done, int8_t blake_rounds);
extern int scanhash_veltor(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
extern int scanhash_whirl(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done);
@@ -341,9 +348,11 @@ extern int scanhash_scrypt_jane(int thr_id, struct work *work, uint32_t max_nonc
/* free device allocated memory per algo */
void algo_free_all(int thr_id);
extern void free_allium(int thr_id);
extern void free_bastion(int thr_id);
extern void free_bitcore(int thr_id);
extern void free_blake256(int thr_id);
extern void free_blake2b(int thr_id);
extern void free_blake2s(int thr_id);
extern void free_bmw(int thr_id);
extern void free_c11(int thr_id);
@@ -352,6 +361,7 @@ extern void free_cryptonight(int thr_id);
extern void free_decred(int thr_id);
extern void free_deep(int thr_id);
extern void free_equihash(int thr_id);
extern void free_exosis(int thr_id);
extern void free_keccak256(int thr_id);
extern void free_fresh(int thr_id);
extern void free_fugue256(int thr_id);
@@ -366,23 +376,27 @@ extern void free_lbry(int thr_id);
extern void free_luffa(int thr_id);
extern void free_lyra2(int thr_id);
extern void free_lyra2v2(int thr_id);
extern void free_lyra2v3(int thr_id);
extern void free_lyra2Z(int thr_id);
extern void free_myriad(int thr_id);
extern void free_neoscrypt(int thr_id);
extern void free_nist5(int thr_id);
extern void free_pentablake(int thr_id);
extern void free_phi(int thr_id);
extern void free_phi2(int thr_id);
extern void free_polytimos(int thr_id);
extern void free_quark(int thr_id);
extern void free_qubit(int thr_id);
extern void free_sha256d(int thr_id);
extern void free_sha256t(int thr_id);
extern void free_sha256q(int thr_id);
extern void free_sia(int thr_id);
extern void free_sib(int thr_id);
extern void free_skeincoin(int thr_id);
extern void free_skein2(int thr_id);
extern void free_skunk(int thr_id);
extern void free_s3(int thr_id);
extern void free_sonoa(int thr_id);
extern void free_timetravel(int thr_id);
extern void free_tribus(int thr_id);
extern void free_bitcore(int thr_id);
@@ -574,6 +588,8 @@ extern uint32_t device_plimit[MAX_GPUS];
extern uint32_t gpus_intensity[MAX_GPUS];
extern int opt_cudaschedule;
extern int cryptonight_fork;
// cuda.cpp
int cuda_num_devices();
void cuda_devicenames();
@@ -668,7 +684,7 @@ struct stratum_job {
unsigned char version[4];
unsigned char nbits[4];
unsigned char ntime[4];
unsigned char claim[32]; // lbry
unsigned char extra[64]; // like lbry claimtrie
bool clean;
unsigned char nreward[2];
uint32_t height;
@@ -890,14 +906,19 @@ void applog_hash64(void *hash);
void applog_compare_hash(void *hash, void *hash_ref);
void print_hash_tests(void);
void allium_hash(void *state, const void *input);
void bastionhash(void* output, const unsigned char* input);
void blake256hash(void *output, const void *input, int8_t rounds);
void blake2b_hash(void *output, const void *input);
void blake2s_hash(void *output, const void *input);
void bmw_hash(void *state, const void *input);
void c11hash(void *output, const void *input);
void cryptolight_hash(void* output, const void* input, int len);
void cryptonight_hash(void* output, const void* input, size_t len);
void cryptolight_hash_variant(void* output, const void* input, int len, int variant);
void cryptolight_hash(void* output, const void* input);
void cryptonight_hash_variant(void* output, const void* input, size_t len, int variant);
void cryptonight_hash(void* output, const void* input);
void monero_hash(void* output, const void* input);
void stellite_hash(void* output, const void* input);
void decred_hash(void *state, const void *input);
void deephash(void *state, const void *input);
void luffa_hash(void *state, const void *input);
@@ -914,12 +935,14 @@ void jha_hash(void *output, const void *input);
void lbry_hash(void *output, const void *input);
void lyra2re_hash(void *state, const void *input);
void lyra2v2_hash(void *state, const void *input);
void lyra2v3_hash(void *state, const void *input);
void lyra2Z_hash(void *state, const void *input);
void myriadhash(void *state, const void *input);
void neoscrypt(uchar *output, const uchar *input, uint32_t profile);
void nist5hash(void *state, const void *input);
void pentablakehash(void *output, const void *input);
void phihash(void *output, const void *input);
void phi_hash(void *output, const void *input);
void phi2_hash(void *output, const void *input);
void polytimos_hash(void *output, const void *input);
void quarkhash(void *state, const void *input);
void qubithash(void *state, const void *input);
@@ -927,6 +950,8 @@ void scrypthash(void* output, const void* input);
void scryptjane_hash(void* output, const void* input);
void sha256d_hash(void *output, const void *input);
void sha256t_hash(void *output, const void *input);
void sha256q_hash(void *output, const void *input);
void sia_blake2b_hash(void *output, const void *input);
void sibhash(void *output, const void *input);
void skeincoinhash(void *output, const void *input);
void skein2hash(void *output, const void *input);
@@ -934,6 +959,7 @@ void skunk_hash(void *state, const void *input);
void s3hash(void *output, const void *input);
void timetravel_hash(void *output, const void *input);
void bitcore_hash(void *output, const void *input);
void exosis_hash(void *output, const void *input);
void tribus_hash(void *output, const void *input);
void veltorhash(void *output, const void *input);
void wcoinhash(void *state, const void *input);

6
neoscrypt/cuda_neoscrypt.cu

@@ -179,7 +179,7 @@ static void shift256R4(uint32_t* ret, const uint8 &vec4, const uint32_t shift2)
idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \
a += b; d = rotateL(d^a, 16); \
c += d; b = rotateR(b^c, 12); \
idx = BLAKE2S_SIGMA[idx0][idx1+1]; a += key[idx]; \
idx = BLAKE2S_SIGMA[idx0][idx1 + 1]; a += key[idx]; \
a += b; d = rotateR(d^a, 8); \
c += d; b = rotateR(b^c, 7); \
}
@@ -392,7 +392,7 @@ void Blake2S(uint32_t *out, const uint32_t* const __restrict__ inout, const ui
idx = BLAKE2S_SIGMA[idx0][idx1]; a += key[idx]; \
a += b; d = __byte_perm(d^a, 0, 0x1032); \
c += d; b = rotateR(b^c, 12); \
idx = BLAKE2S_SIGMA[idx0][idx1+1]; a += key[idx]; \
idx = BLAKE2S_SIGMA[idx0][idx1 + 1]; a += key[idx]; \
a += b; d = __byte_perm(d^a, 0, 0x0321); \
c += d; b = rotateR(b^c, 7); \
}
@@ -1260,7 +1260,7 @@ uint32_t fastkdf32_v3(uint32_t thread, const uint32_t nonce, uint32_t* const sal
idx = BLAKE2S_SIGMA_host[idx0][idx1]; a += key[idx]; \
a += b; d = ROTR32(d^a,16); \
c += d; b = ROTR32(b^c, 12); \
idx = BLAKE2S_SIGMA_host[idx0][idx1+1]; a += key[idx]; \
idx = BLAKE2S_SIGMA_host[idx0][idx1 + 1]; a += key[idx]; \
a += b; d = ROTR32(d^a,8); \
c += d; b = ROTR32(b^c, 7); \
}

89
phi/cuda_phi2.cu

@@ -0,0 +1,89 @@
#include <stdio.h>
#include <memory.h>
#include "cuda_helper.h"
__global__ __launch_bounds__(128, 8)
void phi_filter_gpu(const uint32_t threads, const uint32_t* d_hash, uint32_t* d_branch2, uint32_t* d_NonceBranch)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const uint32_t offset = thread * 16U; // 64U / sizeof(uint32_t);
uint4 *psrc = (uint4*) (&d_hash[offset]);
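// the low bit of the first hash byte selects the phi2 branch, as used by phi2_hash (odd = gost/streebog, even = double echo)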
d_NonceBranch[thread] = ((uint8_t*)psrc)[0] & 1;
if (d_NonceBranch[thread]) return;
if (d_branch2) {
uint4 *pdst = (uint4*)(&d_branch2[offset]);
uint4 data;
data = psrc[0]; pdst[0] = data;
data = psrc[1]; pdst[1] = data;
data = psrc[2]; pdst[2] = data;
data = psrc[3]; pdst[3] = data;
}
}
}
__global__ __launch_bounds__(128, 8)
void phi_merge_gpu(const uint32_t threads, uint32_t* d_hash, uint32_t* d_branch2, uint32_t* const d_NonceBranch)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads && !d_NonceBranch[thread])
{
const uint32_t offset = thread * 16U;
uint4 *psrc = (uint4*) (&d_branch2[offset]);
uint4 *pdst = (uint4*) (&d_hash[offset]);
uint4 data;
data = psrc[0]; pdst[0] = data;
data = psrc[1]; pdst[1] = data;
data = psrc[2]; pdst[2] = data;
data = psrc[3]; pdst[3] = data;
}
}
__global__
void phi_final_compress_gpu(const uint32_t threads, uint32_t* d_hash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const uint32_t offset = thread * 16U;
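// fold the upper 32 bytes of the 64-byte hash into the lower half (hash[i] ^= hash[i+32]), as in the CPU phi2_hash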
uint2 *psrc = (uint2*) (&d_hash[offset]);
uint2 *pdst = (uint2*) (&d_hash[offset]);
uint2 data;
data = psrc[4]; pdst[0] ^= data;
data = psrc[5]; pdst[1] ^= data;
data = psrc[6]; pdst[2] ^= data;
data = psrc[7]; pdst[3] ^= data;
}
}
__host__
uint32_t phi_filter_cuda(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_br2, uint32_t* d_nonces)
{
const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
// extract algo permutation hashes to a second branch buffer
phi_filter_gpu <<<grid, block>>> (threads, inpHashes, d_br2, d_nonces);
return threads;
}
__host__
void phi_merge_cuda(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_br2, uint32_t* d_nonces)
{
const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
// copy the second branch hashes back into the common buffer d_hash
phi_merge_gpu <<<grid, block>>> (threads, outpHashes, d_br2, d_nonces);
}
__host__
void phi_final_compress_cuda(const int thr_id, const uint32_t threads, uint32_t *d_hashes)
{
const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
phi_final_compress_gpu <<<grid, block>>> (threads, d_hashes);
}

319
phi/cuda_phi2_cubehash512.cu

@@ -0,0 +1,319 @@
/* phi2 cubehash-512 144-bytes input (80 + 64) */
#include <cuda_helper.h>
#include <cuda_vectors.h>
#define CUBEHASH_ROUNDS 16 /* this is r for CubeHashr/b */
#define CUBEHASH_BLOCKBYTES 32 /* this is b for CubeHashr/b */
#if __CUDA_ARCH__ < 350
#define LROT(x,bits) ((x << bits) | (x >> (32 - bits)))
#else
#define LROT(x, bits) __funnelshift_l(x, x, bits)
#endif
#define ROTATEUPWARDS7(a) LROT(a,7)
#define ROTATEUPWARDS11(a) LROT(a,11)
#define SWAP(a,b) { uint32_t u = a; a = b; b = u; }
#ifdef NO_MIDSTATE
__device__ __constant__
static const uint32_t c_IV_512[32] = {
0x2AEA2A61, 0x50F494D4, 0x2D538B8B, 0x4167D83E,
0x3FEE2313, 0xC701CF8C, 0xCC39968E, 0x50AC5695,
0x4D42C787, 0xA647A8B3, 0x97CF0BEF, 0x825B4537,
0xEEF864D2, 0xF22090C4, 0xD0E5CD33, 0xA23911AE,
0xFCD398D9, 0x148FE485, 0x1B017BEF, 0xB6444532,
0x6A536159, 0x2FF5781C, 0x91FA7934, 0x0DBADEA9,
0xD65C8A2B, 0xA5A70E75, 0xB1C62456, 0xBC796576,
0x1921C8F7, 0xE7989AF1, 0x7795D246, 0xD43E3B44
};
#endif
__device__ __forceinline__
static void rrounds(uint32_t x[2][2][2][2][2])
{
int r;
int j;
int k;
int l;
int m;
//#pragma unroll 16
for (r = 0;r < CUBEHASH_ROUNDS;++r) {
/* "add x_0jklm into x_1jklmn modulo 2^32" */
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[1][j][k][l][m] += x[0][j][k][l][m];
/* "rotate x_0jklm upwards by 7 bits" */
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[0][j][k][l][m] = ROTATEUPWARDS7(x[0][j][k][l][m]);
/* "swap x_00klm with x_01klm" */
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
SWAP(x[0][0][k][l][m],x[0][1][k][l][m])
/* "xor x_1jklm into x_0jklm" */
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[0][j][k][l][m] ^= x[1][j][k][l][m];
/* "swap x_1jk0m with x_1jk1m" */
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (m = 0;m < 2;++m)
SWAP(x[1][j][k][0][m],x[1][j][k][1][m])
/* "add x_0jklm into x_1jklm modulo 2^32" */
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[1][j][k][l][m] += x[0][j][k][l][m];
/* "rotate x_0jklm upwards by 11 bits" */
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[0][j][k][l][m] = ROTATEUPWARDS11(x[0][j][k][l][m]);
/* "swap x_0j0lm with x_0j1lm" */
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
SWAP(x[0][j][0][l][m],x[0][j][1][l][m])
/* "xor x_1jklm into x_0jklm" */
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
#pragma unroll 2
for (m = 0;m < 2;++m)
x[0][j][k][l][m] ^= x[1][j][k][l][m];
/* "swap x_1jkl0 with x_1jkl1" */
#pragma unroll 2
for (j = 0;j < 2;++j)
#pragma unroll 2
for (k = 0;k < 2;++k)
#pragma unroll 2
for (l = 0;l < 2;++l)
SWAP(x[1][j][k][l][0],x[1][j][k][l][1])
}
}
__device__ __forceinline__
static void block_tox(uint32_t* const block, uint32_t x[2][2][2][2][2])
{
// read 32 bytes of input from global mem in uint2 chunks
AS_UINT2(x[0][0][0][0]) ^= AS_UINT2(&block[0]);
AS_UINT2(x[0][0][0][1]) ^= AS_UINT2(&block[2]);
AS_UINT2(x[0][0][1][0]) ^= AS_UINT2(&block[4]);
AS_UINT2(x[0][0][1][1]) ^= AS_UINT2(&block[6]);
}
__device__ __forceinline__
static void hash_fromx(uint32_t hash[16], uint32_t const x[2][2][2][2][2])
{
// used to write final hash to global mem
AS_UINT2(&hash[ 0]) = AS_UINT2(x[0][0][0][0]);
AS_UINT2(&hash[ 2]) = AS_UINT2(x[0][0][0][1]);
AS_UINT2(&hash[ 4]) = AS_UINT2(x[0][0][1][0]);
AS_UINT2(&hash[ 6]) = AS_UINT2(x[0][0][1][1]);
AS_UINT2(&hash[ 8]) = AS_UINT2(x[0][1][0][0]);
AS_UINT2(&hash[10]) = AS_UINT2(x[0][1][0][1]);
AS_UINT2(&hash[12]) = AS_UINT2(x[0][1][1][0]);
AS_UINT2(&hash[14]) = AS_UINT2(x[0][1][1][1]);
}
#define Init(x) \
AS_UINT2(x[0][0][0][0]) = AS_UINT2(&c_IV_512[ 0]); \
AS_UINT2(x[0][0][0][1]) = AS_UINT2(&c_IV_512[ 2]); \
AS_UINT2(x[0][0][1][0]) = AS_UINT2(&c_IV_512[ 4]); \
AS_UINT2(x[0][0][1][1]) = AS_UINT2(&c_IV_512[ 6]); \
AS_UINT2(x[0][1][0][0]) = AS_UINT2(&c_IV_512[ 8]); \
AS_UINT2(x[0][1][0][1]) = AS_UINT2(&c_IV_512[10]); \
AS_UINT2(x[0][1][1][0]) = AS_UINT2(&c_IV_512[12]); \
AS_UINT2(x[0][1][1][1]) = AS_UINT2(&c_IV_512[14]); \
AS_UINT2(x[1][0][0][0]) = AS_UINT2(&c_IV_512[16]); \
AS_UINT2(x[1][0][0][1]) = AS_UINT2(&c_IV_512[18]); \
AS_UINT2(x[1][0][1][0]) = AS_UINT2(&c_IV_512[20]); \
AS_UINT2(x[1][0][1][1]) = AS_UINT2(&c_IV_512[22]); \
AS_UINT2(x[1][1][0][0]) = AS_UINT2(&c_IV_512[24]); \
AS_UINT2(x[1][1][0][1]) = AS_UINT2(&c_IV_512[26]); \
AS_UINT2(x[1][1][1][0]) = AS_UINT2(&c_IV_512[28]); \
AS_UINT2(x[1][1][1][1]) = AS_UINT2(&c_IV_512[30]);
__device__ __forceinline__
static void Update32(uint32_t x[2][2][2][2][2], uint32_t* const data)
{
/* "xor the block into the first b bytes of the state" */
block_tox(data, x);
/* "and then transform the state invertibly through r identical rounds" */
rrounds(x);
}
__device__ __forceinline__
static void Final(uint32_t x[2][2][2][2][2], uint32_t *hashval)
{
/* "the integer 1 is xored into the last state word x_11111" */
x[1][1][1][1][1] ^= 1;
/* "the state is then transformed invertibly through 10r identical rounds" */
#pragma unroll 10
for (int i = 0; i < 10; i++) rrounds(x);
/* "output the first h/8 bytes of the state" */
hash_fromx(hashval, x);
}
__host__ void phi2_cubehash512_cpu_init(int thr_id, uint32_t threads) { }
/***************************************************/
/**
* Timetravel and x16 CUBEHASH-80 CUDA implementation
* by tpruvot@github - Jan 2017 / May 2018
*/
__constant__ static uint32_t c_midstate128[32];
__constant__ static uint32_t c_PaddedMessage_144[36];
#undef SPH_C32
#undef SPH_C64
#undef SPH_T32
#undef SPH_T64
#include "sph/sph_cubehash.h"
__host__
void cubehash512_setBlock_144(int thr_id, uint32_t* endiandata)
{
sph_cubehash512_context ctx_cubehash;
sph_cubehash512_init(&ctx_cubehash);
sph_cubehash512(&ctx_cubehash, (void*)endiandata, 64);
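// absorb the first 64 bytes (two 32-byte CubeHash blocks) on the host; the GPU kernel resumes from this midstate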
#ifndef NO_MIDSTATE
cudaMemcpyToSymbol(c_midstate128, ctx_cubehash.state, 128, 0, cudaMemcpyHostToDevice);
#endif
cudaMemcpyToSymbol(c_PaddedMessage_144, endiandata, sizeof(c_PaddedMessage_144), 0, cudaMemcpyHostToDevice);
}
__global__
void cubehash512_gpu_hash_144(const uint32_t threads, const uint32_t startNounce, uint64_t *g_outhash)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
if (thread < threads)
{
const uint32_t nonce = startNounce + thread;
uint32_t message[8];
uint32_t x[2][2][2][2][2];
#ifdef NO_MIDSTATE
Init(x);
// first 32 bytes
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[0]);
AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage_144[4]);
Update32(x, message);
// second 32 bytes
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[8]);
AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage_144[12]);
Update32(x, message);
#else
AS_UINT2(x[0][0][0][0]) = AS_UINT2(&c_midstate128[ 0]);
AS_UINT2(x[0][0][0][1]) = AS_UINT2(&c_midstate128[ 2]);
AS_UINT2(x[0][0][1][0]) = AS_UINT2(&c_midstate128[ 4]);
AS_UINT2(x[0][0][1][1]) = AS_UINT2(&c_midstate128[ 6]);
AS_UINT2(x[0][1][0][0]) = AS_UINT2(&c_midstate128[ 8]);
AS_UINT2(x[0][1][0][1]) = AS_UINT2(&c_midstate128[10]);
AS_UINT2(x[0][1][1][0]) = AS_UINT2(&c_midstate128[12]);
AS_UINT2(x[0][1][1][1]) = AS_UINT2(&c_midstate128[14]);
AS_UINT2(x[1][0][0][0]) = AS_UINT2(&c_midstate128[16]);
AS_UINT2(x[1][0][0][1]) = AS_UINT2(&c_midstate128[18]);
AS_UINT2(x[1][0][1][0]) = AS_UINT2(&c_midstate128[20]);
AS_UINT2(x[1][0][1][1]) = AS_UINT2(&c_midstate128[22]);
AS_UINT2(x[1][1][0][0]) = AS_UINT2(&c_midstate128[24]);
AS_UINT2(x[1][1][0][1]) = AS_UINT2(&c_midstate128[26]);
AS_UINT2(x[1][1][1][0]) = AS_UINT2(&c_midstate128[28]);
AS_UINT2(x[1][1][1][1]) = AS_UINT2(&c_midstate128[30]);
#endif
// nonce + state root
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[16]);
message[3] = cuda_swab32(nonce);
AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage_144[20]); // state
Update32(x, message);
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[24]); // state
AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage_144[28]); // utxo
Update32(x, message);
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage_144[32]); // utxo
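// 144 = 4*32 + 16: only 16 message bytes remain here, so append the CubeHash padding byte 0x80 and zero-fill the rest of the block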
message[4] = 0x80;
message[5] = 0;
message[6] = 0;
message[7] = 0;
Update32(x, message);
uint32_t* output = (uint32_t*) (&g_outhash[(size_t)8 * thread]);
Final(x, output);
}
}
__host__
void cubehash512_cuda_hash_144(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash)
{
const uint32_t threadsperblock = 256;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
cubehash512_gpu_hash_144 <<<grid, block>>> (threads, startNounce, (uint64_t*) d_hash);
}

8
x11/phi.cu → phi/phi.cu

@@ -19,7 +19,7 @@ extern "C" {
#include "miner.h"
#include "cuda_helper.h"
#include "cuda_x11.h"
#include "x11/cuda_x11.h"
extern void skein512_cpu_setBlock_80(void *pdata);
extern void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *d_hash, int swap);
@@ -38,7 +38,7 @@ extern void tribus_echo512_final(int thr_id, uint32_t threads, uint32_t *d_hash,
static uint32_t *d_hash[MAX_GPUS];
static uint32_t *d_resNonce[MAX_GPUS];
extern "C" void phihash(void *output, const void *input)
extern "C" void phi_hash(void *output, const void *input)
{
unsigned char _ALIGN(128) hash[128] = { 0 };
@@ -162,7 +162,7 @@ extern "C" int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, u
uint32_t _ALIGN(64) vhash[8];
if (!use_compat_kernels[thr_id]) work->nonces[0] += startNonce;
be32enc(&endiandata[19], work->nonces[0]);
phihash(vhash, endiandata);
phi_hash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
work->valid_nonces = 1;
@@ -173,7 +173,7 @@ extern "C" int scanhash_phi(int thr_id, struct work* work, uint32_t max_nonce, u
if (work->nonces[1] != UINT32_MAX) {
work->nonces[1] += startNonce;
be32enc(&endiandata[19], work->nonces[1]);
phihash(vhash, endiandata);
phi_hash(vhash, endiandata);
bn_set_target_ratio(work, vhash, 1);
work->valid_nonces++;
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;

268
phi/phi2.cu

@@ -0,0 +1,268 @@
//
// PHI2 algo (with smart contracts header)
// CubeHash + Lyra2 x2 + JH + Gost or Echo + Skein
//
// Implemented by tpruvot in May 2018
//
extern "C" {
#include "sph/sph_skein.h"
#include "sph/sph_jh.h"
#include "sph/sph_cubehash.h"
#include "sph/sph_streebog.h"
#include "sph/sph_echo.h"
#include "lyra2/Lyra2.h"
}
#include "miner.h"
#include "cuda_helper.h"
#include "x11/cuda_x11.h"
#include <stdio.h>
#include <memory.h>
extern void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata);
extern void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash);
extern void cubehash512_setBlock_144(int thr_id, uint32_t* endiandata);
extern void cubehash512_cuda_hash_144(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash);
extern void lyra2_cpu_init(int thr_id, uint32_t threads, uint64_t *d_matrix);
extern void lyra2_cuda_hash_64(int thr_id, const uint32_t threads, uint64_t* d_hash_256, uint32_t* d_hash_512, bool gtx750ti);
extern void streebog_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
extern void phi_streebog_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t *g_hash, uint32_t *d_filter);
extern void phi_echo512_cpu_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t* g_hash, uint32_t* d_filter);
extern uint32_t phi_filter_cuda(const int thr_id, const uint32_t threads, const uint32_t *inpHashes, uint32_t* d_br2, uint32_t* d_nonces);
extern void phi_merge_cuda(const int thr_id, const uint32_t threads, uint32_t *outpHashes, uint32_t* d_br2, uint32_t* d_nonces);
extern void phi_final_compress_cuda(const int thr_id, const uint32_t threads, uint32_t *d_hashes);
static uint64_t* d_matrix[MAX_GPUS];
static uint32_t* d_hash_512[MAX_GPUS];
static uint64_t* d_hash_256[MAX_GPUS];
static uint32_t* d_hash_br2[MAX_GPUS];
static uint32_t* d_nonce_br[MAX_GPUS];
static bool has_roots;
extern "C" void phi2_hash(void *output, const void *input)
{
unsigned char _ALIGN(128) hash[64];
unsigned char _ALIGN(128) hashA[64];
unsigned char _ALIGN(128) hashB[64];
sph_cubehash512_context ctx_cubehash;
sph_jh512_context ctx_jh;
sph_gost512_context ctx_gost;
sph_echo512_context ctx_echo;
sph_skein512_context ctx_skein;
sph_cubehash512_init(&ctx_cubehash);
sph_cubehash512(&ctx_cubehash, input, has_roots ? 144 : 80);
sph_cubehash512_close(&ctx_cubehash, (void*)hashB);
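// run Lyra2 independently on each 32-byte half of the 64-byte CubeHash output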
LYRA2(&hashA[ 0], 32, &hashB[ 0], 32, &hashB[ 0], 32, 1, 8, 8);
LYRA2(&hashA[32], 32, &hashB[32], 32, &hashB[32], 32, 1, 8, 8);
sph_jh512_init(&ctx_jh);
sph_jh512(&ctx_jh, (const void*)hashA, 64);
sph_jh512_close(&ctx_jh, (void*)hash);
if (hash[0] & 1) {
sph_gost512_init(&ctx_gost);
sph_gost512(&ctx_gost, (const void*)hash, 64);
sph_gost512_close(&ctx_gost, (void*)hash);
} else {
sph_echo512_init(&ctx_echo);
sph_echo512(&ctx_echo, (const void*)hash, 64);
sph_echo512_close(&ctx_echo, (void*)hash);
sph_echo512_init(&ctx_echo);
sph_echo512(&ctx_echo, (const void*)hash, 64);
sph_echo512_close(&ctx_echo, (void*)hash);
}
sph_skein512_init(&ctx_skein);
sph_skein512(&ctx_skein, (const void*)hash, 64);
sph_skein512_close(&ctx_skein, (void*)hash);
for (int i=0; i<32; i++)
hash[i] ^= hash[i+32];
memcpy(output, hash, 32);
}
//#define _DEBUG
#define _DEBUG_PREFIX "phi-"
#include "cuda_debug.cuh"
static bool init[MAX_GPUS] = { 0 };
static bool use_compat_kernels[MAX_GPUS] = { 0 };
static __thread bool gtx750ti = false;
extern "C" int scanhash_phi2(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const int dev_id = device_map[thr_id];
int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 17 : 16;
if (device_sm[dev_id] == 500) intensity = 15;
if (device_sm[dev_id] == 600) intensity = 17;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity);
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
if (init[thr_id]) throughput = max(throughput & 0xffffff80, 128); // for shared mem
if (opt_benchmark)
ptarget[7] = 0xff;
if (!init[thr_id])
{
cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
cuda_get_arch(thr_id);
use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
gtx750ti = (strstr(device_name[dev_id], "GTX 750 Ti") != NULL);
size_t matrix_sz = device_sm[dev_id] > 500 ? sizeof(uint64_t) * 16 : sizeof(uint64_t) * 8 * 8 * 3 * 4;
CUDA_CALL_OR_RET_X(cudaMalloc(&d_matrix[thr_id], matrix_sz * throughput), -1);
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_256[thr_id], (size_t)32 * throughput), -1);
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_512[thr_id], (size_t)64 * throughput), -1);
CUDA_CALL_OR_RET_X(cudaMalloc(&d_nonce_br[thr_id], sizeof(uint32_t) * throughput), -1);
if (use_compat_kernels[thr_id]) {
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash_br2[thr_id], (size_t)64 * throughput), -1);
}
lyra2_cpu_init(thr_id, throughput, d_matrix[thr_id]);
quark_jh512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);
if (use_compat_kernels[thr_id]) x11_echo512_cpu_init(thr_id, throughput);
cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true;
}
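// phi2 block headers are 80 bytes, or 144 bytes when the smart-contract root words (20..35) are set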
has_roots = false;
uint32_t endiandata[36];
for (int k = 0; k < 36; k++) {
be32enc(&endiandata[k], pdata[k]);
if (k >= 20 && pdata[k]) has_roots = true;
}
cuda_check_cpu_setTarget(ptarget);
if (has_roots)
cubehash512_setBlock_144(thr_id, endiandata);
else
cubehash512_setBlock_80(thr_id, endiandata);
do {
int order = 0;
if (has_roots)
cubehash512_cuda_hash_144(thr_id, throughput, pdata[19], d_hash_512[thr_id]);
else
cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash_512[thr_id]);
order++;
TRACE("cube ");
lyra2_cuda_hash_64(thr_id, throughput, d_hash_256[thr_id], d_hash_512[thr_id], gtx750ti);
order++;
TRACE("lyra ");
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_512[thr_id], order++);
TRACE("jh ");
order++;
if (!use_compat_kernels[thr_id]) {
phi_filter_cuda(thr_id, throughput, d_hash_512[thr_id], NULL, d_nonce_br[thr_id]);
phi_streebog_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]);
phi_echo512_cpu_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]);
phi_echo512_cpu_hash_64_filtered(thr_id, throughput, d_hash_512[thr_id], d_nonce_br[thr_id]);
} else {
// TODO: use a nonce vector to reduce the number of hashes to compute
phi_filter_cuda(thr_id, throughput, d_hash_512[thr_id], d_hash_br2[thr_id], d_nonce_br[thr_id]);
streebog_cpu_hash_64(thr_id, throughput, d_hash_512[thr_id]);
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order);
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_br2[thr_id], order);
phi_merge_cuda(thr_id, throughput, d_hash_512[thr_id], d_hash_br2[thr_id], d_nonce_br[thr_id]);
}
TRACE("mix ");
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash_512[thr_id], order++);
TRACE("skein ");
phi_final_compress_cuda(thr_id, throughput, d_hash_512[thr_id]);
TRACE("xor ");
work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash_512[thr_id]);
if (work->nonces[0] != UINT32_MAX)
{
const uint32_t Htarg = ptarget[7];
uint32_t _ALIGN(64) vhash[8];
be32enc(&endiandata[19], work->nonces[0]);
phi2_hash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
work->valid_nonces = 1;
work_set_target_ratio(work, vhash);
*hashes_done = pdata[19] - first_nonce + throughput;
work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash_512[thr_id], 1);
if (work->nonces[1] != 0) {
be32enc(&endiandata[19], work->nonces[1]);
phi2_hash(vhash, endiandata);
bn_set_target_ratio(work, vhash, 1);
work->valid_nonces++;
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
} else {
pdata[19] = work->nonces[0] + 1; // cursor
}
if (pdata[19] > max_nonce) pdata[19] = max_nonce;
return work->valid_nonces;
}
else if (vhash[7] > Htarg) {
gpu_increment_reject(thr_id);
if (!opt_quiet)
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU! thr=%x", work->nonces[0], throughput);
pdata[19] = work->nonces[0] + 1;
continue;
}
}
if ((uint64_t)throughput + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}
pdata[19] += throughput;
} while (!work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce;
return 0;
}
// cleanup
extern "C" void free_phi2(int thr_id)
{
if (!init[thr_id])
return;
cudaThreadSynchronize();
cudaFree(d_matrix[thr_id]);
cudaFree(d_hash_512[thr_id]);
cudaFree(d_hash_256[thr_id]);
cudaFree(d_nonce_br[thr_id]);
if (use_compat_kernels[thr_id]) cudaFree(d_hash_br2[thr_id]);
cuda_check_cpu_free(thr_id);
init[thr_id] = false;
cudaDeviceSynchronize();
}

10
res/ccminer.rc

@@ -60,8 +60,8 @@ IDI_ICON1 ICON "ccminer.ico"
//
VS_VERSION_INFO VERSIONINFO
FILEVERSION 2,2,5,0
PRODUCTVERSION 2,2,5,0
FILEVERSION 2,3,1,0
PRODUCTVERSION 2,3,1,0
FILEFLAGSMASK 0x3fL
#ifdef _DEBUG
FILEFLAGS 0x21L
@@ -76,10 +76,10 @@ BEGIN
BEGIN
BLOCK "040904e4"
BEGIN
VALUE "FileVersion", "2.2.5"
VALUE "LegalCopyright", "Copyright (C) 2018"
VALUE "FileVersion", "2.3.1"
VALUE "LegalCopyright", "Copyright (C) 2019"
VALUE "ProductName", "ccminer"
VALUE "ProductVersion", "2.2.5"
VALUE "ProductVersion", "2.3.1"
END
END
BLOCK "VarFileInfo"

14
scrypt.cpp

@@ -50,7 +50,17 @@ using namespace Concurrency;
#if _MSC_VER > 1800
#undef _THROW1
#if __cplusplus < 201101L
#define _THROW1(x) throw(std::bad_alloc)
#else
#define _THROW1(x) noexcept(false)
#endif
#elif !defined(_MSC_VER)
#if __cplusplus < 201101L
#define _THROW1(x) throw(std::bad_alloc)
#else
#define _THROW1(x) noexcept(false)
#endif
#endif
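// dynamic exception specifications like throw(std::bad_alloc) are deprecated since C++11 and removed in C++17, hence the noexcept(false) fallback on newer standards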
// A thin wrapper around the builtin __m128i type
@@ -63,9 +73,9 @@ public:
void * operator new[](size_t size) _THROW1(_STD bad_alloc) { void *p; if ((p = _aligned_malloc(size, 16)) == 0) { static const std::bad_alloc nomem; _RAISE(nomem); } return (p); }
void operator delete[](void *p) { _aligned_free(p); }
#else
void * operator new(size_t size) throw(std::bad_alloc) { void *p; if (posix_memalign(&p, 16, size) < 0) { static const std::bad_alloc nomem; throw nomem; } return (p); }
void * operator new(size_t size) _THROW1(_STD bad_alloc) { void *p; if (posix_memalign(&p, 16, size) < 0) { static const std::bad_alloc nomem; throw nomem; } return (p); }
void operator delete(void *p) { free(p); }
void * operator new[](size_t size) throw(std::bad_alloc) { void *p; if (posix_memalign(&p, 16, size) < 0) { static const std::bad_alloc nomem; throw nomem; } return (p); }
void * operator new[](size_t size) _THROW1(_STD bad_alloc) { void *p; if (posix_memalign(&p, 16, size) < 0) { static const std::bad_alloc nomem; throw nomem; } return (p); }
void operator delete[](void *p) { free(p); }
#endif
uint32x4_t() { };

5
scrypt/test_kernel.cu

@@ -47,7 +47,7 @@ texture<uint4, 2, cudaReadModeElementType> texRef2D_4_V;
template <int ALGO> __device__ __forceinline__ void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3);
static __host__ __device__ uint4& operator^=(uint4& left, const uint4& right) {
static __device__ uint4& operator^=(uint4& left, const uint4& right) {
left.x ^= right.x;
left.y ^= right.y;
left.z ^= right.z;
@@ -55,7 +55,7 @@ static __host__ __device__ uint4& operator^=(uint4& left, const uint4& right) {
return left;
}
static __host__ __device__ uint4& operator+=(uint4& left, const uint4& right) {
static __device__ uint4& operator+=(uint4& left, const uint4& right) {
left.x += right.x;
left.y += right.y;
left.z += right.z;
@@ -63,7 +63,6 @@ static __host__ __device__ uint4& operator+=(uint4& left, const uint4& right) {
return left;
}
/* write_keys writes the 8 keys being processed by a warp to the global
* scratchpad. To effectively use memory bandwidth, it performs the writes
* (and reads, for read_keys) 128 bytes at a time per memory location

4
scrypt/titan_kernel.cu

@@ -50,7 +50,7 @@ __constant__ uint32_t c_SCRATCH_WU_PER_WARP_1; // (SCRATCH * WU_PER_WARP)-1
template <int ALGO> __device__ __forceinline__ void block_mixer(uint4 &b, uint4 &bx, const int x1, const int x2, const int x3);
static __host__ __device__ uint4& operator ^= (uint4& left, const uint4& right) {
static __device__ uint4& operator ^= (uint4& left, const uint4& right) {
left.x ^= right.x;
left.y ^= right.y;
left.z ^= right.z;
@@ -58,7 +58,7 @@ static __host__ __device__ uint4& operator ^= (uint4& left, const uint4& right)
return left;
}
static __host__ __device__ uint4& operator += (uint4& left, const uint4& right) {
static __device__ uint4& operator += (uint4& left, const uint4& right) {
left.x += right.x;
left.y += right.y;
left.z += right.z;

507
sha256/cuda_sha256q.cu

@@ -0,0 +1,507 @@
/*
* sha256(-q) CUDA implementation.
* pyritepirate 2018
* tpruvot 2017
*/
#include <stdio.h>
#include <stdint.h>
#include <memory.h>
#include <cuda_helper.h>
#include <miner.h>
__constant__ static uint32_t __align__(8) c_midstate76[8];
__constant__ static uint32_t __align__(8) c_dataEnd80[4];
const __constant__ uint32_t __align__(8) c_H256[8] = {
0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU,
0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U
};
__constant__ static uint32_t __align__(8) c_K[64];
__constant__ static uint32_t __align__(8) c_target[2];
__device__ uint64_t d_target[1];
static uint32_t* d_resNonces[MAX_GPUS] = { 0 };
// ------------------------------------------------------------------------------------------------
static const uint32_t cpu_H256[8] = {
0x6A09E667U, 0xBB67AE85U, 0x3C6EF372U, 0xA54FF53AU,
0x510E527FU, 0x9B05688CU, 0x1F83D9ABU, 0x5BE0CD19U
};
static const uint32_t cpu_K[64] = {
0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2
};
#define ROTR ROTR32
__host__
static void sha256_step1_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d,
uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
uint32_t in, const uint32_t Kshared)
{
uint32_t t1,t2;
uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a);
uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c);
t1 = h + bsg21 + vxandx + Kshared + in;
t2 = bsg20 + andorv;
d = d + t1;
h = t1 + t2;
}
__host__
static void sha256_step2_host(uint32_t a, uint32_t b, uint32_t c, uint32_t &d,
uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
uint32_t* in, uint32_t pc, const uint32_t Kshared)
{
uint32_t t1,t2;
int pcidx1 = (pc-2) & 0xF;
int pcidx2 = (pc-7) & 0xF;
int pcidx3 = (pc-15) & 0xF;
uint32_t inx0 = in[pc];
uint32_t inx1 = in[pcidx1];
uint32_t inx2 = in[pcidx2];
uint32_t inx3 = in[pcidx3];
uint32_t ssg21 = ROTR(inx1, 17) ^ ROTR(inx1, 19) ^ SPH_T32((inx1) >> 10); //ssg2_1(inx1);
uint32_t ssg20 = ROTR(inx3, 7) ^ ROTR(inx3, 18) ^ SPH_T32((inx3) >> 3); //ssg2_0(inx3);
uint32_t vxandx = (((f) ^ (g)) & (e)) ^ (g); // xandx(e, f, g);
uint32_t bsg21 = ROTR(e, 6) ^ ROTR(e, 11) ^ ROTR(e, 25); // bsg2_1(e);
uint32_t bsg20 = ROTR(a, 2) ^ ROTR(a, 13) ^ ROTR(a, 22); //bsg2_0(a);
uint32_t andorv = ((b) & (c)) | (((b) | (c)) & (a)); //andor32(a,b,c);
in[pc] = ssg21 + inx2 + ssg20 + inx0;
t1 = h + bsg21 + vxandx + Kshared + in[pc];
t2 = bsg20 + andorv;
d = d + t1;
h = t1 + t2;
}
__host__
static void sha256_round_body_host(uint32_t* in, uint32_t* state, const uint32_t* Kshared)
{
uint32_t a = state[0];
uint32_t b = state[1];
uint32_t c = state[2];
uint32_t d = state[3];
uint32_t e = state[4];
uint32_t f = state[5];
uint32_t g = state[6];
uint32_t h = state[7];
sha256_step1_host(a,b,c,d,e,f,g,h,in[ 0], Kshared[ 0]);
sha256_step1_host(h,a,b,c,d,e,f,g,in[ 1], Kshared[ 1]);
sha256_step1_host(g,h,a,b,c,d,e,f,in[ 2], Kshared[ 2]);
sha256_step1_host(f,g,h,a,b,c,d,e,in[ 3], Kshared[ 3]);
sha256_step1_host(e,f,g,h,a,b,c,d,in[ 4], Kshared[ 4]);
sha256_step1_host(d,e,f,g,h,a,b,c,in[ 5], Kshared[ 5]);
sha256_step1_host(c,d,e,f,g,h,a,b,in[ 6], Kshared[ 6]);
sha256_step1_host(b,c,d,e,f,g,h,a,in[ 7], Kshared[ 7]);
sha256_step1_host(a,b,c,d,e,f,g,h,in[ 8], Kshared[ 8]);
sha256_step1_host(h,a,b,c,d,e,f,g,in[ 9], Kshared[ 9]);
sha256_step1_host(g,h,a,b,c,d,e,f,in[10], Kshared[10]);
sha256_step1_host(f,g,h,a,b,c,d,e,in[11], Kshared[11]);
sha256_step1_host(e,f,g,h,a,b,c,d,in[12], Kshared[12]);
sha256_step1_host(d,e,f,g,h,a,b,c,in[13], Kshared[13]);
sha256_step1_host(c,d,e,f,g,h,a,b,in[14], Kshared[14]);
sha256_step1_host(b,c,d,e,f,g,h,a,in[15], Kshared[15]);
for (int i=0; i<3; i++)
{
sha256_step2_host(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]);
sha256_step2_host(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]);
sha256_step2_host(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]);
sha256_step2_host(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]);
sha256_step2_host(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]);
sha256_step2_host(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]);
sha256_step2_host(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]);
sha256_step2_host(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]);
sha256_step2_host(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]);
sha256_step2_host(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]);
sha256_step2_host(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]);
sha256_step2_host(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]);
sha256_step2_host(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]);
sha256_step2_host(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]);
sha256_step2_host(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]);
sha256_step2_host(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]);
}
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
state[5] += f;
state[6] += g;
state[7] += h;
}
#define xor3b(a,b,c) (a ^ b ^ c)
__device__ __forceinline__ uint32_t bsg2_0(const uint32_t x)
{
return xor3b(ROTR32(x,2),ROTR32(x,13),ROTR32(x,22));
}
__device__ __forceinline__ uint32_t bsg2_1(const uint32_t x)
{
return xor3b(ROTR32(x,6),ROTR32(x,11),ROTR32(x,25));
}
__device__ __forceinline__ uint32_t ssg2_0(const uint32_t x)
{
return xor3b(ROTR32(x,7),ROTR32(x,18),(x>>3));
}
__device__ __forceinline__ uint32_t ssg2_1(const uint32_t x)
{
return xor3b(ROTR32(x,17),ROTR32(x,19),(x>>10));
}
__device__ __forceinline__ uint32_t andor32(const uint32_t a, const uint32_t b, const uint32_t c)
{
uint32_t result;
asm("{\n\t"
".reg .u32 m,n,o;\n\t"
"and.b32 m, %1, %2;\n\t"
" or.b32 n, %1, %2;\n\t"
"and.b32 o, n, %3;\n\t"
" or.b32 %0, m, o ;\n\t"
"}\n\t" : "=r"(result) : "r"(a), "r"(b), "r"(c)
);
return result;
}
__device__ __forceinline__ uint2 vectorizeswap(uint64_t v) {
uint2 result;
asm("mov.b64 {%0,%1},%2; \n\t"
: "=r"(result.y), "=r"(result.x) : "l"(v));
return result;
}
__device__
static void sha2_step1(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
uint32_t in, const uint32_t Kshared)
{
uint32_t t1,t2;
uint32_t vxandx = xandx(e, f, g);
uint32_t bsg21 = bsg2_1(e);
uint32_t bsg20 = bsg2_0(a);
uint32_t andorv = andor32(a,b,c);
t1 = h + bsg21 + vxandx + Kshared + in;
t2 = bsg20 + andorv;
d = d + t1;
h = t1 + t2;
}
__device__
static void sha2_step2(uint32_t a, uint32_t b, uint32_t c, uint32_t &d, uint32_t e, uint32_t f, uint32_t g, uint32_t &h,
uint32_t* in, uint32_t pc, const uint32_t Kshared)
{
uint32_t t1,t2;
int pcidx1 = (pc-2) & 0xF;
int pcidx2 = (pc-7) & 0xF;
int pcidx3 = (pc-15) & 0xF;
uint32_t inx0 = in[pc];
uint32_t inx1 = in[pcidx1];
uint32_t inx2 = in[pcidx2];
uint32_t inx3 = in[pcidx3];
uint32_t ssg21 = ssg2_1(inx1);
uint32_t ssg20 = ssg2_0(inx3);
uint32_t vxandx = xandx(e, f, g);
uint32_t bsg21 = bsg2_1(e);
uint32_t bsg20 = bsg2_0(a);
uint32_t andorv = andor32(a,b,c);
in[pc] = ssg21 + inx2 + ssg20 + inx0;
t1 = h + bsg21 + vxandx + Kshared + in[pc];
t2 = bsg20 + andorv;
d = d + t1;
h = t1 + t2;
}
__device__
static void sha256_round_body(uint32_t* in, uint32_t* state, uint32_t* const Kshared)
{
uint32_t a = state[0];
uint32_t b = state[1];
uint32_t c = state[2];
uint32_t d = state[3];
uint32_t e = state[4];
uint32_t f = state[5];
uint32_t g = state[6];
uint32_t h = state[7];
sha2_step1(a,b,c,d,e,f,g,h,in[ 0], Kshared[ 0]);
sha2_step1(h,a,b,c,d,e,f,g,in[ 1], Kshared[ 1]);
sha2_step1(g,h,a,b,c,d,e,f,in[ 2], Kshared[ 2]);
sha2_step1(f,g,h,a,b,c,d,e,in[ 3], Kshared[ 3]);
sha2_step1(e,f,g,h,a,b,c,d,in[ 4], Kshared[ 4]);
sha2_step1(d,e,f,g,h,a,b,c,in[ 5], Kshared[ 5]);
sha2_step1(c,d,e,f,g,h,a,b,in[ 6], Kshared[ 6]);
sha2_step1(b,c,d,e,f,g,h,a,in[ 7], Kshared[ 7]);
sha2_step1(a,b,c,d,e,f,g,h,in[ 8], Kshared[ 8]);
sha2_step1(h,a,b,c,d,e,f,g,in[ 9], Kshared[ 9]);
sha2_step1(g,h,a,b,c,d,e,f,in[10], Kshared[10]);
sha2_step1(f,g,h,a,b,c,d,e,in[11], Kshared[11]);
sha2_step1(e,f,g,h,a,b,c,d,in[12], Kshared[12]);
sha2_step1(d,e,f,g,h,a,b,c,in[13], Kshared[13]);
sha2_step1(c,d,e,f,g,h,a,b,in[14], Kshared[14]);
sha2_step1(b,c,d,e,f,g,h,a,in[15], Kshared[15]);
#pragma unroll
for (int i=0; i<3; i++)
{
sha2_step2(a,b,c,d,e,f,g,h,in,0, Kshared[16+16*i]);
sha2_step2(h,a,b,c,d,e,f,g,in,1, Kshared[17+16*i]);
sha2_step2(g,h,a,b,c,d,e,f,in,2, Kshared[18+16*i]);
sha2_step2(f,g,h,a,b,c,d,e,in,3, Kshared[19+16*i]);
sha2_step2(e,f,g,h,a,b,c,d,in,4, Kshared[20+16*i]);
sha2_step2(d,e,f,g,h,a,b,c,in,5, Kshared[21+16*i]);
sha2_step2(c,d,e,f,g,h,a,b,in,6, Kshared[22+16*i]);
sha2_step2(b,c,d,e,f,g,h,a,in,7, Kshared[23+16*i]);
sha2_step2(a,b,c,d,e,f,g,h,in,8, Kshared[24+16*i]);
sha2_step2(h,a,b,c,d,e,f,g,in,9, Kshared[25+16*i]);
sha2_step2(g,h,a,b,c,d,e,f,in,10,Kshared[26+16*i]);
sha2_step2(f,g,h,a,b,c,d,e,in,11,Kshared[27+16*i]);
sha2_step2(e,f,g,h,a,b,c,d,in,12,Kshared[28+16*i]);
sha2_step2(d,e,f,g,h,a,b,c,in,13,Kshared[29+16*i]);
sha2_step2(c,d,e,f,g,h,a,b,in,14,Kshared[30+16*i]);
sha2_step2(b,c,d,e,f,g,h,a,in,15,Kshared[31+16*i]);
}
state[0] += a;
state[1] += b;
state[2] += c;
state[3] += d;
state[4] += e;
state[5] += f;
state[6] += g;
state[7] += h;
}
__device__
static void sha256_round_last(uint32_t* in, uint32_t* state, uint32_t* const Kshared)
{
uint32_t a = state[0];
uint32_t b = state[1];
uint32_t c = state[2];
uint32_t d = state[3];
uint32_t e = state[4];
uint32_t f = state[5];
uint32_t g = state[6];
uint32_t h = state[7];
sha2_step1(a,b,c,d, e,f,g,h, in[ 0], Kshared[ 0]);
sha2_step1(h,a,b,c, d,e,f,g, in[ 1], Kshared[ 1]);
sha2_step1(g,h,a,b, c,d,e,f, in[ 2], Kshared[ 2]);
sha2_step1(f,g,h,a, b,c,d,e, in[ 3], Kshared[ 3]);
sha2_step1(e,f,g,h, a,b,c,d, in[ 4], Kshared[ 4]);
sha2_step1(d,e,f,g, h,a,b,c, in[ 5], Kshared[ 5]);
sha2_step1(c,d,e,f, g,h,a,b, in[ 6], Kshared[ 6]);
sha2_step1(b,c,d,e, f,g,h,a, in[ 7], Kshared[ 7]);
sha2_step1(a,b,c,d, e,f,g,h, in[ 8], Kshared[ 8]);
sha2_step1(h,a,b,c, d,e,f,g, in[ 9], Kshared[ 9]);
sha2_step1(g,h,a,b, c,d,e,f, in[10], Kshared[10]);
sha2_step1(f,g,h,a, b,c,d,e, in[11], Kshared[11]);
sha2_step1(e,f,g,h, a,b,c,d, in[12], Kshared[12]);
sha2_step1(d,e,f,g, h,a,b,c, in[13], Kshared[13]);
sha2_step1(c,d,e,f, g,h,a,b, in[14], Kshared[14]);
sha2_step1(b,c,d,e, f,g,h,a, in[15], Kshared[15]);
#pragma unroll
for (int i=0; i<2; i++)
{
sha2_step2(a,b,c,d, e,f,g,h, in, 0, Kshared[16+16*i]);
sha2_step2(h,a,b,c, d,e,f,g, in, 1, Kshared[17+16*i]);
sha2_step2(g,h,a,b, c,d,e,f, in, 2, Kshared[18+16*i]);
sha2_step2(f,g,h,a, b,c,d,e, in, 3, Kshared[19+16*i]);
sha2_step2(e,f,g,h, a,b,c,d, in, 4, Kshared[20+16*i]);
sha2_step2(d,e,f,g, h,a,b,c, in, 5, Kshared[21+16*i]);
sha2_step2(c,d,e,f, g,h,a,b, in, 6, Kshared[22+16*i]);
sha2_step2(b,c,d,e, f,g,h,a, in, 7, Kshared[23+16*i]);
sha2_step2(a,b,c,d, e,f,g,h, in, 8, Kshared[24+16*i]);
sha2_step2(h,a,b,c, d,e,f,g, in, 9, Kshared[25+16*i]);
sha2_step2(g,h,a,b, c,d,e,f, in,10, Kshared[26+16*i]);
sha2_step2(f,g,h,a, b,c,d,e, in,11, Kshared[27+16*i]);
sha2_step2(e,f,g,h, a,b,c,d, in,12, Kshared[28+16*i]);
sha2_step2(d,e,f,g, h,a,b,c, in,13, Kshared[29+16*i]);
sha2_step2(c,d,e,f, g,h,a,b, in,14, Kshared[30+16*i]);
sha2_step2(b,c,d,e, f,g,h,a, in,15, Kshared[31+16*i]);
}
sha2_step2(a,b,c,d, e,f,g,h, in, 0, Kshared[16+16*2]);
sha2_step2(h,a,b,c, d,e,f,g, in, 1, Kshared[17+16*2]);
sha2_step2(g,h,a,b, c,d,e,f, in, 2, Kshared[18+16*2]);
sha2_step2(f,g,h,a, b,c,d,e, in, 3, Kshared[19+16*2]);
sha2_step2(e,f,g,h, a,b,c,d, in, 4, Kshared[20+16*2]);
sha2_step2(d,e,f,g, h,a,b,c, in, 5, Kshared[21+16*2]);
sha2_step2(c,d,e,f, g,h,a,b, in, 6, Kshared[22+16*2]);
sha2_step2(b,c,d,e, f,g,h,a, in, 7, Kshared[23+16*2]);
sha2_step2(a,b,c,d, e,f,g,h, in, 8, Kshared[24+16*2]);
sha2_step2(h,a,b,c, d,e,f,g, in, 9, Kshared[25+16*2]);
sha2_step2(g,h,a,b, c,d,e,f, in,10, Kshared[26+16*2]);
sha2_step2(f,g,h,a, b,c,d,e, in,11, Kshared[27+16*2]);
sha2_step2(e,f,g,h, a,b,c,d, in,12, Kshared[28+16*2]);
sha2_step2(d,e,f,g, h,a,b,c, in,13, Kshared[29+16*2]);
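// only state[6] and state[7] are finalized: the nonce check uses just the top 64 bits of the digest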
state[6] += g;
state[7] += h;
}
__device__ __forceinline__
uint64_t cuda_swab32ll(uint64_t x) {
return MAKE_ULONGLONG(cuda_swab32(_LODWORD(x)), cuda_swab32(_HIDWORD(x)));
}
__global__
/*__launch_bounds__(256,3)*/
void sha256q_gpu_hash_shared(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonces)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
__shared__ uint32_t s_K[64*4];
//s_K[thread & 63] = c_K[thread & 63];
if (threadIdx.x < 64U) s_K[threadIdx.x] = c_K[threadIdx.x];
if (thread < threads)
{
const uint32_t nonce = startNonce + thread;
uint32_t dat[16];
AS_UINT2(dat) = AS_UINT2(c_dataEnd80);
dat[ 2] = c_dataEnd80[2];
dat[ 3] = nonce;
dat[ 4] = 0x80000000;
dat[15] = 0x280;
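// SHA-256 padding for the 80-byte header: 0x80000000 terminator and a 0x280 (640-bit) length; the chained 32-byte hashes below use 0x100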
#pragma unroll
for (int i=5; i<15; i++) dat[i] = 0;
uint32_t buf[8];
#pragma unroll
for (int i=0; i<8; i+=2) AS_UINT2(&buf[i]) = AS_UINT2(&c_midstate76[i]);
//for (int i=0; i<8; i++) buf[i] = c_midstate76[i];
sha256_round_body(dat, buf, s_K);
// second sha256
#pragma unroll
for (int i=0; i<8; i++) dat[i] = buf[i];
dat[8] = 0x80000000;
#pragma unroll
for (int i=9; i<15; i++) dat[i] = 0;
dat[15] = 0x100;
#pragma unroll
for (int i=0; i<8; i++) buf[i] = c_H256[i];
sha256_round_body(dat, buf, s_K);
// third sha256
#pragma unroll
for (int i=0; i<8; i++) dat[i] = buf[i];
dat[8] = 0x80000000;
#pragma unroll
for (int i=9; i<15; i++) dat[i] = 0;
dat[15] = 0x100;
#pragma unroll
for (int i=0; i<8; i++) buf[i] = c_H256[i];
sha256_round_body(dat, buf, s_K);
// last sha256
#pragma unroll
for (int i=0; i<8; i++) dat[i] = buf[i];
dat[8] = 0x80000000;
#pragma unroll
for (int i=9; i<15; i++) dat[i] = 0;
dat[15] = 0x100;
#pragma unroll
for (int i=0; i<8; i++) buf[i] = c_H256[i];
sha256_round_last(dat, buf, s_K);
// valid nonces
uint64_t high = cuda_swab32ll(((uint64_t*)buf)[3]);
if (high <= c_target[0]) {
//printf("%08x %08x - %016llx %016llx - %08x %08x\n", buf[7], buf[6], high, d_target[0], c_target[1], c_target[0]);
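// keep two candidates per launch: the previous nonce is pushed into resNonces[1] while the new one takes slot 0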
resNonces[1] = atomicExch(resNonces, nonce);
//d_target[0] = high;
}
}
}
__host__
void sha256q_init(int thr_id)
{
cuda_get_arch(thr_id);
cudaMemcpyToSymbol(c_K, cpu_K, sizeof(cpu_K), 0, cudaMemcpyHostToDevice);
CUDA_SAFE_CALL(cudaMalloc(&d_resNonces[thr_id], 2*sizeof(uint32_t)));
}
__host__
void sha256q_free(int thr_id)
{
if (d_resNonces[thr_id]) cudaFree(d_resNonces[thr_id]);
d_resNonces[thr_id] = NULL;
}
__host__
void sha256q_setBlock_80(uint32_t *pdata, uint32_t *ptarget)
{
uint32_t _ALIGN(64) in[16], buf[8], end[4];
for (int i=0;i<16;i++) in[i] = cuda_swab32(pdata[i]);
for (int i=0;i<8;i++) buf[i] = cpu_H256[i];
for (int i=0;i<4;i++) end[i] = cuda_swab32(pdata[16+i]);
sha256_round_body_host(in, buf, cpu_K);
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_midstate76, buf, 32, 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_dataEnd80, end, sizeof(end), 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(c_target, &ptarget[6], 8, 0, cudaMemcpyHostToDevice));
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_target, &ptarget[6], 8, 0, cudaMemcpyHostToDevice));
}
__host__
void sha256q_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces)
{
const uint32_t threadsperblock = 128;
dim3 grid(threads/threadsperblock);
dim3 block(threadsperblock);
CUDA_SAFE_CALL(cudaMemset(d_resNonces[thr_id], 0xFF, 2 * sizeof(uint32_t)));
cudaThreadSynchronize();
sha256q_gpu_hash_shared <<<grid, block>>> (threads, startNonce, d_resNonces[thr_id]);
cudaThreadSynchronize();
CUDA_SAFE_CALL(cudaMemcpy(resNonces, d_resNonces[thr_id], 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost));
if (resNonces[0] == resNonces[1]) {
resNonces[1] = UINT32_MAX;
}
}

136
sha256/sha256q.cu

@@ -0,0 +1,136 @@
/**
* SHA256 4x
* by pyritepirate - 2018
* by tpruvot@github - 2017
*/
#include <miner.h>
#include <cuda_helper.h>
#include <openssl/sha.h>
// CPU Check
extern "C" void sha256q_hash(void *output, const void *input)
{
unsigned char _ALIGN(64) hash[64];
SHA256_CTX sha256;
SHA256_Init(&sha256);
SHA256_Update(&sha256, (unsigned char *)input, 80);
SHA256_Final(hash, &sha256);
SHA256_Init(&sha256);
SHA256_Update(&sha256, hash, 32);
SHA256_Final(hash, &sha256);
SHA256_Init(&sha256);
SHA256_Update(&sha256, hash, 32);
SHA256_Final(hash, &sha256);
SHA256_Init(&sha256);
SHA256_Update(&sha256, hash, 32);
SHA256_Final((unsigned char *)output, &sha256);
}
static bool init[MAX_GPUS] = { 0 };
extern void sha256q_init(int thr_id);
extern void sha256q_free(int thr_id);
extern void sha256q_setBlock_80(uint32_t *pdata, uint32_t *ptarget);
extern void sha256q_hash_80(int thr_id, uint32_t threads, uint32_t startNonce, uint32_t *resNonces);
extern "C" int scanhash_sha256q(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t _ALIGN(64) endiandata[20];
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
uint32_t throughput = cuda_default_throughput(thr_id, 1U << 23);
if (init[thr_id]) throughput = min(throughput, (max_nonce - first_nonce));
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x03;
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
CUDA_LOG_ERROR();
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
sha256q_init(thr_id);
init[thr_id] = true;
}
for (int k=0; k < 19; k++)
be32enc(&endiandata[k], pdata[k]);
sha256q_setBlock_80(endiandata, ptarget);
do {
// Hash with CUDA
*hashes_done = pdata[19] - first_nonce + throughput;
sha256q_hash_80(thr_id, throughput, pdata[19], work->nonces);
if (work->nonces[0] != UINT32_MAX)
{
uint32_t _ALIGN(64) vhash[8];
endiandata[19] = swab32(work->nonces[0]);
sha256q_hash(vhash, endiandata);
if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
work->valid_nonces = 1;
work_set_target_ratio(work, vhash);
if (work->nonces[1] != UINT32_MAX) {
endiandata[19] = swab32(work->nonces[1]);
sha256q_hash(vhash, endiandata);
if (vhash[7] <= ptarget[7] && fulltest(vhash, ptarget)) {
work->valid_nonces++;
bn_set_target_ratio(work, vhash, 1);
}
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
} else {
pdata[19] = work->nonces[0] + 1;
}
return work->valid_nonces;
}
else if (vhash[7] > ptarget[7]) {
gpu_increment_reject(thr_id);
if (!opt_quiet)
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
pdata[19] = work->nonces[0] + 1;
continue;
}
}
if ((uint64_t) throughput + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}
pdata[19] += throughput;
} while (!work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce;
return 0;
}
// cleanup
extern "C" void free_sha256q(int thr_id)
{
if (!init[thr_id])
return;
cudaThreadSynchronize();
sha256q_free(thr_id);
init[thr_id] = false;
cudaDeviceSynchronize();
}

8
sia/sia-rpc.cpp

@@ -74,10 +74,10 @@ char* sia_getheader(CURL *curl, struct pool_infos *pool)
struct data_buffer all_data = { 0 };
struct curl_slist *headers = NULL;
char data[256] = { 0 };
char url[512];
char url[512*3];
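// enlarged buffer: the formatted request embeds pool url, address and worker name; snprintf is now bounded by sizeof(url)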
// nanopool
snprintf(url, 512, "%s/miner/header?address=%s&worker=%s", //&longpoll
snprintf(url, sizeof(url), "%s/miner/header?address=%s&worker=%s", //&longpoll
pool->url, pool->user, pool->pass);
if (opt_protocol)
@@ -148,7 +148,7 @@ bool sia_submit(CURL *curl, struct pool_infos *pool, struct work *work)
struct data_buffer all_data = { 0 };
struct curl_slist *headers = NULL;
char buf[256] = { 0 };
char url[512];
char url[512*3];
if (opt_protocol)
applog_hex(work->data, 80);
@@ -156,7 +156,7 @@ bool sia_submit(CURL *curl, struct pool_infos *pool, struct work *work)
//applog_hex(&work->data[10], 4);
// nanopool
snprintf(url, 512, "%s/miner/header?address=%s&worker=%s",
snprintf(url, sizeof(url), "%s/miner/header?address=%s&worker=%s",
pool->url, pool->user, pool->pass);
if (opt_protocol)
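The url buffers in both requests were widened (512 -> 1536 bytes) and the snprintf bound now follows the buffer via sizeof(url), so long pool URL/address/worker strings are no longer silently cut at a hard-coded 512. A small C sketch of the pattern (illustrative; pool_url, user and worker are placeholder variables):

char url[512*3];
int n = snprintf(url, sizeof(url), "%s/miner/header?address=%s&worker=%s",
                 pool_url, user, worker);
if (n < 0 || (size_t) n >= sizeof(url)) {
	// the request would have been truncated; better to log/skip than send a broken URL
}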

18
sia/sia.cu

@@ -40,7 +40,7 @@ static __constant__ const int8_t blake2b_sigma[12][16] = {
// host mem align
#define A 64
extern "C" void blake2b_hash(void *output, const void *input)
extern "C" void sia_blake2b_hash(void *output, const void *input)
{
uint8_t _ALIGN(A) hash[32];
blake2b_ctx ctx;
@@ -102,7 +102,7 @@ static void H(const int r, const int i, uint64_t &a, uint64_t &b, uint64_t &c, u
__global__
//__launch_bounds__(128, 8) /* to force 64 regs */
void blake2b_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint2 target2)
void sia_blake2b_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_t *resNonce, const uint2 target2)
{
const uint32_t nonce = (blockDim.x * blockIdx.x + threadIdx.x) + startNonce;
__shared__ uint64_t s_target;
@@ -154,7 +154,7 @@ void blake2b_gpu_hash(const uint32_t threads, const uint32_t startNonce, uint32_
}
__host__
uint32_t blake2b_hash_cuda(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint2 target2, uint32_t &secNonce)
uint32_t sia_blake2b_hash_cuda(const int thr_id, const uint32_t threads, const uint32_t startNonce, const uint2 target2, uint32_t &secNonce)
{
uint32_t resNonces[NBN] = { UINT32_MAX, UINT32_MAX };
uint32_t result = UINT32_MAX;
@@ -166,7 +166,7 @@ uint32_t blake2b_hash_cuda(const int thr_id, const uint32_t threads, const uint3
if (cudaMemset(d_resNonces[thr_id], 0xff, NBN*sizeof(uint32_t)) != cudaSuccess)
return result;
blake2b_gpu_hash <<<grid, block, 8>>> (threads, startNonce, d_resNonces[thr_id], target2);
sia_blake2b_gpu_hash <<<grid, block, 8>>> (threads, startNonce, d_resNonces[thr_id], target2);
cudaThreadSynchronize();
if (cudaSuccess == cudaMemcpy(resNonces, d_resNonces[thr_id], NBN*sizeof(uint32_t), cudaMemcpyDeviceToHost)) {
@@ -178,7 +178,7 @@ uint32_t blake2b_hash_cuda(const int thr_id, const uint32_t threads, const uint3
}
__host__
void blake2b_setBlock(uint32_t *data)
void sia_blake2b_setBlock(uint32_t *data)
{
CUDA_SAFE_CALL(cudaMemcpyToSymbol(d_data, data, 80, 0, cudaMemcpyHostToDevice));
}
@@ -224,10 +224,10 @@ int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsigned lon
const uint2 target = make_uint2(ptarget[6], ptarget[7]);
blake2b_setBlock(inputdata);
sia_blake2b_setBlock(inputdata);
do {
work->nonces[0] = blake2b_hash_cuda(thr_id, throughput, pdata[8], target, work->nonces[1]);
work->nonces[0] = sia_blake2b_hash_cuda(thr_id, throughput, pdata[8], target, work->nonces[1]);
*hashes_done = pdata[8] - first_nonce + throughput;
@@ -235,7 +235,7 @@ int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsigned lon
{
work->valid_nonces = 0;
inputdata[8] = work->nonces[0];
blake2b_hash(hash, inputdata);
sia_blake2b_hash(hash, inputdata);
if (swab32(hash[0]) <= Htarg) {
// sia hash target is reversed (start of hash)
swab256(vhashcpu, hash);
@@ -250,7 +250,7 @@ int scanhash_sia(int thr_id, struct work *work, uint32_t max_nonce, unsigned lon
if (work->nonces[1] != UINT32_MAX) {
inputdata[8] = work->nonces[1];
blake2b_hash(hash, inputdata);
sia_blake2b_hash(hash, inputdata);
if (swab32(hash[0]) <= Htarg) {
swab256(vhashcpu, hash);
if (fulltest(vhashcpu, ptarget)) {
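The blake2b_* symbols get a sia_ prefix here, which keeps them from clashing with the plain blake2b_hash now exported by the new standalone blake2b algo (blake2b.cu). Also worth noting: sia compares the target against the hash read from its most significant end, which is why the scan first checks swab32(hash[0]) <= Htarg and only then byte-reverses the whole digest for fulltest(). A tiny sketch of that reversal (illustrative; the miner's own swab256() plays this role):

#include <stdint.h>

static void reverse32(uint8_t dst[32], const uint8_t src[32])
{
	for (int i = 0; i < 32; i++)
		dst[i] = src[31 - i];   // flip the 32-byte digest end-to-end before fulltest()
}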

51
util.cpp

@@ -616,7 +616,7 @@ err_out:
json_t *json_rpc_call_pool(CURL *curl, struct pool_infos *pool, const char *req,
bool longpoll_scan, bool longpoll, int *curl_err)
{
char userpass[512];
char userpass[768];
// todo, malloc and store that in pool array
snprintf(userpass, sizeof(userpass), "%s%c%s", pool->user,
strlen(pool->pass)?':':'\0', pool->pass);
@@ -627,7 +627,7 @@ json_t *json_rpc_call_pool(CURL *curl, struct pool_infos *pool, const char *req,
/* called only from longpoll thread, we have the lp_url */
json_t *json_rpc_longpoll(CURL *curl, char *lp_url, struct pool_infos *pool, const char *req, int *curl_err)
{
char userpass[512];
char userpass[768];
snprintf(userpass, sizeof(userpass), "%s%c%s", pool->user,
strlen(pool->pass)?':':'\0', pool->pass);
@@ -1442,7 +1442,7 @@ static uint32_t getblocheight(struct stratum_ctx *sctx)
static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
{
const char *job_id, *prevhash, *coinb1, *coinb2, *version, *nbits, *stime;
const char *claim = NULL, *nreward = NULL;
const char *extradata = NULL, *nreward = NULL;
size_t coinb1_size, coinb2_size;
bool clean, ret = false;
int merkle_count, i, p=0;
@@ -1452,7 +1452,8 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
int ntime;
char algo[64] = { 0 };
get_currentalgo(algo, sizeof(algo));
bool has_claim = !strcasecmp(algo, "lbry");
bool has_claim = !strcmp(algo, "lbry");
bool has_roots = !strcmp(algo, "phi2") && json_array_size(params) == 10;
if (sctx->is_equihash) {
return equi_stratum_notify(sctx, params);
@@ -1461,11 +1462,17 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
job_id = json_string_value(json_array_get(params, p++));
prevhash = json_string_value(json_array_get(params, p++));
if (has_claim) {
claim = json_string_value(json_array_get(params, p++));
if (!claim || strlen(claim) != 64) {
extradata = json_string_value(json_array_get(params, p++));
if (!extradata || strlen(extradata) != 64) {
applog(LOG_ERR, "Stratum notify: invalid claim parameter");
goto out;
}
} else if (has_roots) {
extradata = json_string_value(json_array_get(params, p++));
if (!extradata || strlen(extradata) != 128) {
applog(LOG_ERR, "Stratum notify: invalid UTXO root parameter");
goto out;
}
}
coinb1 = json_string_value(json_array_get(params, p++));
coinb2 = json_string_value(json_array_get(params, p++));
@@ -1529,7 +1536,8 @@ static bool stratum_notify(struct stratum_ctx *sctx, json_t *params)
free(sctx->job.job_id);
sctx->job.job_id = strdup(job_id);
hex2bin(sctx->job.prevhash, prevhash, 32);
if (has_claim) hex2bin(sctx->job.claim, claim, 32);
if (has_claim) hex2bin(sctx->job.extra, extradata, 32);
if (has_roots) hex2bin(sctx->job.extra, extradata, 64);
sctx->job.height = getblocheight(sctx);
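Decoding arithmetic behind the two hex2bin() calls above (each pair of hex digits decodes to one byte):
  lbry claim:       64 hex chars -> 32 bytes -> hex2bin(sctx->job.extra, extradata, 32)
  phi2 UTXO roots: 128 hex chars -> 64 bytes -> hex2bin(sctx->job.extra, extradata, 64)
which matches the strlen(extradata) != 64 and != 128 checks earlier in stratum_notify.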
@@ -2164,6 +2172,9 @@ void print_hash_tests(void)
printf(CL_WHT "CPU HASH ON EMPTY BUFFER RESULTS:" CL_N "\n");
allium_hash(&hash[0], &buf[0]);
printpfx("allium", hash);
bastionhash(&hash[0], &buf[0]);
printpfx("bastion", hash);
@@ -2173,6 +2184,9 @@ void print_hash_tests(void)
blake256hash(&hash[0], &buf[0], 14);
printpfx("blake", hash);
blake2b_hash(&hash[0], &buf[0]);
printpfx("blake2b", hash);
blake2s_hash(&hash[0], &buf[0]);
printpfx("blake2s", hash);
@@ -2182,10 +2196,10 @@ void print_hash_tests(void)
c11hash(&hash[0], &buf[0]);
printpfx("c11", hash);
cryptolight_hash(&hash[0], &buf[0], 76);
cryptolight_hash(&hash[0], &buf[0]);
printpfx("cryptolight", hash);
cryptonight_hash(&hash[0], &buf[0], 76);
cryptonight_hash(&hash[0], &buf[0]);
printpfx("cryptonight", hash);
memset(buf, 0, 180);
@@ -2232,9 +2246,15 @@ void print_hash_tests(void)
lyra2v2_hash(&hash[0], &buf[0]);
printpfx("lyra2v2", hash);
lyra2v3_hash(&hash[0], &buf[0]);
printpfx("lyra2v3", hash);
lyra2Z_hash(&hash[0], &buf[0]);
printpfx("lyra2z", hash);
monero_hash(&hash[0], &buf[0]);
printpfx("monero", hash);
myriadhash(&hash[0], &buf[0]);
printpfx("myriad", hash);
@@ -2247,7 +2267,7 @@ void print_hash_tests(void)
pentablakehash(&hash[0], &buf[0]);
printpfx("pentablake", hash);
phihash(&hash[0], &buf[0]);
phi2_hash(&hash[0], &buf[0]);
printpfx("phi", hash);
polytimos_hash(&hash[0], &buf[0]);
@@ -2271,7 +2291,10 @@ void print_hash_tests(void)
sha256t_hash(&hash[0], &buf[0]);
printpfx("sha256t", hash);
blake2b_hash(&hash[0], &buf[0]);
sha256q_hash(&hash[0], &buf[0]);
printpfx("sha256q", hash);
sia_blake2b_hash(&hash[0], &buf[0]);
printpfx("sia", hash);
sibhash(&hash[0], &buf[0]);
@@ -2289,6 +2312,9 @@ void print_hash_tests(void)
skunk_hash(&hash[0], &buf[0]);
printpfx("skunk", hash);
stellite_hash(&hash[0], &buf[0]);
printpfx("stelitte", hash);
s3hash(&hash[0], &buf[0]);
printpfx("S3", hash);
@@ -2297,6 +2323,9 @@ void print_hash_tests(void)
bitcore_hash(&hash[0], &buf[0]);
printpfx("bitcore", hash);
exosis_hash(&hash[0], &buf[0]);
printpfx("exosis", hash);
blake256hash(&hash[0], &buf[0], 8);
printpfx("vanilla", hash);

21
x11/cuda_streebog_maxwell.cu

@@ -207,7 +207,7 @@ __launch_bounds__(TPB, 3)
#else
__launch_bounds__(TPB, 3)
#endif
void streebog_gpu_hash_64_maxwell(uint64_t *g_hash)
void streebog_gpu_hash_64_sm5(uint64_t *g_hash, uint32_t* const d_filter, const uint32_t filter_val)
{
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
uint2 buf[8], t[8], temp[8], K0[8], hash[8];
@@ -222,13 +222,16 @@ void streebog_gpu_hash_64_maxwell(uint64_t *g_hash)
shared[6][threadIdx.x] = __ldg(&T62[threadIdx.x]);
shared[7][threadIdx.x] = __ldg(&T72[threadIdx.x]);
//__threadfence_block();
__syncthreads();
if (d_filter && d_filter[thread] != filter_val) return;
uint64_t* inout = &g_hash[thread<<3];
*(uint2x4*)&hash[0] = __ldg4((uint2x4*)&inout[0]);
*(uint2x4*)&hash[4] = __ldg4((uint2x4*)&inout[4]);
__threadfence_block();
K0[0] = vectorize(0x74a5d4ce2efc83b3);
#pragma unroll 8
@@ -301,9 +304,17 @@ void streebog_gpu_hash_64_maxwell(uint64_t *g_hash)
}
__host__
void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *d_hash)
void streebog_hash_64_maxwell(int thr_id, uint32_t threads, uint32_t *g_hash)
{
dim3 grid((threads + TPB-1) / TPB);
dim3 block(TPB);
streebog_gpu_hash_64_sm5 <<<grid, block>>> ((uint64_t*)g_hash, NULL, 0);
}
__host__
void phi_streebog_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t *g_hash, uint32_t *d_filter)
{
dim3 grid((threads + TPB-1) / TPB);
dim3 block(TPB);
streebog_gpu_hash_64_maxwell <<<grid, block>>> ((uint64_t*)d_hash);
streebog_gpu_hash_64_sm5 <<<grid, block>>> ((uint64_t*)g_hash, d_filter, 1);
}
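The extra d_filter/filter_val parameters let phi2 run its two hash branches over a single buffer: lanes whose filter entry does not match the requested branch return early, and passing NULL (as streebog_hash_64_maxwell does) keeps the old unfiltered behaviour. Note that in the kernel above the early return sits after __syncthreads(), so every thread still helps fill the shared lookup tables. A minimal sketch of the idiom (illustrative only, generic names):

__global__ void branch_filtered_kernel(uint64_t *g_hash, const uint32_t *d_filter, const uint32_t filter_val)
{
	const uint32_t thread = blockDim.x * blockIdx.x + threadIdx.x;
	if (d_filter && d_filter[thread] != filter_val)
		return;                                   // this lane belongs to the other branch
	uint64_t *inout = &g_hash[(size_t) thread << 3];
	// ... hash the 64-byte state at inout[0..7] as usual ...
}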

54
x11/cuda_x11_cubehash512.cu

@@ -259,16 +259,32 @@ void x11_cubehash512_cpu_init(int thr_id, uint32_t threads) { }
/***************************************************/
#define WANT_CUBEHASH80
#ifdef WANT_CUBEHASH80
/**
* Timetravel and x16 CUBEHASH-80 CUDA implementation
* by tpruvot@github - Jan 2017 / May 2018
*/
__constant__
static uint32_t c_PaddedMessage80[20];
__constant__ static uint32_t c_midstate128[32];
__constant__ static uint32_t c_PaddedMessage80[20];
#undef SPH_C32
#undef SPH_C64
#undef SPH_T32
#undef SPH_T64
#include "sph/sph_cubehash.h"
__host__
void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata)
{
sph_cubehash512_context ctx_cubehash;
sph_cubehash512_init(&ctx_cubehash);
sph_cubehash512(&ctx_cubehash, (void*)endiandata, 64);
#ifndef NO_MIDSTATE
cudaMemcpyToSymbol(c_midstate128, ctx_cubehash.state, 128, 0, cudaMemcpyHostToDevice);
cudaMemcpyToSymbol(c_PaddedMessage80, &endiandata[16], 16, 0, cudaMemcpyHostToDevice);
#else
cudaMemcpyToSymbol(c_PaddedMessage80, endiandata, sizeof(c_PaddedMessage80), 0, cudaMemcpyHostToDevice);
#endif
}
__global__
@@ -278,11 +294,11 @@ void cubehash512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce,
if (thread < threads)
{
const uint32_t nonce = startNounce + thread;
uint32_t message[8];
uint32_t x[2][2][2][2][2];
#ifdef NO_MIDSTATE
Init(x);
uint32_t message[8];
// first 32 bytes
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[0]);
AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage80[4]);
@@ -293,8 +309,31 @@ void cubehash512_gpu_hash_80(const uint32_t threads, const uint32_t startNounce,
AS_UINT4(&message[4]) = AS_UINT4(&c_PaddedMessage80[12]);
Update32(x, message);
// last 16 bytes + Padding
// last 16 bytes
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[16]);
#else
AS_UINT2(x[0][0][0][0]) = AS_UINT2(&c_midstate128[ 0]);
AS_UINT2(x[0][0][0][1]) = AS_UINT2(&c_midstate128[ 2]);
AS_UINT2(x[0][0][1][0]) = AS_UINT2(&c_midstate128[ 4]);
AS_UINT2(x[0][0][1][1]) = AS_UINT2(&c_midstate128[ 6]);
AS_UINT2(x[0][1][0][0]) = AS_UINT2(&c_midstate128[ 8]);
AS_UINT2(x[0][1][0][1]) = AS_UINT2(&c_midstate128[10]);
AS_UINT2(x[0][1][1][0]) = AS_UINT2(&c_midstate128[12]);
AS_UINT2(x[0][1][1][1]) = AS_UINT2(&c_midstate128[14]);
AS_UINT2(x[1][0][0][0]) = AS_UINT2(&c_midstate128[16]);
AS_UINT2(x[1][0][0][1]) = AS_UINT2(&c_midstate128[18]);
AS_UINT2(x[1][0][1][0]) = AS_UINT2(&c_midstate128[20]);
AS_UINT2(x[1][0][1][1]) = AS_UINT2(&c_midstate128[22]);
AS_UINT2(x[1][1][0][0]) = AS_UINT2(&c_midstate128[24]);
AS_UINT2(x[1][1][0][1]) = AS_UINT2(&c_midstate128[26]);
AS_UINT2(x[1][1][1][0]) = AS_UINT2(&c_midstate128[28]);
AS_UINT2(x[1][1][1][1]) = AS_UINT2(&c_midstate128[30]);
// last 16 bytes
AS_UINT4(&message[0]) = AS_UINT4(&c_PaddedMessage80[0]);
#endif
// nonce + Padding
message[3] = cuda_swab32(nonce);
message[4] = 0x80;
message[5] = 0;
@@ -317,4 +356,3 @@ void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const ui
cubehash512_gpu_hash_80 <<<grid, block>>> (threads, startNounce, (uint64_t*) d_hash);
}
#endif
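Worked breakdown of the midstate change above: the 80-byte header splits into 64 bytes (two full 32-byte CubeHash blocks), absorbed once on the host with sph_cubehash512 and copied as the 128-byte state (32 uint32 words) to c_midstate128, plus the remaining 16 header bytes, copied into the start of c_PaddedMessage80. Per nonce the GPU then only restores the midstate, injects those 16 bytes, overwrites word 3 with the swapped nonce and applies padding; building with NO_MIDSTATE keeps the old path with the full 80-byte message in constant memory.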

497
x11/exosis.cu

@@ -0,0 +1,497 @@
/**
* Timetravel (exosis) CUDA implementation
* by tpruvot@github, exosis
*/
#include <stdio.h>
#include <memory.h>
#include <unistd.h>
#define HASH_FUNC_BASE_TIMESTAMP 1538556426U
#define HASH_FUNC_COUNT 8
#define HASH_FUNC_COUNT_PERMUTATIONS 40320U
extern "C" {
#include "sph/sph_blake.h"
#include "sph/sph_bmw.h"
#include "sph/sph_groestl.h"
#include "sph/sph_skein.h"
#include "sph/sph_jh.h"
#include "sph/sph_keccak.h"
#include "sph/sph_luffa.h"
#include "sph/sph_cubehash.h"
}
#include "miner.h"
#include "cuda_helper.h"
#include "cuda_x11.h"
static uint32_t *d_hash[MAX_GPUS];
enum Algo {
BLAKE = 0,
BMW,
GROESTL,
SKEIN,
JH,
KECCAK,
LUFFA,
CUBEHASH,
MAX_ALGOS_COUNT
};
static const char* algo_strings[] = {
"blake",
"bmw512",
"groestl",
"skein",
"jh512",
"keccak",
"luffa",
"cube",
NULL
};
inline void swap8(uint8_t *a, uint8_t *b)
{
uint8_t t = *a;
*a = *b;
*b = t;
}
inline void initPerm(uint8_t n[], int count)
{
for (int i = 0; i < count; i++)
n[i] = i;
}
static int nextPerm(uint8_t n[], int count)
{
int tail, i, j;
if (count <= 1)
return 0;
for (i = count - 1; i>0 && n[i - 1] >= n[i]; i--);
tail = i;
if (tail > 0) {
for (j = count - 1; j>tail && n[j] <= n[tail - 1]; j--);
swap8(&n[tail - 1], &n[j]);
}
for (i = tail, j = count - 1; i<j; i++, j--)
swap8(&n[i], &n[j]);
return (tail != 0);
}
static void getAlgoString(char *str, int seq)
{
uint8_t algoList[HASH_FUNC_COUNT];
char *sptr;
initPerm(algoList, HASH_FUNC_COUNT);
for (int k = 0; k < seq; k++) {
nextPerm(algoList, HASH_FUNC_COUNT);
}
sptr = str;
for (int j = 0; j < HASH_FUNC_COUNT; j++) {
if (algoList[j] >= 10)
sprintf(sptr, "%c", 'A' + (algoList[j] - 10));
else
sprintf(sptr, "%u", (uint32_t) algoList[j]);
sptr++;
}
*sptr = '\0';
}
static __thread uint32_t s_ntime = 0;
static uint32_t s_sequence = UINT32_MAX;
static uint8_t s_firstalgo = 0xFF;
static char hashOrder[HASH_FUNC_COUNT + 1] = { 0 };
#define INITIAL_DATE HASH_FUNC_BASE_TIMESTAMP
static inline uint32_t getCurrentAlgoSeq(uint32_t ntime)
{
// unlike x11evo, the permutation changes often (with ntime)
return (uint32_t) (ntime - INITIAL_DATE) % HASH_FUNC_COUNT_PERMUTATIONS;
}
// To finish...
static void get_travel_order(uint32_t ntime, char *permstr)
{
uint32_t seq = getCurrentAlgoSeq(ntime);
if (s_sequence != seq) {
getAlgoString(permstr, seq);
s_sequence = seq;
}
}
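Usage note (illustrative, assuming a call from this translation unit): getAlgoString() walks the lexicographic permutations of the 8 algorithms, so sequence 0 is the identity order and the cycle length is 8! = 40320, which is exactly HASH_FUNC_COUNT_PERMUTATIONS:

	char order[HASH_FUNC_COUNT + 1];
	getAlgoString(order, 0);      // -> "01234567" (blake, bmw, groestl, skein, jh, keccak, luffa, cube)
	getAlgoString(order, 40319);  // last distinct ordering before (ntime - INITIAL_DATE) wraps the cycle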
// CPU Hash
extern "C" void exosis_hash(void *output, const void *input)
{
uint32_t _ALIGN(64) hash[64/4] = { 0 };
sph_blake512_context ctx_blake;
sph_bmw512_context ctx_bmw;
sph_groestl512_context ctx_groestl;
sph_skein512_context ctx_skein;
sph_jh512_context ctx_jh;
sph_keccak512_context ctx_keccak;
sph_luffa512_context ctx_luffa1;
sph_cubehash512_context ctx_cubehash1;
if (s_sequence == UINT32_MAX) {
uint32_t *data = (uint32_t*) input;
const uint32_t ntime = (opt_benchmark || !data[17]) ? (uint32_t) time(NULL) : data[17];
get_travel_order(ntime, hashOrder);
}
void *in = (void*) input;
int size = 80;
const int hashes = (int) strlen(hashOrder);
for (int i = 0; i < hashes; i++)
{
const char elem = hashOrder[i];
uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch (algo) {
case BLAKE:
sph_blake512_init(&ctx_blake);
sph_blake512(&ctx_blake, in, size);
sph_blake512_close(&ctx_blake, hash);
break;
case BMW:
sph_bmw512_init(&ctx_bmw);
sph_bmw512(&ctx_bmw, in, size);
sph_bmw512_close(&ctx_bmw, hash);
break;
case GROESTL:
sph_groestl512_init(&ctx_groestl);
sph_groestl512(&ctx_groestl, in, size);
sph_groestl512_close(&ctx_groestl, hash);
break;
case SKEIN:
sph_skein512_init(&ctx_skein);
sph_skein512(&ctx_skein, in, size);
sph_skein512_close(&ctx_skein, hash);
break;
case JH:
sph_jh512_init(&ctx_jh);
sph_jh512(&ctx_jh, in, size);
sph_jh512_close(&ctx_jh, hash);
break;
case KECCAK:
sph_keccak512_init(&ctx_keccak);
sph_keccak512(&ctx_keccak, in, size);
sph_keccak512_close(&ctx_keccak, hash);
break;
case LUFFA:
sph_luffa512_init(&ctx_luffa1);
sph_luffa512(&ctx_luffa1, in, size);
sph_luffa512_close(&ctx_luffa1, hash);
break;
case CUBEHASH:
sph_cubehash512_init(&ctx_cubehash1);
sph_cubehash512(&ctx_cubehash1, in, size);
sph_cubehash512_close(&ctx_cubehash1, hash);
break;
}
in = (void*) hash;
size = 64;
}
memcpy(output, hash, 32);
}
static uint32_t get_next_time(uint32_t ntime, char* curOrder)
{
char nextOrder[HASH_FUNC_COUNT + 1] = { 0 };
uint32_t secs = 15;
do {
uint32_t nseq = getCurrentAlgoSeq(ntime+secs);
getAlgoString(nextOrder, nseq);
secs += 15;
} while (curOrder[0] == nextOrder[0]);
return secs;
}
//#define _DEBUG
#define _DEBUG_PREFIX "tt-"
#include "cuda_debug.cuh"
void quark_bmw512_cpu_setBlock_80(void *pdata);
void quark_bmw512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
void groestl512_setBlock_80(int thr_id, uint32_t *endiandata);
void groestl512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash);
void skein512_cpu_setBlock_80(void *pdata);
void skein512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int swap);
void qubit_luffa512_cpu_init(int thr_id, uint32_t threads);
void qubit_luffa512_cpu_setBlock_80(void *pdata);
void qubit_luffa512_cpu_hash_80(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, int order);
void jh512_setBlock_80(int thr_id, uint32_t *endiandata);
void jh512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash);
void keccak512_setBlock_80(int thr_id, uint32_t *endiandata);
void keccak512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash);
void cubehash512_setBlock_80(int thr_id, uint32_t* endiandata);
void cubehash512_cuda_hash_80(const int thr_id, const uint32_t threads, const uint32_t startNounce, uint32_t *d_hash);
void quark_blake512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_outputHash, int order);
static bool init[MAX_GPUS] = { 0 };
extern "C" int scanhash_exosis(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
int intensity = (device_sm[device_map[thr_id]] >= 500 && !is_windows()) ? 20 : 19;
uint32_t throughput = cuda_default_throughput(thr_id, 1U << intensity); // 19=256*256*8;
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
// if (opt_benchmark) pdata[17] = swab32(0x5886a4be); // TO DEBUG GROESTL 80
if (opt_debug || s_ntime != pdata[17] || s_sequence == UINT32_MAX) {
uint32_t ntime = swab32(work->data[17]);
get_travel_order(ntime, hashOrder);
s_ntime = pdata[17];
if (opt_debug && !thr_id) {
applog(LOG_DEBUG, "exosis hash order %s (%08x)", hashOrder, ntime);
}
}
if (opt_benchmark)
ptarget[7] = 0x5;
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
CUDA_LOG_ERROR();
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
quark_blake512_cpu_init(thr_id, throughput);
quark_bmw512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);
quark_keccak512_cpu_init(thr_id, throughput);
quark_jh512_cpu_init(thr_id, throughput);
qubit_luffa512_cpu_init(thr_id, throughput); // only constants (480 bytes)
x11_luffa512_cpu_init(thr_id, throughput);
x11_cubehash512_cpu_init(thr_id, throughput);
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), -1);
CUDA_CALL_OR_RET_X(cudaMemset(d_hash[thr_id], 0, (size_t) 64 * throughput), -1);
cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true;
}
uint32_t endiandata[20];
for (int k=0; k < 19; k++)
be32enc(&endiandata[k], pdata[k]);
cuda_check_cpu_setTarget(ptarget);
const int hashes = (int) strlen(hashOrder);
const char first = hashOrder[0];
const uint8_t algo80 = first >= 'A' ? first - 'A' + 10 : first - '0';
if (algo80 != s_firstalgo) {
s_firstalgo = algo80;
applog(LOG_INFO, "Exosis first algo is now %s", algo_strings[algo80 % HASH_FUNC_COUNT]);
}
switch (algo80) {
case BLAKE:
quark_blake512_cpu_setBlock_80(thr_id, endiandata);
break;
case BMW:
quark_bmw512_cpu_setBlock_80(endiandata);
break;
case GROESTL:
groestl512_setBlock_80(thr_id, endiandata);
break;
case SKEIN:
skein512_cpu_setBlock_80((void*)endiandata);
break;
case JH:
jh512_setBlock_80(thr_id, endiandata);
break;
case KECCAK:
keccak512_setBlock_80(thr_id, endiandata);
break;
case LUFFA:
qubit_luffa512_cpu_setBlock_80((void*)endiandata);
break;
case CUBEHASH:
cubehash512_setBlock_80(thr_id, endiandata);
break;
default: {
uint32_t next = get_next_time(swab32(s_ntime), hashOrder);
if (!thr_id)
applog(LOG_WARNING, "kernel %c unimplemented, next in %u mn", first, next/60);
sleep(next > 30 ? 60 : 10);
return -1;
}
}
do {
int order = 0;
// Hash with CUDA
switch (algo80) {
case BLAKE:
quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("blake80:");
break;
case BMW:
quark_bmw512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
TRACE("bmw80 :");
break;
case GROESTL:
groestl512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("grstl80:");
break;
case SKEIN:
skein512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], 1); order++;
TRACE("skein80:");
break;
case JH:
jh512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("jh51280:");
break;
case KECCAK:
keccak512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("kecck80:");
break;
case LUFFA:
qubit_luffa512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id], order++);
TRACE("luffa80:");
break;
case CUBEHASH:
cubehash512_cuda_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
TRACE("cube 80:");
break;
}
for (int i = 1; i < hashes; i++)
{
const char elem = hashOrder[i];
const uint8_t algo64 = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
switch (algo64) {
case BLAKE:
quark_blake512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("blake :");
break;
case BMW:
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("bmw :");
break;
case GROESTL:
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("groestl:");
break;
case SKEIN:
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("skein :");
break;
case JH:
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("jh512 :");
break;
case KECCAK:
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("keccak :");
break;
case LUFFA:
x11_luffa512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("luffa :");
break;
case CUBEHASH:
x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("cube :");
break;
}
}
*hashes_done = pdata[19] - first_nonce + throughput;
work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (work->nonces[0] != UINT32_MAX)
{
uint32_t _ALIGN(64) vhash[8];
const uint32_t Htarg = ptarget[7];
be32enc(&endiandata[19], work->nonces[0]);
exosis_hash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
work->valid_nonces = 1;
work_set_target_ratio(work, vhash);
work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
pdata[19] = work->nonces[0];
if (work->nonces[1] != 0) {
be32enc(&endiandata[19], work->nonces[1]);
exosis_hash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget)) {
bn_set_target_ratio(work, vhash, 1);
work->valid_nonces++;
}
pdata[19] = max(pdata[19], work->nonces[1]) + 1;
}
return work->valid_nonces;
} else if (vhash[7] > Htarg) {
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
pdata[19] = work->nonces[0] + 1;
continue;
}
}
if ((uint64_t) throughput + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}
pdata[19] += throughput;
} while (!work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce;
return 0;
}
// cleanup
extern "C" void free_exosis(int thr_id)
{
if (!init[thr_id])
return;
cudaThreadSynchronize();
cudaFree(d_hash[thr_id]);
quark_blake512_cpu_free(thr_id);
quark_groestl512_cpu_free(thr_id);
cuda_check_cpu_free(thr_id);
init[thr_id] = false;
cudaDeviceSynchronize();
}

67
x11/timetravel.cu

@@ -20,11 +20,6 @@ extern "C" {
#include "sph/sph_keccak.h"
#include "sph/sph_luffa.h"
#include "sph/sph_cubehash.h"
#if HASH_FUNC_COUNT > 8
#include "sph/sph_shavite.h"
#include "sph/sph_simd.h"
#include "sph/sph_echo.h"
#endif
}
#include "miner.h"
@@ -42,11 +37,6 @@ enum Algo {
KECCAK,
LUFFA,
CUBEHASH,
#if HASH_FUNC_COUNT > 8
SHAVITE,
SIMD,
ECHO,
#endif
MAX_ALGOS_COUNT
};
@@ -153,11 +143,6 @@ extern "C" void timetravel_hash(void *output, const void *input)
sph_keccak512_context ctx_keccak;
sph_luffa512_context ctx_luffa1;
sph_cubehash512_context ctx_cubehash1;
#if HASH_FUNC_COUNT > 8
sph_shavite512_context ctx_shavite1;
sph_simd512_context ctx_simd1;
sph_echo512_context ctx_echo1;
#endif
if (s_sequence == UINT32_MAX) {
uint32_t *data = (uint32_t*) input;
@@ -175,11 +160,6 @@ extern "C" void timetravel_hash(void *output, const void *input)
const char elem = hashOrder[i];
uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
if (i > 0) {
in = (void*) hash;
size = 64;
}
switch (algo) {
case BLAKE:
sph_blake512_init(&ctx_blake);
@@ -195,7 +175,6 @@ extern "C" void timetravel_hash(void *output, const void *input)
sph_groestl512_init(&ctx_groestl);
sph_groestl512(&ctx_groestl, in, size);
sph_groestl512_close(&ctx_groestl, hash);
//applog_hex((void*)hash, 32);
break;
case SKEIN:
sph_skein512_init(&ctx_skein);
@@ -222,24 +201,10 @@ extern "C" void timetravel_hash(void *output, const void *input)
sph_cubehash512(&ctx_cubehash1, in, size);
sph_cubehash512_close(&ctx_cubehash1, hash);
break;
#if HASH_FUNC_COUNT > 8
case SHAVITE:
sph_shavite512_init(&ctx_shavite1);
sph_shavite512(&ctx_shavite1, in, size);
sph_shavite512_close(&ctx_shavite1, hash);
break;
case SIMD:
sph_simd512_init(&ctx_simd1);
sph_simd512(&ctx_simd1, in, size);
sph_simd512_close(&ctx_simd1, hash);
break;
case ECHO:
sph_echo512_init(&ctx_echo1);
sph_echo512(&ctx_echo1, in, size);
sph_echo512_close(&ctx_echo1, hash);
break;
#endif
}
in = (void*) hash;
size = 64;
}
memcpy(output, hash, 32);
@@ -330,13 +295,7 @@ extern "C" int scanhash_timetravel(int thr_id, struct work* work, uint32_t max_n
qubit_luffa512_cpu_init(thr_id, throughput); // only constants (480 bytes)
x11_luffa512_cpu_init(thr_id, throughput);
x11_cubehash512_cpu_init(thr_id, throughput);
#if HASH_FUNC_COUNT > 8
x11_shavite512_cpu_init(thr_id, throughput);
x11_echo512_cpu_init(thr_id, throughput);
if (x11_simd512_cpu_init(thr_id, throughput) != 0) {
return 0;
}
#endif
CUDA_CALL_OR_RET_X(cudaMalloc(&d_hash[thr_id], (size_t) 64 * throughput), -1);
CUDA_CALL_OR_RET_X(cudaMemset(d_hash[thr_id], 0, (size_t) 64 * throughput), -1);
@@ -471,20 +430,6 @@ extern "C" int scanhash_timetravel(int thr_id, struct work* work, uint32_t max_n
x11_cubehash512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("cube :");
break;
#if HASH_FUNC_COUNT > 8
case SHAVITE:
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("shavite:");
break;
case SIMD:
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("simd :");
break;
case ECHO:
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
TRACE("echo :");
break;
#endif
}
}
@@ -544,9 +489,7 @@ extern "C" void free_timetravel(int thr_id)
quark_blake512_cpu_free(thr_id);
quark_groestl512_cpu_free(thr_id);
#if HASH_FUNC_COUNT > 8
x11_simd512_cpu_free(thr_id);
#endif
cuda_check_cpu_free(thr_id);
init[thr_id] = false;

20
x12/x12.cu

@@ -22,6 +22,8 @@ extern "C" {
static uint32_t *d_hash[MAX_GPUS];
extern void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads);
extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -96,13 +98,15 @@ extern "C" void x12hash(void *output, const void *input)
}
static bool init[MAX_GPUS] = { 0 };
static bool use_compat_kernels[MAX_GPUS] = { 0 };
extern "C" int scanhash_x12(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
int intensity = (device_sm[device_map[thr_id]] > 500 && !is_windows()) ? 20 : 19;
const int dev_id = device_map[thr_id];
int intensity = (device_sm[dev_id] > 500 && !is_windows()) ? 20 : 19;
uint32_t throughput = cuda_default_throughput(thr_id, 1 << intensity); // 19=256*256*8;
//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
@@ -111,7 +115,7 @@ extern "C" int scanhash_x12(int thr_id, struct work* work, uint32_t max_nonce, u
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
@@ -120,13 +124,17 @@ extern "C" int scanhash_x12(int thr_id, struct work* work, uint32_t max_nonce, u
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
cuda_get_arch(thr_id);
use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
if (use_compat_kernels[thr_id])
x11_echo512_cpu_init(thr_id, throughput);
quark_blake512_cpu_init(thr_id, throughput);
x11_luffaCubehash512_cpu_init(thr_id, throughput);
x11_shavite512_cpu_init(thr_id, throughput);
if (x11_simd512_cpu_init(thr_id, throughput) != 0) {
return 0;
}
x11_echo512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);
quark_bmw512_cpu_init(thr_id, throughput);
@@ -156,7 +164,11 @@ extern "C" int scanhash_x12(int thr_id, struct work* work, uint32_t max_nonce, u
x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (use_compat_kernels[thr_id])
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
else {
x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++;
}
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);

26
x16/cuda_x16_echo512_64.cu

@@ -79,11 +79,12 @@ static void echo_round_alexis(const uint32_t sharedMemory[4][256], uint32_t *W,
}
__global__ __launch_bounds__(128, 5) /* will force 80 registers */
static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash)
static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t* g_hash, uint32_t* const d_filter, const uint32_t filter_val)
{
__shared__ uint32_t sharedMemory[4][256];
aes_gpu_init128(sharedMemory);
__syncthreads();
const uint32_t thread = (blockDim.x * blockIdx.x + threadIdx.x);
uint32_t k0;
@@ -91,6 +92,9 @@ static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash)
uint32_t hash[16];
if (thread < threads)
{
// phi2 filter (2 hash chain branches)
if (d_filter && d_filter[thread] != filter_val) return;
uint32_t *Hash = &g_hash[thread<<4];
*(uint2x4*)&h[ 0] = __ldg4((uint2x4*)&Hash[ 0]);
@@ -99,8 +103,6 @@ static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash)
*(uint2x4*)&hash[ 0] = *(uint2x4*)&h[ 0];
*(uint2x4*)&hash[ 8] = *(uint2x4*)&h[ 8];
__syncthreads();
const uint32_t P[48] = {
0xe7e9f5f5, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af, 0xa4213d7e, 0xf5e7e9f5, 0xb3b36b23, 0xb3dbe7af,
//8-12
@@ -217,7 +219,6 @@ static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash)
W[48 + i + 4] = a ^ cd ^ bcx;
W[48 + i + 8] = d ^ ab ^ cdx;
W[48 + i + 12] = c ^ ab ^ abx ^ bcx ^ cdx;
}
for (int k = 1; k < 10; k++)
@@ -237,12 +238,23 @@ static void x16_echo512_gpu_hash_64(uint32_t threads, uint32_t *g_hash)
}
__host__
void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash){
void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash)
{
const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock-1)/threadsperblock);
dim3 block(threadsperblock);
x16_echo512_gpu_hash_64<<<grid, block>>>(threads, d_hash);
x16_echo512_gpu_hash_64 <<<grid, block>>> (threads, d_hash, NULL, 0);
}
__host__
void phi_echo512_cpu_hash_64_filtered(int thr_id, const uint32_t threads, uint32_t* g_hash, uint32_t* d_filter)
{
const uint32_t threadsperblock = 128;
dim3 grid((threads + threadsperblock - 1) / threadsperblock);
dim3 block(threadsperblock);
x16_echo512_gpu_hash_64 <<<grid, block>>> (threads, g_hash, d_filter, 0);
}
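phi2 uses the two filtered wrappers as a pair over the same hash buffer: the echo path takes the lanes whose filter value is 0, while the streebog path (phi_streebog_hash_64_filtered in cuda_streebog_maxwell.cu above) takes the lanes marked 1. Illustrative call pattern (d_hash and d_filter stand for the caller's device buffers):

	phi_echo512_cpu_hash_64_filtered(thr_id, throughput, d_hash, d_filter);   // lanes with d_filter[i] == 0
	phi_streebog_hash_64_filtered(thr_id, throughput, d_hash, d_filter);      // lanes with d_filter[i] == 1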

5
x16/x16r.cu

@@ -250,7 +250,7 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce,
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
@@ -491,8 +491,9 @@ extern "C" int scanhash_x16r(int thr_id, struct work* work, uint32_t max_nonce,
case ECHO:
if (use_compat_kernels[thr_id])
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
else
else {
x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++;
}
TRACE("echo :");
break;
case HAMSI:

5
x16/x16s.cu

@@ -248,7 +248,7 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce,
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
@@ -489,8 +489,9 @@ extern "C" int scanhash_x16s(int thr_id, struct work* work, uint32_t max_nonce,
case ECHO:
if (use_compat_kernels[thr_id])
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
else
else {
x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++;
}
TRACE("echo :");
break;
case HAMSI:

632
x17/sonoa.cu

@@ -0,0 +1,632 @@
/**
* X17 SONOA
**/
extern "C" {
#include "sph/sph_blake.h"
#include "sph/sph_bmw.h"
#include "sph/sph_groestl.h"
#include "sph/sph_skein.h"
#include "sph/sph_jh.h"
#include "sph/sph_keccak.h"
#include "sph/sph_luffa.h"
#include "sph/sph_cubehash.h"
#include "sph/sph_shavite.h"
#include "sph/sph_simd.h"
#include "sph/sph_echo.h"
#include "sph/sph_hamsi.h"
#include "sph/sph_fugue.h"
#include "sph/sph_shabal.h"
#include "sph/sph_whirlpool.h"
#include "sph/sph_sha2.h"
#include "sph/sph_haval.h"
}
#include "miner.h"
#include "cuda_helper.h"
#include "x11/cuda_x11.h"
#define NBN 2
static uint32_t *d_hash[MAX_GPUS];
extern void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads);
extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x13_fugue512_cpu_init(int thr_id, uint32_t threads);
extern void x13_fugue512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x13_fugue512_cpu_free(int thr_id);
extern void x14_shabal512_cpu_init(int thr_id, uint32_t threads);
extern void x14_shabal512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x15_whirlpool_cpu_init(int thr_id, uint32_t threads, int flag);
extern void x15_whirlpool_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
extern void x15_whirlpool_cpu_free(int thr_id);
extern void x17_sha512_cpu_init(int thr_id, uint32_t threads);
extern void x17_sha512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash);
extern void x17_haval256_cpu_init(int thr_id, uint32_t threads);
extern void x17_haval256_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_hash, const int outlen);
// CPU Hash Validation
extern "C" void sonoa_hash(void *output, const void *input)
{
unsigned char _ALIGN(128) hash[64];
sph_blake512_context ctx_blake;
sph_bmw512_context ctx_bmw;
sph_groestl512_context ctx_groestl;
sph_jh512_context ctx_jh;
sph_keccak512_context ctx_keccak;
sph_skein512_context ctx_skein;
sph_luffa512_context ctx_luffa;
sph_cubehash512_context ctx_cubehash;
sph_shavite512_context ctx_shavite;
sph_simd512_context ctx_simd;
sph_echo512_context ctx_echo;
sph_hamsi512_context ctx_hamsi;
sph_fugue512_context ctx_fugue;
sph_shabal512_context ctx_shabal;
sph_whirlpool_context ctx_whirlpool;
sph_sha512_context ctx_sha512;
sph_haval256_5_context ctx_haval;
sph_blake512_init(&ctx_blake);
sph_blake512(&ctx_blake, input, 80);
sph_blake512_close(&ctx_blake, (void*)hash);
sph_bmw512_init(&ctx_bmw);
sph_bmw512(&ctx_bmw, (const void*)hash, 64);
sph_bmw512_close(&ctx_bmw, (void*)hash);
sph_groestl512_init(&ctx_groestl);
sph_groestl512(&ctx_groestl, (const void*)hash, 64);
sph_groestl512_close(&ctx_groestl, (void*)hash);
sph_skein512_init(&ctx_skein);
sph_skein512(&ctx_skein, (const void*)hash, 64);
sph_skein512_close(&ctx_skein, (void*)hash);
sph_jh512_init(&ctx_jh);
sph_jh512(&ctx_jh, (const void*)hash, 64);
sph_jh512_close(&ctx_jh, (void*)hash);
sph_keccak512_init(&ctx_keccak);
sph_keccak512(&ctx_keccak, (const void*)hash, 64);
sph_keccak512_close(&ctx_keccak, (void*)hash);
sph_luffa512_init(&ctx_luffa);
sph_luffa512(&ctx_luffa, (const void*)hash, 64);
sph_luffa512_close(&ctx_luffa, (void*)hash);
sph_cubehash512_init(&ctx_cubehash);
sph_cubehash512(&ctx_cubehash, (const void*)hash, 64);
sph_cubehash512_close(&ctx_cubehash, (void*)hash);
sph_shavite512_init(&ctx_shavite);
sph_shavite512(&ctx_shavite, (const void*)hash, 64);
sph_shavite512_close(&ctx_shavite, (void*)hash);
sph_simd512_init(&ctx_simd);
sph_simd512(&ctx_simd, (const void*)hash, 64);
sph_simd512_close(&ctx_simd, (void*)hash);
sph_echo512_init(&ctx_echo);
sph_echo512(&ctx_echo, (const void*)hash, 64);
sph_echo512_close(&ctx_echo, (void*)hash);
sph_bmw512(&ctx_bmw, (const void*)hash, 64);
sph_bmw512_close(&ctx_bmw, (void*)hash);
sph_groestl512(&ctx_groestl, (const void*)hash, 64);
sph_groestl512_close(&ctx_groestl, (void*)hash);
sph_skein512(&ctx_skein, (const void*)hash, 64);
sph_skein512_close(&ctx_skein, (void*)hash);
sph_jh512(&ctx_jh, (const void*)hash, 64);
sph_jh512_close(&ctx_jh, (void*)hash);
sph_keccak512(&ctx_keccak, (const void*)hash, 64);
sph_keccak512_close(&ctx_keccak, (void*)hash);
sph_luffa512(&ctx_luffa, (const void*)hash, 64);
sph_luffa512_close(&ctx_luffa, (void*)hash);
sph_cubehash512(&ctx_cubehash, (const void*)hash, 64);
sph_cubehash512_close(&ctx_cubehash, (void*)hash);
sph_shavite512(&ctx_shavite, (const void*)hash, 64);
sph_shavite512_close(&ctx_shavite, (void*)hash);
sph_simd512(&ctx_simd, (const void*)hash, 64);
sph_simd512_close(&ctx_simd, (void*)hash);
sph_echo512(&ctx_echo, (const void*)hash, 64);
sph_echo512_close(&ctx_echo, (void*)hash);
sph_hamsi512_init(&ctx_hamsi);
sph_hamsi512(&ctx_hamsi, (const void*)hash, 64);
sph_hamsi512_close(&ctx_hamsi, (void*)hash);
sph_bmw512(&ctx_bmw, (const void*)hash, 64);
sph_bmw512_close(&ctx_bmw, (void*)hash);
sph_groestl512(&ctx_groestl, (const void*)hash, 64);
sph_groestl512_close(&ctx_groestl, (void*)hash);
sph_skein512(&ctx_skein, (const void*)hash, 64);
sph_skein512_close(&ctx_skein, (void*)hash);
sph_jh512(&ctx_jh, (const void*)hash, 64);
sph_jh512_close(&ctx_jh, (void*)hash);
sph_keccak512(&ctx_keccak, (const void*)hash, 64);
sph_keccak512_close(&ctx_keccak, (void*)hash);
sph_luffa512(&ctx_luffa, (const void*)hash, 64);
sph_luffa512_close(&ctx_luffa, (void*)hash);
sph_cubehash512(&ctx_cubehash, (const void*)hash, 64);
sph_cubehash512_close(&ctx_cubehash, (void*)hash);
sph_shavite512(&ctx_shavite, (const void*)hash, 64);
sph_shavite512_close(&ctx_shavite, (void*)hash);
sph_simd512(&ctx_simd, (const void*)hash, 64);
sph_simd512_close(&ctx_simd, (void*)hash);
sph_echo512(&ctx_echo, (const void*)hash, 64);
sph_echo512_close(&ctx_echo, (void*)hash);
sph_hamsi512(&ctx_hamsi, (const void*)hash, 64);
sph_hamsi512_close(&ctx_hamsi, (void*)hash);
sph_fugue512_init(&ctx_fugue);
sph_fugue512(&ctx_fugue, (const void*)hash, 64);
sph_fugue512_close(&ctx_fugue, (void*)hash);
sph_bmw512(&ctx_bmw, (const void*)hash, 64);
sph_bmw512_close(&ctx_bmw, (void*)hash);
sph_groestl512(&ctx_groestl, (const void*)hash, 64);
sph_groestl512_close(&ctx_groestl, (void*)hash);
sph_skein512(&ctx_skein, (const void*)hash, 64);
sph_skein512_close(&ctx_skein, (void*)hash);
sph_jh512(&ctx_jh, (const void*)hash, 64);
sph_jh512_close(&ctx_jh, (void*)hash);
sph_keccak512(&ctx_keccak, (const void*)hash, 64);
sph_keccak512_close(&ctx_keccak, (void*)hash);
sph_luffa512(&ctx_luffa, (const void*)hash, 64);
sph_luffa512_close(&ctx_luffa, (void*)hash);
sph_cubehash512(&ctx_cubehash, (const void*)hash, 64);
sph_cubehash512_close(&ctx_cubehash, (void*)hash);
sph_shavite512(&ctx_shavite, (const void*)hash, 64);
sph_shavite512_close(&ctx_shavite, (void*)hash);
sph_simd512(&ctx_simd, (const void*)hash, 64);
sph_simd512_close(&ctx_simd, (void*)hash);
sph_echo512(&ctx_echo, (const void*)hash, 64);
sph_echo512_close(&ctx_echo, (void*)hash);
sph_hamsi512(&ctx_hamsi, (const void*)hash, 64);
sph_hamsi512_close(&ctx_hamsi, (void*)hash);
sph_fugue512(&ctx_fugue, (const void*)hash, 64);
sph_fugue512_close(&ctx_fugue, (void*)hash);
sph_shabal512_init(&ctx_shabal);
sph_shabal512(&ctx_shabal, (const void*)hash, 64);
sph_shabal512_close(&ctx_shabal, (void*)hash);
sph_hamsi512(&ctx_hamsi, (const void*)hash, 64);
sph_hamsi512_close(&ctx_hamsi, (void*)hash);
sph_echo512(&ctx_echo, (const void*)hash, 64);
sph_echo512_close(&ctx_echo, (void*)hash);
sph_shavite512(&ctx_shavite, (const void*)hash, 64);
sph_shavite512_close(&ctx_shavite, (void*)hash);
sph_bmw512(&ctx_bmw, (const void*)hash, 64);
sph_bmw512_close(&ctx_bmw, (void*)hash);
sph_shabal512(&ctx_shabal, (const void*)hash, 64);
sph_shabal512_close(&ctx_shabal, (void*)hash);
sph_groestl512(&ctx_groestl, (const void*)hash, 64);
sph_groestl512_close(&ctx_groestl, (void*)hash);
sph_skein512(&ctx_skein, (const void*)hash, 64);
sph_skein512_close(&ctx_skein, (void*)hash);
sph_jh512(&ctx_jh, (const void*)hash, 64);
sph_jh512_close(&ctx_jh, (void*)hash);
sph_keccak512(&ctx_keccak, (const void*)hash, 64);
sph_keccak512_close(&ctx_keccak, (void*)hash);
sph_luffa512(&ctx_luffa, (const void*)hash, 64);
sph_luffa512_close(&ctx_luffa, (void*)hash);
sph_cubehash512(&ctx_cubehash, (const void*)hash, 64);
sph_cubehash512_close(&ctx_cubehash, (void*)hash);
sph_shavite512(&ctx_shavite, (const void*)hash, 64);
sph_shavite512_close(&ctx_shavite, (void*)hash);
sph_simd512(&ctx_simd, (const void*)hash, 64);
sph_simd512_close(&ctx_simd, (void*)hash);
sph_echo512(&ctx_echo, (const void*)hash, 64);
sph_echo512_close(&ctx_echo, (void*)hash);
sph_hamsi512(&ctx_hamsi, (const void*)hash, 64);
sph_hamsi512_close(&ctx_hamsi, (void*)hash);
sph_fugue512(&ctx_fugue, (const void*)hash, 64);
sph_fugue512_close(&ctx_fugue, (void*)hash);
sph_shabal512(&ctx_shabal, (const void*)hash, 64);
sph_shabal512_close(&ctx_shabal, (void*)hash);
sph_whirlpool_init(&ctx_whirlpool);
sph_whirlpool(&ctx_whirlpool, (const void*)hash, 64);
sph_whirlpool_close(&ctx_whirlpool, (void*)hash);
sph_bmw512(&ctx_bmw, (const void*)hash, 64);
sph_bmw512_close(&ctx_bmw, (void*)hash);
sph_groestl512(&ctx_groestl, (const void*)hash, 64);
sph_groestl512_close(&ctx_groestl, (void*)hash);
sph_skein512(&ctx_skein, (const void*)hash, 64);
sph_skein512_close(&ctx_skein, (void*)hash);
sph_jh512(&ctx_jh, (const void*)hash, 64);
sph_jh512_close(&ctx_jh, (void*)hash);
sph_keccak512(&ctx_keccak, (const void*)hash, 64);
sph_keccak512_close(&ctx_keccak, (void*)hash);
sph_luffa512(&ctx_luffa, (const void*)hash, 64);
sph_luffa512_close(&ctx_luffa, (void*)hash);
sph_cubehash512(&ctx_cubehash, (const void*)hash, 64);
sph_cubehash512_close(&ctx_cubehash, (void*)hash);
sph_shavite512(&ctx_shavite, (const void*)hash, 64);
sph_shavite512_close(&ctx_shavite, (void*)hash);
sph_simd512(&ctx_simd, (const void*)hash, 64);
sph_simd512_close(&ctx_simd, (void*)hash);
sph_echo512(&ctx_echo, (const void*)hash, 64);
sph_echo512_close(&ctx_echo, (void*)hash);
sph_hamsi512(&ctx_hamsi, (const void*)hash, 64);
sph_hamsi512_close(&ctx_hamsi, (void*)hash);
sph_fugue512(&ctx_fugue, (const void*)hash, 64);
sph_fugue512_close(&ctx_fugue, (void*)hash);
sph_shabal512(&ctx_shabal, (const void*)hash, 64);
sph_shabal512_close(&ctx_shabal, (void*)hash);
sph_whirlpool(&ctx_whirlpool, (const void*)hash, 64);
sph_whirlpool_close(&ctx_whirlpool, (void*)hash);
sph_sha512_init(&ctx_sha512);
sph_sha512(&ctx_sha512, (const void*)hash, 64);
sph_sha512_close(&ctx_sha512, (void*)hash);
sph_whirlpool(&ctx_whirlpool, (const void*)hash, 64);
sph_whirlpool_close(&ctx_whirlpool, (void*)hash);
sph_bmw512(&ctx_bmw, (const void*)hash, 64);
sph_bmw512_close(&ctx_bmw, (void*)hash);
sph_groestl512(&ctx_groestl, (const void*)hash, 64);
sph_groestl512_close(&ctx_groestl, (void*)hash);
sph_skein512(&ctx_skein, (const void*)hash, 64);
sph_skein512_close(&ctx_skein, (void*)hash);
sph_jh512(&ctx_jh, (const void*)hash, 64);
sph_jh512_close(&ctx_jh, (void*)hash);
sph_keccak512(&ctx_keccak, (const void*)hash, 64);
sph_keccak512_close(&ctx_keccak, (void*)hash);
sph_luffa512(&ctx_luffa, (const void*)hash, 64);
sph_luffa512_close(&ctx_luffa, (void*)hash);
sph_cubehash512(&ctx_cubehash, (const void*)hash, 64);
sph_cubehash512_close(&ctx_cubehash, (void*)hash);
sph_shavite512(&ctx_shavite, (const void*)hash, 64);
sph_shavite512_close(&ctx_shavite, (void*)hash);
sph_simd512(&ctx_simd, (const void*)hash, 64);
sph_simd512_close(&ctx_simd, (void*)hash);
sph_echo512(&ctx_echo, (const void*)hash, 64);
sph_echo512_close(&ctx_echo, (void*)hash);
sph_hamsi512(&ctx_hamsi, (const void*)hash, 64);
sph_hamsi512_close(&ctx_hamsi, (void*)hash);
sph_fugue512(&ctx_fugue, (const void*)hash, 64);
sph_fugue512_close(&ctx_fugue, (void*)hash);
sph_shabal512(&ctx_shabal, (const void*)hash, 64);
sph_shabal512_close(&ctx_shabal, (void*)hash);
sph_whirlpool(&ctx_whirlpool, (const void*)hash, 64);
sph_whirlpool_close(&ctx_whirlpool, (void*)hash);
sph_sha512(&ctx_sha512, (const void*)hash, 64);
sph_sha512_close(&ctx_sha512, (void*)hash);
sph_haval256_5_init(&ctx_haval);
sph_haval256_5(&ctx_haval, (const void*)hash, 64);
sph_haval256_5_close(&ctx_haval, (void*)hash);
memcpy(output, hash, 32);
}
#define x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash) \
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash, order++); \
if (use_compat_kernels[thr_id]) x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash, order++); \
else x16_echo512_cpu_hash_64(thr_id, throughput, d_hash)
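For readability, here is the expanded form of one x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]) call (illustrative; it is just the macro body with the argument substituted):

	x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
	if (use_compat_kernels[thr_id])
		x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
	else
		x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);

i.e. SIMD always runs, then the echo round picks the pre-SM5 compat kernel or the newer x16 kernel depending on the detected architecture.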
static bool init[MAX_GPUS] = { 0 };
static bool use_compat_kernels[MAX_GPUS] = { 0 };
extern "C" int scanhash_sonoa(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const int dev_id = device_map[thr_id];
uint32_t default_throughput = 1 << 18;
if (device_sm[dev_id] <= 500) default_throughput = 1 << 18;
else if (device_sm[dev_id] <= 520) default_throughput = 1 << 18;
else if (device_sm[dev_id] > 520) default_throughput = (1 << 19) + (1 << 18);
uint32_t throughput = cuda_default_throughput(thr_id, default_throughput);
if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
throughput &= 0xFFFFFF00;
if (opt_benchmark)
((uint32_t*)ptarget)[7] = 0x00ff;
if (!init[thr_id])
{
cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync);
}
gpulog(LOG_INFO,thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
cuda_get_arch(thr_id);
use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
if (use_compat_kernels[thr_id])
x11_echo512_cpu_init(thr_id, throughput);
quark_blake512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);
quark_bmw512_cpu_init(thr_id, throughput);
quark_keccak512_cpu_init(thr_id, throughput);
quark_jh512_cpu_init(thr_id, throughput);
x11_luffaCubehash512_cpu_init(thr_id, throughput);
x11_shavite512_cpu_init(thr_id, throughput);
x11_simd512_cpu_init(thr_id, throughput);
x13_hamsi512_cpu_init(thr_id, throughput);
x13_fugue512_cpu_init(thr_id, throughput);
x14_shabal512_cpu_init(thr_id, throughput);
x15_whirlpool_cpu_init(thr_id, throughput, 0);
x17_sha512_cpu_init(thr_id, throughput);
x17_haval256_cpu_init(thr_id, throughput);
CUDA_SAFE_CALL(cudaMalloc(&d_hash[thr_id], 8 * sizeof(uint64_t) * throughput));
cuda_check_cpu_init(thr_id, throughput);
init[thr_id] = true;
}
int warn = 0;
uint32_t _ALIGN(64) endiandata[20];
for (int k=0; k < 20; k++)
be32enc(&endiandata[k], pdata[k]);
quark_blake512_cpu_setBlock_80(thr_id, endiandata);
cuda_check_cpu_setTarget(ptarget);
do {
int order = 0;
quark_blake512_cpu_hash_80(thr_id, throughput, pdata[19], d_hash[thr_id]);
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_bmw512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_groestl512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_skein512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_jh512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
quark_keccak512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_simd_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]);
x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x15_whirlpool_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x17_sha512_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id]); order++;
x17_haval256_cpu_hash_64(thr_id, throughput, pdata[19], d_hash[thr_id], 256); order++;
*hashes_done = pdata[19] - first_nonce + throughput;
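// cuda_check_hash() returns the first nonce whose hash met the GPU-side target
// (or UINT32_MAX); it is re-hashed on the CPU below before being reported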
work->nonces[0] = cuda_check_hash(thr_id, throughput, pdata[19], d_hash[thr_id]);
if (work->nonces[0] != UINT32_MAX)
{
const uint32_t Htarg = ptarget[7];
uint32_t _ALIGN(64) vhash[8];
be32enc(&endiandata[19], work->nonces[0]);
sonoa_hash(vhash, endiandata);
if (vhash[7] <= Htarg && fulltest(vhash, ptarget))
{
work->valid_nonces = 1;
work->nonces[1] = cuda_check_hash_suppl(thr_id, throughput, pdata[19], d_hash[thr_id], 1);
work_set_target_ratio(work, vhash);
if (work->nonces[1] != 0) {
be32enc(&endiandata[19], work->nonces[1]);
sonoa_hash(vhash, endiandata);
bn_set_target_ratio(work, vhash, 1);
work->valid_nonces++;
pdata[19] = max(work->nonces[0], work->nonces[1]) + 1;
} else {
pdata[19] = work->nonces[0] + 1; // cursor
}
return work->valid_nonces;
}
else if (vhash[7] > Htarg) {
gpu_increment_reject(thr_id);
if (!warn) {
warn++;
pdata[19] = work->nonces[0] + 1;
continue;
} else {
if (!opt_quiet)
gpulog(LOG_WARNING, thr_id, "result for %08x does not validate on CPU!", work->nonces[0]);
warn = 0;
}
}
}
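// no usable candidate in this window: advance the nonce cursor, clamping at
// max_nonce so the 32-bit counter cannot wrap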
if ((uint64_t)throughput + pdata[19] >= max_nonce) {
pdata[19] = max_nonce;
break;
}
pdata[19] += throughput;
} while (pdata[19] < max_nonce && !work_restart[thr_id].restart);
*hashes_done = pdata[19] - first_nonce;
return 0;
}
extern "C" void free_sonoa(int thr_id)
{
if (!init[thr_id])
return;
cudaDeviceSynchronize();
cudaFree(d_hash[thr_id]);
quark_blake512_cpu_free(thr_id);
quark_groestl512_cpu_free(thr_id);
x11_simd512_cpu_free(thr_id);
x13_fugue512_cpu_free(thr_id);
x15_whirlpool_cpu_free(thr_id);
cudaDeviceSynchronize();
init[thr_id] = false;
}
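For reference, the `vhash[7] <= Htarg` test in scanhash_sonoa() is only a fast pre-check on the most significant word; fulltest() then compares the full 256-bit values. A minimal sketch of that kind of comparison, assuming the hash and target are laid out as eight 32-bit words with word 7 the most significant (as the word-7 shortcut suggests), using a hypothetical helper name rather than the tree's own code:

#include <stdbool.h>
#include <stdint.h>

/* hash_meets_target() is a hypothetical illustration, not ccminer's fulltest():
 * it returns true when a 256-bit hash is less than or equal to the target,
 * both stored as eight 32-bit words with index 7 the most significant. */
static bool hash_meets_target(const uint32_t hash[8], const uint32_t target[8])
{
	for (int i = 7; i >= 0; i--) {
		if (hash[i] > target[i])
			return false;   /* hash is larger than the target */
		if (hash[i] < target[i])
			return true;    /* strictly smaller: the share is valid */
	}
	return true;                    /* equal to the target also passes */
}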

18
x17/x17.cu

@@ -32,6 +32,8 @@ extern "C" {
static uint32_t *d_hash[MAX_GPUS];
extern void x16_echo512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t *d_hash);
extern void x13_hamsi512_cpu_init(int thr_id, uint32_t threads);
extern void x13_hamsi512_cpu_hash_64(int thr_id, uint32_t threads, uint32_t startNounce, uint32_t *d_nonceVector, uint32_t *d_hash, int order);
@@ -151,12 +153,14 @@ extern "C" void x17hash(void *output, const void *input)
}
static bool init[MAX_GPUS] = { 0 };
static bool use_compat_kernels[MAX_GPUS] = { 0 };
extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, unsigned long *hashes_done)
{
uint32_t *pdata = work->data;
uint32_t *ptarget = work->target;
const uint32_t first_nonce = pdata[19];
const int dev_id = device_map[thr_id];
uint32_t throughput = cuda_default_throughput(thr_id, 1U << 19); // 19=256*256*8;
//if (init[thr_id]) throughput = min(throughput, max_nonce - first_nonce);
@@ -166,7 +170,7 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u
if (!init[thr_id])
{
cudaSetDevice(device_map[thr_id]);
cudaSetDevice(dev_id);
if (opt_cudaschedule == -1 && gpu_threads == 1) {
cudaDeviceReset();
// reduce cpu usage
@@ -174,6 +178,11 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u
}
gpulog(LOG_INFO, thr_id, "Intensity set to %g, %u cuda threads", throughput2intensity(throughput), throughput);
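// devices below SM 5.0 keep the legacy x11 echo kernel; newer GPUs switch to the
// 64-byte x16_echo512 variant selected in the hash loop further down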
cuda_get_arch(thr_id);
use_compat_kernels[thr_id] = (cuda_arch[dev_id] < 500);
if (use_compat_kernels[thr_id])
x11_echo512_cpu_init(thr_id, throughput);
quark_blake512_cpu_init(thr_id, throughput);
quark_groestl512_cpu_init(thr_id, throughput);
quark_skein512_cpu_init(thr_id, throughput);
@@ -183,7 +192,6 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u
x11_luffaCubehash512_cpu_init(thr_id, throughput);
x11_shavite512_cpu_init(thr_id, throughput);
x11_simd512_cpu_init(thr_id, throughput);
x11_echo512_cpu_init(thr_id, throughput);
x13_hamsi512_cpu_init(thr_id, throughput);
x13_fugue512_cpu_init(thr_id, throughput);
x14_shabal512_cpu_init(thr_id, throughput);
@@ -220,7 +228,11 @@ extern "C" int scanhash_x17(int thr_id, struct work* work, uint32_t max_nonce, u
x11_luffaCubehash512_cpu_hash_64(thr_id, throughput, d_hash[thr_id], order++);
x11_shavite512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_simd512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
if (use_compat_kernels[thr_id])
x11_echo512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
else {
x16_echo512_cpu_hash_64(thr_id, throughput, d_hash[thr_id]); order++;
}
x13_hamsi512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x13_fugue512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
x14_shabal512_cpu_hash_64(thr_id, throughput, pdata[19], NULL, d_hash[thr_id], order++);
